1
0
mirror of https://github.com/gnss-sdr/gnss-sdr synced 2025-07-04 02:53:06 +00:00

Code cleaning

This commit is contained in:
Carles Fernandez 2014-11-09 13:17:18 +01:00
parent f7f495175e
commit 51427f046a
27 changed files with 6822 additions and 6771 deletions

View File

@ -2,7 +2,7 @@
* \file volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3.h * \file volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3.h
* \brief Volk protokernel: performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation with 32 bits vectors * \brief Volk protokernel: performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation with 32 bits vectors
* \authors <ul> * \authors <ul>
* <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com * <li> Andres Cecilia, 2014. a.cecilia.luque(at)gmail.com
* </ul> * </ul>
* *
* Volk protokernel that performs the carrier wipe-off mixing and the * Volk protokernel that performs the carrier wipe-off mixing and the
@ -30,7 +30,7 @@
* GNSS-SDR is free software: you can redistribute it and/or modify * GNSS-SDR is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by * it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or * the Free Software Foundation, either version 3 of the License, or
* at your option) any later version. * (at your option) any later version.
* *
* GNSS-SDR is distributed in the hope that it will be useful, * GNSS-SDR is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of * but WITHOUT ANY WARRANTY; without even the implied warranty of
@ -56,8 +56,8 @@
#include <smmintrin.h> #include <smmintrin.h>
#include "CommonMacros/CommonMacros_16ic_cw_epl_corr_32fc.h" #include "CommonMacros/CommonMacros_16ic_cw_epl_corr_32fc.h"
#include "CommonMacros/CommonMacros.h" #include "CommonMacros/CommonMacros.h"
/*!
\brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation /*! \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation
\param input The input signal input \param input The input signal input
\param carrier The carrier signal input \param carrier The carrier signal input
\param E_code Early PRN code replica input \param E_code Early PRN code replica input
@ -67,7 +67,7 @@
 \param P_out Prompt correlation output \param P_out Prompt correlation output
 \param L_out Late correlation output \param L_out Late correlation output
\param num_points The number of complex values in vectors \param num_points The number of complex values in vectors
*/ */
static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_u_sse4_1(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points) static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_u_sse4_1(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points)
{ {
const unsigned int sse_iters = num_points / 8; const unsigned int sse_iters = num_points / 8;
@ -110,99 +110,98 @@ static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_u_sse4_1(lv_32fc_t*
imag_L_code_acc = _mm_setzero_ps(); imag_L_code_acc = _mm_setzero_ps();
if (sse_iters>0) if (sse_iters>0)
{
for(unsigned int number = 0;number < sse_iters; number++){
//Perform the carrier wipe-off
x1 = _mm_lddqu_si128((__m128i*)input_ptr);
input_ptr += 4;
x2 = _mm_lddqu_si128((__m128i*)input_ptr);
y1 = _mm_lddqu_si128((__m128i*)carrier_ptr);
carrier_ptr += 4;
y2 = _mm_lddqu_si128((__m128i*)carrier_ptr);
CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(x1, x2, realx, imagx)
CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy)
CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
//Get early values
y1 = _mm_lddqu_si128((__m128i*)E_code_ptr);
E_code_ptr += 4;
y2 = _mm_lddqu_si128((__m128i*)E_code_ptr);
CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
//Adds the float 32 results
real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps);
imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps);
//Get prompt values
y1 = _mm_lddqu_si128((__m128i*)P_code_ptr);
P_code_ptr += 4;
y2 = _mm_lddqu_si128((__m128i*)P_code_ptr);
CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps);
imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps);
//Get late values
y1 = _mm_lddqu_si128((__m128i*)L_code_ptr);
L_code_ptr += 4;
y2 = _mm_lddqu_si128((__m128i*)L_code_ptr);
CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps);
imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps);
input_ptr += 4;
carrier_ptr += 4;
E_code_ptr += 4;
P_code_ptr += 4;
L_code_ptr += 4;
}
__VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4];
_mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector
for (int i = 0; i<4; ++i)
{ {
E_out_real += real_E_dotProductVector[i]; for(unsigned int number = 0;number < sse_iters; number++)
E_out_imag += imag_E_dotProductVector[i]; {
P_out_real += real_P_dotProductVector[i]; //Perform the carrier wipe-off
P_out_imag += imag_P_dotProductVector[i]; x1 = _mm_lddqu_si128((__m128i*)input_ptr);
L_out_real += real_L_dotProductVector[i]; input_ptr += 4;
L_out_imag += imag_L_dotProductVector[i]; x2 = _mm_lddqu_si128((__m128i*)input_ptr);
y1 = _mm_lddqu_si128((__m128i*)carrier_ptr);
carrier_ptr += 4;
y2 = _mm_lddqu_si128((__m128i*)carrier_ptr);
CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(x1, x2, realx, imagx)
CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy)
CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
//Get early values
y1 = _mm_lddqu_si128((__m128i*)E_code_ptr);
E_code_ptr += 4;
y2 = _mm_lddqu_si128((__m128i*)E_code_ptr);
CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
//Adds the float 32 results
real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps);
imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps);
//Get prompt values
y1 = _mm_lddqu_si128((__m128i*)P_code_ptr);
P_code_ptr += 4;
y2 = _mm_lddqu_si128((__m128i*)P_code_ptr);
CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps);
imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps);
//Get late values
y1 = _mm_lddqu_si128((__m128i*)L_code_ptr);
L_code_ptr += 4;
y2 = _mm_lddqu_si128((__m128i*)L_code_ptr);
CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps);
imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps);
input_ptr += 4;
carrier_ptr += 4;
E_code_ptr += 4;
P_code_ptr += 4;
L_code_ptr += 4;
}
__VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4];
_mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector
for (int i = 0; i<4; ++i)
{
E_out_real += real_E_dotProductVector[i];
E_out_imag += imag_E_dotProductVector[i];
P_out_real += real_P_dotProductVector[i];
P_out_imag += imag_P_dotProductVector[i];
L_out_real += real_L_dotProductVector[i];
L_out_imag += imag_L_dotProductVector[i];
}
*E_out_ptr = lv_cmake(E_out_real, E_out_imag);
*P_out_ptr = lv_cmake(P_out_real, P_out_imag);
*L_out_ptr = lv_cmake(L_out_real, L_out_imag);
} }
*E_out_ptr = lv_cmake(E_out_real, E_out_imag);
*P_out_ptr = lv_cmake(P_out_real, P_out_imag);
*L_out_ptr = lv_cmake(L_out_real, L_out_imag);
}
lv_16sc_t bb_signal_sample; lv_16sc_t bb_signal_sample;
for(unsigned int i=0; i < num_points%8; ++i) for(unsigned int i=0; i < num_points%8; ++i)
{ {
//Perform the carrier wipe-off //Perform the carrier wipe-off
bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
// Now get early, late, and prompt values for each // Now get early, late, and prompt values for each
*E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
*P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
*L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
} }
} }
#endif /* LV_HAVE_SSE4_1 */ #endif /* LV_HAVE_SSE4_1 */
@ -234,21 +233,22 @@ static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_generic(lv_32fc_t* E
// perform Early, Prompt and Late correlation // perform Early, Prompt and Late correlation
for(unsigned int i=0; i < num_points; ++i) for(unsigned int i=0; i < num_points; ++i)
{ {
//Perform the carrier wipe-off //Perform the carrier wipe-off
bb_signal_sample = input[i] * carrier[i]; bb_signal_sample = input[i] * carrier[i];
tmp1 = bb_signal_sample * E_code[i]; tmp1 = bb_signal_sample * E_code[i];
tmp2 = bb_signal_sample * P_code[i]; tmp2 = bb_signal_sample * P_code[i];
tmp3 = bb_signal_sample * L_code[i]; tmp3 = bb_signal_sample * L_code[i];
// Now get early, late, and prompt values for each // Now get early, late, and prompt values for each
*E_out += (lv_32fc_t)tmp1; *E_out += (lv_32fc_t)tmp1;
*P_out += (lv_32fc_t)tmp2; *P_out += (lv_32fc_t)tmp2;
*L_out += (lv_32fc_t)tmp3; *L_out += (lv_32fc_t)tmp3;
} }
} }
#endif /* LV_HAVE_GENERIC */ #endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_u_H */ #endif /* INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_u_H */
@ -319,99 +319,98 @@ static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_a_sse4_1(lv_32fc_t*
imag_L_code_acc = _mm_setzero_ps(); imag_L_code_acc = _mm_setzero_ps();
if (sse_iters>0) if (sse_iters>0)
{
for(unsigned int number = 0;number < sse_iters; number++){
//Perform the carrier wipe-off
x1 = _mm_load_si128((__m128i*)input_ptr);
input_ptr += 4;
x2 = _mm_load_si128((__m128i*)input_ptr);
y1 = _mm_load_si128((__m128i*)carrier_ptr);
carrier_ptr += 4;
y2 = _mm_load_si128((__m128i*)carrier_ptr);
CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(x1, x2, realx, imagx)
CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy)
CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
//Get early values
y1 = _mm_load_si128((__m128i*)E_code_ptr);
E_code_ptr += 4;
y2 = _mm_load_si128((__m128i*)E_code_ptr);
CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
//Adds the float 32 results
real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps);
imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps);
//Get prompt values
y1 = _mm_load_si128((__m128i*)P_code_ptr);
P_code_ptr += 4;
y2 = _mm_load_si128((__m128i*)P_code_ptr);
CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps);
imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps);
//Get late values
y1 = _mm_load_si128((__m128i*)L_code_ptr);
L_code_ptr += 4;
y2 = _mm_load_si128((__m128i*)L_code_ptr);
CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps);
imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps);
input_ptr += 4;
carrier_ptr += 4;
E_code_ptr += 4;
P_code_ptr += 4;
L_code_ptr += 4;
}
__VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4];
_mm_store_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector
for (int i = 0; i<4; ++i)
{ {
E_out_real += real_E_dotProductVector[i]; for(unsigned int number = 0;number < sse_iters; number++)
E_out_imag += imag_E_dotProductVector[i]; {
P_out_real += real_P_dotProductVector[i]; //Perform the carrier wipe-off
P_out_imag += imag_P_dotProductVector[i]; x1 = _mm_load_si128((__m128i*)input_ptr);
L_out_real += real_L_dotProductVector[i]; input_ptr += 4;
L_out_imag += imag_L_dotProductVector[i]; x2 = _mm_load_si128((__m128i*)input_ptr);
y1 = _mm_load_si128((__m128i*)carrier_ptr);
carrier_ptr += 4;
y2 = _mm_load_si128((__m128i*)carrier_ptr);
CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(x1, x2, realx, imagx)
CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy)
CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
//Get early values
y1 = _mm_load_si128((__m128i*)E_code_ptr);
E_code_ptr += 4;
y2 = _mm_load_si128((__m128i*)E_code_ptr);
CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
//Adds the float 32 results
real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps);
imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps);
//Get prompt values
y1 = _mm_load_si128((__m128i*)P_code_ptr);
P_code_ptr += 4;
y2 = _mm_load_si128((__m128i*)P_code_ptr);
CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps);
imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps);
//Get late values
y1 = _mm_load_si128((__m128i*)L_code_ptr);
L_code_ptr += 4;
y2 = _mm_load_si128((__m128i*)L_code_ptr);
CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps);
imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps);
input_ptr += 4;
carrier_ptr += 4;
E_code_ptr += 4;
P_code_ptr += 4;
L_code_ptr += 4;
}
__VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4];
_mm_store_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector
for (int i = 0; i<4; ++i)
{
E_out_real += real_E_dotProductVector[i];
E_out_imag += imag_E_dotProductVector[i];
P_out_real += real_P_dotProductVector[i];
P_out_imag += imag_P_dotProductVector[i];
L_out_real += real_L_dotProductVector[i];
L_out_imag += imag_L_dotProductVector[i];
}
*E_out_ptr = lv_cmake(E_out_real, E_out_imag);
*P_out_ptr = lv_cmake(P_out_real, P_out_imag);
*L_out_ptr = lv_cmake(L_out_real, L_out_imag);
} }
*E_out_ptr = lv_cmake(E_out_real, E_out_imag);
*P_out_ptr = lv_cmake(P_out_real, P_out_imag);
*L_out_ptr = lv_cmake(L_out_real, L_out_imag);
}
lv_16sc_t bb_signal_sample; lv_16sc_t bb_signal_sample;
for(unsigned int i=0; i < num_points%8; ++i) for(unsigned int i=0; i < num_points%8; ++i)
{ {
//Perform the carrier wipe-off //Perform the carrier wipe-off
bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
// Now get early, late, and prompt values for each // Now get early, late, and prompt values for each
*E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
*P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
*L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
} }
} }
#endif /* LV_HAVE_SSE4_1 */ #endif /* LV_HAVE_SSE4_1 */
@ -443,19 +442,20 @@ static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_a_generic(lv_32fc_t*
// perform Early, Prompt and Late correlation // perform Early, Prompt and Late correlation
for(unsigned int i=0; i < num_points; ++i) for(unsigned int i=0; i < num_points; ++i)
{ {
//Perform the carrier wipe-off //Perform the carrier wipe-off
bb_signal_sample = input[i] * carrier[i]; bb_signal_sample = input[i] * carrier[i];
tmp1 = bb_signal_sample * E_code[i]; tmp1 = bb_signal_sample * E_code[i];
tmp2 = bb_signal_sample * P_code[i]; tmp2 = bb_signal_sample * P_code[i];
tmp3 = bb_signal_sample * L_code[i]; tmp3 = bb_signal_sample * L_code[i];
// Now get early, late, and prompt values for each // Now get early, late, and prompt values for each
*E_out += (lv_32fc_t)tmp1; *E_out += (lv_32fc_t)tmp1;
*P_out += (lv_32fc_t)tmp2; *P_out += (lv_32fc_t)tmp2;
*L_out += (lv_32fc_t)tmp3; *L_out += (lv_32fc_t)tmp3;
} }
} }
#endif /* LV_HAVE_GENERIC */ #endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_a_H */ #endif /* INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_a_H */

View File

@ -2,7 +2,7 @@
* \file volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5.h * \file volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5.h
* \brief Volk protokernel: performs the carrier wipe-off mixing and the Very early, Early, Prompt, Late and very late correlation with 32 bits vectors and returns float32 values. * \brief Volk protokernel: performs the carrier wipe-off mixing and the Very early, Early, Prompt, Late and very late correlation with 32 bits vectors and returns float32 values.
* \authors <ul> * \authors <ul>
* <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com * <li> Andres Cecilia, 2014. a.cecilia.luque(at)gmail.com
* </ul> * </ul>
* *
* Volk protokernel that performs the carrier wipe-off mixing and the * Volk protokernel that performs the carrier wipe-off mixing and the
@ -34,7 +34,7 @@
* GNSS-SDR is free software: you can redistribute it and/or modify * GNSS-SDR is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by * it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or * the Free Software Foundation, either version 3 of the License, or
* at your option) any later version. * (at your option) any later version.
* *
* GNSS-SDR is distributed in the hope that it will be useful, * GNSS-SDR is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of * but WITHOUT ANY WARRANTY; without even the implied warranty of

View File

@ -2,7 +2,7 @@
* \file volk_gnsssdr_32fc_convert_16ic.h * \file volk_gnsssdr_32fc_convert_16ic.h
* \brief Volk protokernel: converts float32 complex values to 16 integer complex values taking care of overflow * \brief Volk protokernel: converts float32 complex values to 16 integer complex values taking care of overflow
* \authors <ul> * \authors <ul>
* <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com * <li> Andres Cecilia, 2014. a.cecilia.luque(at)gmail.com
* </ul> * </ul>
* *
* ------------------------------------------------------------------------- * -------------------------------------------------------------------------
@ -17,7 +17,7 @@
* GNSS-SDR is free software: you can redistribute it and/or modify * GNSS-SDR is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by * it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or * the Free Software Foundation, either version 3 of the License, or
* at your option) any later version. * (at your option) any later version.
* *
* GNSS-SDR is distributed in the hope that it will be useful, * GNSS-SDR is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of * but WITHOUT ANY WARRANTY; without even the implied warranty of
@ -61,29 +61,30 @@ static inline void volk_gnsssdr_32fc_convert_16ic_u_sse2(lv_16sc_t* outputVector
__m128 vmax_val = _mm_set_ps1(max_val); __m128 vmax_val = _mm_set_ps1(max_val);
for(unsigned int i = 0;i < sse_iters; i++){ for(unsigned int i = 0;i < sse_iters; i++){
inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
// Clip // Clip
ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val); ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val); ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
intInputVal1 = _mm_cvtps_epi32(ret1); intInputVal1 = _mm_cvtps_epi32(ret1);
intInputVal2 = _mm_cvtps_epi32(ret2); intInputVal2 = _mm_cvtps_epi32(ret2);
intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
_mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
outputVectorPtr += 8; outputVectorPtr += 8;
} }
for(unsigned int i = 0; i < (num_points%4)*2; i++){ for(unsigned int i = 0; i < (num_points%4)*2; i++)
if(inputVectorPtr[i] > max_val) {
inputVectorPtr[i] = max_val; if(inputVectorPtr[i] > max_val)
else if(inputVectorPtr[i] < min_val) inputVectorPtr[i] = max_val;
inputVectorPtr[i] = min_val; else if(inputVectorPtr[i] < min_val)
outputVectorPtr[i] = (int16_t)rintf(inputVectorPtr[i]); inputVectorPtr[i] = min_val;
} outputVectorPtr[i] = (int16_t)rintf(inputVectorPtr[i]);
}
} }
#endif /* LV_HAVE_SSE2 */ #endif /* LV_HAVE_SSE2 */
@ -111,29 +112,30 @@ static inline void volk_gnsssdr_32fc_convert_16ic_u_sse(lv_16sc_t* outputVector,
__m128 vmax_val = _mm_set_ps1(max_val); __m128 vmax_val = _mm_set_ps1(max_val);
for(unsigned int i = 0;i < sse_iters; i++){ for(unsigned int i = 0;i < sse_iters; i++){
inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
// Clip // Clip
ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val); ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val); ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
intInputVal1 = _mm_cvtps_epi32(ret1); intInputVal1 = _mm_cvtps_epi32(ret1);
intInputVal2 = _mm_cvtps_epi32(ret2); intInputVal2 = _mm_cvtps_epi32(ret2);
intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
_mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
outputVectorPtr += 8; outputVectorPtr += 8;
} }
for(unsigned int i = 0; i < (num_points%4)*2; i++){ for(unsigned int i = 0; i < (num_points%4)*2; i++)
if(inputVectorPtr[i] > max_val) {
inputVectorPtr[i] = max_val; if(inputVectorPtr[i] > max_val)
else if(inputVectorPtr[i] < min_val) inputVectorPtr[i] = max_val;
inputVectorPtr[i] = min_val; else if(inputVectorPtr[i] < min_val)
outputVectorPtr[i] = (int16_t)rintf(inputVectorPtr[i]); inputVectorPtr[i] = min_val;
} outputVectorPtr[i] = (int16_t)rintf(inputVectorPtr[i]);
}
} }
#endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_SSE */
@ -150,13 +152,14 @@ static inline void volk_gnsssdr_32fc_convert_16ic_generic(lv_16sc_t* outputVecto
float min_val = -32768; float min_val = -32768;
float max_val = 32767; float max_val = 32767;
for(unsigned int i = 0; i < num_points*2; i++){ for(unsigned int i = 0; i < num_points*2; i++)
if(inputVectorPtr[i] > max_val) {
inputVectorPtr[i] = max_val; if(inputVectorPtr[i] > max_val)
else if(inputVectorPtr[i] < min_val) inputVectorPtr[i] = max_val;
inputVectorPtr[i] = min_val; else if(inputVectorPtr[i] < min_val)
outputVectorPtr[i] = (int16_t)rintf(inputVectorPtr[i]); inputVectorPtr[i] = min_val;
} outputVectorPtr[i] = (int16_t)rintf(inputVectorPtr[i]);
}
} }
#endif /* LV_HAVE_GENERIC */ #endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_volk_gnsssdr_32fc_convert_16ic_u_H */ #endif /* INCLUDED_volk_gnsssdr_32fc_convert_16ic_u_H */
@ -193,30 +196,32 @@ static inline void volk_gnsssdr_32fc_convert_16ic_a_sse2(lv_16sc_t* outputVector
__m128 vmin_val = _mm_set_ps1(min_val); __m128 vmin_val = _mm_set_ps1(min_val);
__m128 vmax_val = _mm_set_ps1(max_val); __m128 vmax_val = _mm_set_ps1(max_val);
for(unsigned int i = 0;i < sse_iters; i++){ for(unsigned int i = 0;i < sse_iters; i++)
inputVal1 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; {
inputVal2 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; inputVal1 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
inputVal2 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
// Clip // Clip
ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val); ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val); ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
intInputVal1 = _mm_cvtps_epi32(ret1); intInputVal1 = _mm_cvtps_epi32(ret1);
intInputVal2 = _mm_cvtps_epi32(ret2); intInputVal2 = _mm_cvtps_epi32(ret2);
intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
_mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
outputVectorPtr += 8; outputVectorPtr += 8;
} }
for(unsigned int i = 0; i < (num_points%4)*2; i++){ for(unsigned int i = 0; i < (num_points%4)*2; i++)
if(inputVectorPtr[i] > max_val) {
inputVectorPtr[i] = max_val; if(inputVectorPtr[i] > max_val)
else if(inputVectorPtr[i] < min_val) inputVectorPtr[i] = max_val;
inputVectorPtr[i] = min_val; else if(inputVectorPtr[i] < min_val)
outputVectorPtr[i] = (int16_t)rintf(inputVectorPtr[i]); inputVectorPtr[i] = min_val;
} outputVectorPtr[i] = (int16_t)rintf(inputVectorPtr[i]);
}
} }
#endif /* LV_HAVE_SSE2 */ #endif /* LV_HAVE_SSE2 */
@ -228,7 +233,8 @@ static inline void volk_gnsssdr_32fc_convert_16ic_a_sse2(lv_16sc_t* outputVector
\param outputVector The 16 bit output data buffer \param outputVector The 16 bit output data buffer
\param num_points The number of data values to be converted \param num_points The number of data values to be converted
*/ */
static inline void volk_gnsssdr_32fc_convert_16ic_a_sse(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){ static inline void volk_gnsssdr_32fc_convert_16ic_a_sse(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
{
const unsigned int sse_iters = num_points/4; const unsigned int sse_iters = num_points/4;
float* inputVectorPtr = (float*)inputVector; float* inputVectorPtr = (float*)inputVector;
@ -243,30 +249,32 @@ static inline void volk_gnsssdr_32fc_convert_16ic_a_sse(lv_16sc_t* outputVector,
__m128 vmin_val = _mm_set_ps1(min_val); __m128 vmin_val = _mm_set_ps1(min_val);
__m128 vmax_val = _mm_set_ps1(max_val); __m128 vmax_val = _mm_set_ps1(max_val);
for(unsigned int i = 0;i < sse_iters; i++){ for(unsigned int i = 0;i < sse_iters; i++)
inputVal1 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; {
inputVal2 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; inputVal1 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
inputVal2 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
// Clip // Clip
ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val); ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val); ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
intInputVal1 = _mm_cvtps_epi32(ret1); intInputVal1 = _mm_cvtps_epi32(ret1);
intInputVal2 = _mm_cvtps_epi32(ret2); intInputVal2 = _mm_cvtps_epi32(ret2);
intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
_mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
outputVectorPtr += 8; outputVectorPtr += 8;
} }
for(unsigned int i = 0; i < (num_points%4)*2; i++){ for(unsigned int i = 0; i < (num_points%4)*2; i++)
if(inputVectorPtr[i] > max_val) {
inputVectorPtr[i] = max_val; if(inputVectorPtr[i] > max_val)
else if(inputVectorPtr[i] < min_val) inputVectorPtr[i] = max_val;
inputVectorPtr[i] = min_val; else if(inputVectorPtr[i] < min_val)
outputVectorPtr[i] = (int16_t)rintf(inputVectorPtr[i]); inputVectorPtr[i] = min_val;
} outputVectorPtr[i] = (int16_t)rintf(inputVectorPtr[i]);
}
} }
#endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_SSE */
@ -277,19 +285,21 @@ static inline void volk_gnsssdr_32fc_convert_16ic_a_sse(lv_16sc_t* outputVector,
\param outputVector The 16 bit output data buffer \param outputVector The 16 bit output data buffer
\param num_points The number of data values to be converted \param num_points The number of data values to be converted
*/ */
static inline void volk_gnsssdr_32fc_convert_16ic_a_generic(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){ static inline void volk_gnsssdr_32fc_convert_16ic_a_generic(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
{
float* inputVectorPtr = (float*)inputVector; float* inputVectorPtr = (float*)inputVector;
int16_t* outputVectorPtr = (int16_t*)outputVector; int16_t* outputVectorPtr = (int16_t*)outputVector;
float min_val = -32768; float min_val = -32768;
float max_val = 32767; float max_val = 32767;
for(unsigned int i = 0; i < num_points*2; i++){ for(unsigned int i = 0; i < num_points*2; i++)
if(inputVectorPtr[i] > max_val) {
inputVectorPtr[i] = max_val; if(inputVectorPtr[i] > max_val)
else if(inputVectorPtr[i] < min_val) inputVectorPtr[i] = max_val;
inputVectorPtr[i] = min_val; else if(inputVectorPtr[i] < min_val)
outputVectorPtr[i] = (int16_t)rintf(inputVectorPtr[i]); inputVectorPtr[i] = min_val;
} outputVectorPtr[i] = (int16_t)rintf(inputVectorPtr[i]);
}
} }
#endif /* LV_HAVE_GENERIC */ #endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_volk_gnsssdr_32fc_convert_16ic_a_H */ #endif /* INCLUDED_volk_gnsssdr_32fc_convert_16ic_a_H */

View File

@ -2,7 +2,7 @@
* \file volk_gnsssdr_32fc_convert_8ic.h * \file volk_gnsssdr_32fc_convert_8ic.h
* \brief Volk protokernel: converts float32 complex values to 8 integer complex values taking care of overflow * \brief Volk protokernel: converts float32 complex values to 8 integer complex values taking care of overflow
* \authors <ul> * \authors <ul>
* <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com * <li> Andres Cecilia, 2014. a.cecilia.luque(at)gmail.com
* </ul> * </ul>
* *
* ------------------------------------------------------------------------- * -------------------------------------------------------------------------
@ -17,7 +17,7 @@
* GNSS-SDR is free software: you can redistribute it and/or modify * GNSS-SDR is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by * it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or * the Free Software Foundation, either version 3 of the License, or
* at your option) any later version. * (at your option) any later version.
* *
* GNSS-SDR is distributed in the hope that it will be useful, * GNSS-SDR is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of * but WITHOUT ANY WARRANTY; without even the implied warranty of
@ -45,7 +45,8 @@
\param outputVector The 16 bit output data buffer \param outputVector The 16 bit output data buffer
\param num_points The number of data values to be converted \param num_points The number of data values to be converted
*/ */
static inline void volk_gnsssdr_32fc_convert_8ic_u_sse2(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){ static inline void volk_gnsssdr_32fc_convert_8ic_u_sse2(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
{
const unsigned int sse_iters = num_points/8; const unsigned int sse_iters = num_points/8;
float* inputVectorPtr = (float*)inputVector; float* inputVectorPtr = (float*)inputVector;
@ -61,38 +62,40 @@ static inline void volk_gnsssdr_32fc_convert_8ic_u_sse2(lv_8sc_t* outputVector,
__m128 vmin_val = _mm_set_ps1(min_val); __m128 vmin_val = _mm_set_ps1(min_val);
__m128 vmax_val = _mm_set_ps1(max_val); __m128 vmax_val = _mm_set_ps1(max_val);
for(unsigned int i = 0;i < sse_iters; i++){ for(unsigned int i = 0;i < sse_iters; i++)
inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; {
inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
inputVal3 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
inputVal4 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; inputVal3 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
inputVal4 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
// Clip // Clip
ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val); ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val); ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
ret3 = _mm_max_ps(_mm_min_ps(inputVal3, vmax_val), vmin_val); ret3 = _mm_max_ps(_mm_min_ps(inputVal3, vmax_val), vmin_val);
ret4 = _mm_max_ps(_mm_min_ps(inputVal4, vmax_val), vmin_val); ret4 = _mm_max_ps(_mm_min_ps(inputVal4, vmax_val), vmin_val);
intInputVal1 = _mm_cvtps_epi32(ret1); intInputVal1 = _mm_cvtps_epi32(ret1);
intInputVal2 = _mm_cvtps_epi32(ret2); intInputVal2 = _mm_cvtps_epi32(ret2);
intInputVal3 = _mm_cvtps_epi32(ret3); intInputVal3 = _mm_cvtps_epi32(ret3);
intInputVal4 = _mm_cvtps_epi32(ret4); intInputVal4 = _mm_cvtps_epi32(ret4);
intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
intInputVal2 = _mm_packs_epi32(intInputVal3, intInputVal4); intInputVal2 = _mm_packs_epi32(intInputVal3, intInputVal4);
int8InputVal = _mm_packs_epi16(intInputVal1, intInputVal2); int8InputVal = _mm_packs_epi16(intInputVal1, intInputVal2);
_mm_storeu_si128((__m128i*)outputVectorPtr, int8InputVal); _mm_storeu_si128((__m128i*)outputVectorPtr, int8InputVal);
outputVectorPtr += 16; outputVectorPtr += 16;
} }
for(unsigned int i = 0; i < (num_points%4)*4; i++){ for(unsigned int i = 0; i < (num_points%4)*4; i++)
if(inputVectorPtr[i] > max_val) {
inputVectorPtr[i] = max_val; if(inputVectorPtr[i] > max_val)
else if(inputVectorPtr[i] < min_val) inputVectorPtr[i] = max_val;
inputVectorPtr[i] = min_val; else if(inputVectorPtr[i] < min_val)
outputVectorPtr[i] = (int8_t)rintf(inputVectorPtr[i]); inputVectorPtr[i] = min_val;
} outputVectorPtr[i] = (int8_t)rintf(inputVectorPtr[i]);
}
} }
#endif /* LV_HAVE_SSE2 */ #endif /* LV_HAVE_SSE2 */
@ -103,19 +106,21 @@ static inline void volk_gnsssdr_32fc_convert_8ic_u_sse2(lv_8sc_t* outputVector,
\param outputVector The 16 bit output data buffer \param outputVector The 16 bit output data buffer
\param num_points The number of data values to be converted \param num_points The number of data values to be converted
*/ */
static inline void volk_gnsssdr_32fc_convert_8ic_generic(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){ static inline void volk_gnsssdr_32fc_convert_8ic_generic(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
{
float* inputVectorPtr = (float*)inputVector; float* inputVectorPtr = (float*)inputVector;
int8_t* outputVectorPtr = (int8_t*)outputVector; int8_t* outputVectorPtr = (int8_t*)outputVector;
float min_val = -128; float min_val = -128;
float max_val = 127; float max_val = 127;
for(unsigned int i = 0; i < num_points*2; i++){ for(unsigned int i = 0; i < num_points*2; i++)
if(inputVectorPtr[i] > max_val) {
inputVectorPtr[i] = max_val; if(inputVectorPtr[i] > max_val)
else if(inputVectorPtr[i] < min_val) inputVectorPtr[i] = max_val;
inputVectorPtr[i] = min_val; else if(inputVectorPtr[i] < min_val)
outputVectorPtr[i] = (int8_t)rintf(inputVectorPtr[i]); inputVectorPtr[i] = min_val;
} outputVectorPtr[i] = (int8_t)rintf(inputVectorPtr[i]);
}
} }
#endif /* LV_HAVE_GENERIC */ #endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_volk_gnsssdr_32fc_convert_8ic_u_H */ #endif /* INCLUDED_volk_gnsssdr_32fc_convert_8ic_u_H */
@ -137,7 +142,8 @@ static inline void volk_gnsssdr_32fc_convert_8ic_generic(lv_8sc_t* outputVector,
\param outputVector The 16 bit output data buffer \param outputVector The 16 bit output data buffer
\param num_points The number of data values to be converted \param num_points The number of data values to be converted
*/ */
static inline void volk_gnsssdr_32fc_convert_8ic_a_sse2(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){ static inline void volk_gnsssdr_32fc_convert_8ic_a_sse2(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
{
const unsigned int sse_iters = num_points/8; const unsigned int sse_iters = num_points/8;
float* inputVectorPtr = (float*)inputVector; float* inputVectorPtr = (float*)inputVector;
@ -153,38 +159,40 @@ static inline void volk_gnsssdr_32fc_convert_8ic_a_sse2(lv_8sc_t* outputVector,
__m128 vmin_val = _mm_set_ps1(min_val); __m128 vmin_val = _mm_set_ps1(min_val);
__m128 vmax_val = _mm_set_ps1(max_val); __m128 vmax_val = _mm_set_ps1(max_val);
for(unsigned int i = 0;i < sse_iters; i++){ for(unsigned int i = 0;i < sse_iters; i++)
inputVal1 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; {
inputVal2 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; inputVal1 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
inputVal3 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; inputVal2 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
inputVal4 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; inputVal3 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
inputVal4 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
// Clip // Clip
ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val); ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val); ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
ret3 = _mm_max_ps(_mm_min_ps(inputVal3, vmax_val), vmin_val); ret3 = _mm_max_ps(_mm_min_ps(inputVal3, vmax_val), vmin_val);
ret4 = _mm_max_ps(_mm_min_ps(inputVal4, vmax_val), vmin_val); ret4 = _mm_max_ps(_mm_min_ps(inputVal4, vmax_val), vmin_val);
intInputVal1 = _mm_cvtps_epi32(ret1); intInputVal1 = _mm_cvtps_epi32(ret1);
intInputVal2 = _mm_cvtps_epi32(ret2); intInputVal2 = _mm_cvtps_epi32(ret2);
intInputVal3 = _mm_cvtps_epi32(ret3); intInputVal3 = _mm_cvtps_epi32(ret3);
intInputVal4 = _mm_cvtps_epi32(ret4); intInputVal4 = _mm_cvtps_epi32(ret4);
intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
intInputVal2 = _mm_packs_epi32(intInputVal3, intInputVal4); intInputVal2 = _mm_packs_epi32(intInputVal3, intInputVal4);
int8InputVal = _mm_packs_epi16(intInputVal1, intInputVal2); int8InputVal = _mm_packs_epi16(intInputVal1, intInputVal2);
_mm_store_si128((__m128i*)outputVectorPtr, int8InputVal); _mm_store_si128((__m128i*)outputVectorPtr, int8InputVal);
outputVectorPtr += 16; outputVectorPtr += 16;
} }
for(unsigned int i = 0; i < (num_points%4)*4; i++){ for(unsigned int i = 0; i < (num_points%4)*4; i++)
if(inputVectorPtr[i] > max_val) {
inputVectorPtr[i] = max_val; if(inputVectorPtr[i] > max_val)
else if(inputVectorPtr[i] < min_val) inputVectorPtr[i] = max_val;
inputVectorPtr[i] = min_val; else if(inputVectorPtr[i] < min_val)
outputVectorPtr[i] = (int8_t)rintf(inputVectorPtr[i]); inputVectorPtr[i] = min_val;
} outputVectorPtr[i] = (int8_t)rintf(inputVectorPtr[i]);
}
} }
#endif /* LV_HAVE_SSE2 */ #endif /* LV_HAVE_SSE2 */
@ -195,19 +203,21 @@ static inline void volk_gnsssdr_32fc_convert_8ic_a_sse2(lv_8sc_t* outputVector,
\param outputVector The 16 bit output data buffer \param outputVector The 16 bit output data buffer
\param num_points The number of data values to be converted \param num_points The number of data values to be converted
*/ */
static inline void volk_gnsssdr_32fc_convert_8ic_a_generic(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){ static inline void volk_gnsssdr_32fc_convert_8ic_a_generic(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
{
float* inputVectorPtr = (float*)inputVector; float* inputVectorPtr = (float*)inputVector;
int8_t* outputVectorPtr = (int8_t*)outputVector; int8_t* outputVectorPtr = (int8_t*)outputVector;
float min_val = -128; float min_val = -128;
float max_val = 127; float max_val = 127;
for(unsigned int i = 0; i < num_points*2; i++){ for(unsigned int i = 0; i < num_points*2; i++)
if(inputVectorPtr[i] > max_val) {
inputVectorPtr[i] = max_val; if(inputVectorPtr[i] > max_val)
else if(inputVectorPtr[i] < min_val) inputVectorPtr[i] = max_val;
inputVectorPtr[i] = min_val; else if(inputVectorPtr[i] < min_val)
outputVectorPtr[i] = (int8_t)rintf(inputVectorPtr[i]); inputVectorPtr[i] = min_val;
} outputVectorPtr[i] = (int8_t)rintf(inputVectorPtr[i]);
}
} }
#endif /* LV_HAVE_GENERIC */ #endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_volk_gnsssdr_32fc_convert_8ic_a_H */ #endif /* INCLUDED_volk_gnsssdr_32fc_convert_8ic_a_H */

View File

@ -2,7 +2,7 @@
* \file volk_gnsssdr_32fc_s32f_convert_8ic.h * \file volk_gnsssdr_32fc_s32f_convert_8ic.h
* \brief Volk protokernel: converts float32 complex values to 8 integer complex values taking care of overflow * \brief Volk protokernel: converts float32 complex values to 8 integer complex values taking care of overflow
* \authors <ul> * \authors <ul>
* <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com * <li> Andres Cecilia, 2014. a.cecilia.luque(at)gmail.com
* </ul> * </ul>
* *
* ------------------------------------------------------------------------- * -------------------------------------------------------------------------
@ -17,7 +17,7 @@
* GNSS-SDR is free software: you can redistribute it and/or modify * GNSS-SDR is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by * it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or * the Free Software Foundation, either version 3 of the License, or
* at your option) any later version. * (at your option) any later version.
* *
* GNSS-SDR is distributed in the hope that it will be useful, * GNSS-SDR is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of * but WITHOUT ANY WARRANTY; without even the implied warranty of
@ -45,7 +45,8 @@
\param outputVector The 16 bit output data buffer \param outputVector The 16 bit output data buffer
\param num_points The number of data values to be converted \param num_points The number of data values to be converted
*/ */
static inline void volk_gnsssdr_32fc_s32f_convert_8ic_u_sse2(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, const float scalar, unsigned int num_points){ static inline void volk_gnsssdr_32fc_s32f_convert_8ic_u_sse2(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, const float scalar, unsigned int num_points)
{
const unsigned int sse_iters = num_points/8; const unsigned int sse_iters = num_points/8;
float* inputVectorPtr = (float*)inputVector; float* inputVectorPtr = (float*)inputVector;
@ -62,44 +63,46 @@ static inline void volk_gnsssdr_32fc_s32f_convert_8ic_u_sse2(lv_8sc_t* outputVec
__m128 vmin_val = _mm_set_ps1(min_val); __m128 vmin_val = _mm_set_ps1(min_val);
__m128 vmax_val = _mm_set_ps1(max_val); __m128 vmax_val = _mm_set_ps1(max_val);
for(unsigned int i = 0;i < sse_iters; i++){ for(unsigned int i = 0;i < sse_iters; i++)
inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; {
inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
inputVal3 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
inputVal4 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; inputVal3 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
inputVal4 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
inputVal1 = _mm_mul_ps(inputVal1, invScalar); inputVal1 = _mm_mul_ps(inputVal1, invScalar);
inputVal2 = _mm_mul_ps(inputVal2, invScalar); inputVal2 = _mm_mul_ps(inputVal2, invScalar);
inputVal3 = _mm_mul_ps(inputVal3, invScalar); inputVal3 = _mm_mul_ps(inputVal3, invScalar);
inputVal4 = _mm_mul_ps(inputVal4, invScalar); inputVal4 = _mm_mul_ps(inputVal4, invScalar);
// Clip // Clip
ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val); ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val); ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
ret3 = _mm_max_ps(_mm_min_ps(inputVal3, vmax_val), vmin_val); ret3 = _mm_max_ps(_mm_min_ps(inputVal3, vmax_val), vmin_val);
ret4 = _mm_max_ps(_mm_min_ps(inputVal4, vmax_val), vmin_val); ret4 = _mm_max_ps(_mm_min_ps(inputVal4, vmax_val), vmin_val);
intInputVal1 = _mm_cvtps_epi32(ret1); intInputVal1 = _mm_cvtps_epi32(ret1);
intInputVal2 = _mm_cvtps_epi32(ret2); intInputVal2 = _mm_cvtps_epi32(ret2);
intInputVal3 = _mm_cvtps_epi32(ret3); intInputVal3 = _mm_cvtps_epi32(ret3);
intInputVal4 = _mm_cvtps_epi32(ret4); intInputVal4 = _mm_cvtps_epi32(ret4);
intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
intInputVal2 = _mm_packs_epi32(intInputVal3, intInputVal4); intInputVal2 = _mm_packs_epi32(intInputVal3, intInputVal4);
int8InputVal = _mm_packs_epi16(intInputVal1, intInputVal2); int8InputVal = _mm_packs_epi16(intInputVal1, intInputVal2);
_mm_storeu_si128((__m128i*)outputVectorPtr, int8InputVal); _mm_storeu_si128((__m128i*)outputVectorPtr, int8InputVal);
outputVectorPtr += 16; outputVectorPtr += 16;
} }
float scaled = 0; float scaled = 0;
for(unsigned int i = 0; i < (num_points%4)*4; i++){ for(unsigned int i = 0; i < (num_points%4)*4; i++)
scaled = inputVectorPtr[i]/scalar; {
if(scaled > max_val) scaled = inputVectorPtr[i]/scalar;
scaled = max_val; if(scaled > max_val)
else if(scaled < min_val) scaled = max_val;
scaled = min_val; else if(scaled < min_val)
outputVectorPtr[i] = (int8_t)rintf(scaled); scaled = min_val;
} outputVectorPtr[i] = (int8_t)rintf(scaled);
}
} }
#endif /* LV_HAVE_SSE2 */ #endif /* LV_HAVE_SSE2 */
@ -110,21 +113,23 @@ static inline void volk_gnsssdr_32fc_s32f_convert_8ic_u_sse2(lv_8sc_t* outputVec
\param outputVector The 16 bit output data buffer \param outputVector The 16 bit output data buffer
\param num_points The number of data values to be converted \param num_points The number of data values to be converted
*/ */
static inline void volk_gnsssdr_32fc_s32f_convert_8ic_generic(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, const float scalar, unsigned int num_points){ static inline void volk_gnsssdr_32fc_s32f_convert_8ic_generic(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, const float scalar, unsigned int num_points)
{
float* inputVectorPtr = (float*)inputVector; float* inputVectorPtr = (float*)inputVector;
int8_t* outputVectorPtr = (int8_t*)outputVector; int8_t* outputVectorPtr = (int8_t*)outputVector;
float scaled = 0; float scaled = 0;
float min_val = -128; float min_val = -128;
float max_val = 127; float max_val = 127;
for(unsigned int i = 0; i < num_points*2; i++){ for(unsigned int i = 0; i < num_points*2; i++)
scaled = (inputVectorPtr[i])/scalar; {
if(scaled > max_val) scaled = (inputVectorPtr[i])/scalar;
scaled = max_val; if(scaled > max_val)
else if(scaled < min_val) scaled = max_val;
scaled = min_val; else if(scaled < min_val)
outputVectorPtr[i] = (int8_t)rintf(scaled); scaled = min_val;
} outputVectorPtr[i] = (int8_t)rintf(scaled);
}
} }
#endif /* LV_HAVE_GENERIC */ #endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_volk_gnsssdr_32fc_s32f_convert_8ic_u_H */ #endif /* INCLUDED_volk_gnsssdr_32fc_s32f_convert_8ic_u_H */
@ -146,7 +151,8 @@ static inline void volk_gnsssdr_32fc_s32f_convert_8ic_generic(lv_8sc_t* outputVe
\param outputVector The 16 bit output data buffer \param outputVector The 16 bit output data buffer
\param num_points The number of data values to be converted \param num_points The number of data values to be converted
*/ */
static inline void volk_gnsssdr_32fc_s32f_convert_8ic_a_sse2(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, const float scalar, unsigned int num_points){ static inline void volk_gnsssdr_32fc_s32f_convert_8ic_a_sse2(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, const float scalar, unsigned int num_points)
{
const unsigned int sse_iters = num_points/8; const unsigned int sse_iters = num_points/8;
float* inputVectorPtr = (float*)inputVector; float* inputVectorPtr = (float*)inputVector;
@ -163,44 +169,46 @@ static inline void volk_gnsssdr_32fc_s32f_convert_8ic_a_sse2(lv_8sc_t* outputVec
__m128 vmin_val = _mm_set_ps1(min_val); __m128 vmin_val = _mm_set_ps1(min_val);
__m128 vmax_val = _mm_set_ps1(max_val); __m128 vmax_val = _mm_set_ps1(max_val);
for(unsigned int i = 0;i < sse_iters; i++){ for(unsigned int i = 0;i < sse_iters; i++)
inputVal1 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; {
inputVal2 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; inputVal1 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
inputVal3 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; inputVal2 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
inputVal4 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; inputVal3 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
inputVal4 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
inputVal1 = _mm_mul_ps(inputVal1, invScalar); inputVal1 = _mm_mul_ps(inputVal1, invScalar);
inputVal2 = _mm_mul_ps(inputVal2, invScalar); inputVal2 = _mm_mul_ps(inputVal2, invScalar);
inputVal3 = _mm_mul_ps(inputVal3, invScalar); inputVal3 = _mm_mul_ps(inputVal3, invScalar);
inputVal4 = _mm_mul_ps(inputVal4, invScalar); inputVal4 = _mm_mul_ps(inputVal4, invScalar);
// Clip // Clip
ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val); ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val); ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
ret3 = _mm_max_ps(_mm_min_ps(inputVal3, vmax_val), vmin_val); ret3 = _mm_max_ps(_mm_min_ps(inputVal3, vmax_val), vmin_val);
ret4 = _mm_max_ps(_mm_min_ps(inputVal4, vmax_val), vmin_val); ret4 = _mm_max_ps(_mm_min_ps(inputVal4, vmax_val), vmin_val);
intInputVal1 = _mm_cvtps_epi32(ret1); intInputVal1 = _mm_cvtps_epi32(ret1);
intInputVal2 = _mm_cvtps_epi32(ret2); intInputVal2 = _mm_cvtps_epi32(ret2);
intInputVal3 = _mm_cvtps_epi32(ret3); intInputVal3 = _mm_cvtps_epi32(ret3);
intInputVal4 = _mm_cvtps_epi32(ret4); intInputVal4 = _mm_cvtps_epi32(ret4);
intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
intInputVal2 = _mm_packs_epi32(intInputVal3, intInputVal4); intInputVal2 = _mm_packs_epi32(intInputVal3, intInputVal4);
int8InputVal = _mm_packs_epi16(intInputVal1, intInputVal2); int8InputVal = _mm_packs_epi16(intInputVal1, intInputVal2);
_mm_store_si128((__m128i*)outputVectorPtr, int8InputVal); _mm_store_si128((__m128i*)outputVectorPtr, int8InputVal);
outputVectorPtr += 16; outputVectorPtr += 16;
} }
float scaled = 0; float scaled = 0;
for(unsigned int i = 0; i < (num_points%4)*4; i++){ for(unsigned int i = 0; i < (num_points%4)*4; i++)
scaled = inputVectorPtr[i]/scalar; {
if(scaled > max_val) scaled = inputVectorPtr[i]/scalar;
scaled = max_val; if(scaled > max_val)
else if(scaled < min_val) scaled = max_val;
scaled = min_val; else if(scaled < min_val)
outputVectorPtr[i] = (int8_t)rintf(scaled); scaled = min_val;
} outputVectorPtr[i] = (int8_t)rintf(scaled);
}
} }
#endif /* LV_HAVE_SSE2 */ #endif /* LV_HAVE_SSE2 */
@ -211,21 +219,23 @@ static inline void volk_gnsssdr_32fc_s32f_convert_8ic_a_sse2(lv_8sc_t* outputVec
\param outputVector The 16 bit output data buffer \param outputVector The 16 bit output data buffer
\param num_points The number of data values to be converted \param num_points The number of data values to be converted
*/ */
static inline void volk_gnsssdr_32fc_s32f_convert_8ic_a_generic(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, const float scalar, unsigned int num_points){ static inline void volk_gnsssdr_32fc_s32f_convert_8ic_a_generic(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, const float scalar, unsigned int num_points)
{
float* inputVectorPtr = (float*)inputVector; float* inputVectorPtr = (float*)inputVector;
int8_t* outputVectorPtr = (int8_t*)outputVector; int8_t* outputVectorPtr = (int8_t*)outputVector;
float scaled = 0; float scaled = 0;
float min_val = -128; float min_val = -128;
float max_val = 127; float max_val = 127;
for(unsigned int i = 0; i < num_points*2; i++){ for(unsigned int i = 0; i < num_points*2; i++)
scaled = inputVectorPtr[i]/scalar; {
if(scaled > max_val) scaled = inputVectorPtr[i]/scalar;
scaled = max_val; if(scaled > max_val)
else if(scaled < min_val) scaled = max_val;
scaled = min_val; else if(scaled < min_val)
outputVectorPtr[i] = (int8_t)rintf(scaled); scaled = min_val;
} outputVectorPtr[i] = (int8_t)rintf(scaled);
}
} }
#endif /* LV_HAVE_GENERIC */ #endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_volk_gnsssdr_32fc_s32f_convert_8ic_a_H */ #endif /* INCLUDED_volk_gnsssdr_32fc_s32f_convert_8ic_a_H */

View File

@ -2,7 +2,7 @@
* \file volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc * \file volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc
* \brief Volk protokernel: replaces the tracking function for update_local_code * \brief Volk protokernel: replaces the tracking function for update_local_code
* \authors <ul> * \authors <ul>
* <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com * <li> Andres Cecilia, 2014. a.cecilia.luque(at)gmail.com
* </ul> * </ul>
* *
* Volk protokernel that replaces the tracking function for update_local_code * Volk protokernel that replaces the tracking function for update_local_code
@ -19,7 +19,7 @@
* GNSS-SDR is free software: you can redistribute it and/or modify * GNSS-SDR is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by * it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or * the Free Software Foundation, either version 3 of the License, or
* at your option) any later version. * (at your option) any later version.
* *
* GNSS-SDR is distributed in the hope that it will be useful, * GNSS-SDR is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of * but WITHOUT ANY WARRANTY; without even the implied warranty of
@ -43,23 +43,14 @@
#ifdef LV_HAVE_SSE4_1 #ifdef LV_HAVE_SSE4_1
#include <smmintrin.h> #include <smmintrin.h>
/*! /*!
\brief Takes the conjugate of a complex vector. \brief Takes the conjugate of a complex vector.
\param cVector The vector where the results will be stored \param cVector The vector where the results will be stored
\param aVector Vector to be conjugated \param aVector Vector to be conjugated
\param num_points The number of complex values in aVector to be conjugated and stored into cVector \param num_points The number of complex values in aVector to be conjugated and stored into cVector
*/ */
static inline void volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_u_sse4_1(lv_32fc_t* d_very_early_code, const float d_very_early_late_spc_chips, const float code_length_half_chips, const float code_phase_step_half_chips, const float tcode_half_chips_input, const lv_32fc_t* d_ca_code, unsigned int num_points){ static inline void volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_u_sse4_1(lv_32fc_t* d_very_early_code, const float d_very_early_late_spc_chips, const float code_length_half_chips, const float code_phase_step_half_chips, const float tcode_half_chips_input, const lv_32fc_t* d_ca_code, unsigned int num_points)
{
// float* pointer1 = (float*)&d_very_early_late_spc_chips;
// *pointer1 = 1;
// float* pointer2 = (float*)&code_length_half_chips;
// *pointer2 = 6;
// float* pointer3 = (float*)&code_phase_step_half_chips;
// *pointer3 = 7;
// float* pointer4 = (float*)&tcode_half_chips_input;
// *pointer4 = 8;
const unsigned int sse_iters = num_points / 4; const unsigned int sse_iters = num_points / 4;
__m128 tquot, fmod_num, fmod_result, associated_chip_index_array; __m128 tquot, fmod_num, fmod_result, associated_chip_index_array;
@ -74,76 +65,67 @@ static inline void volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_u_sse4_1(lv_
__VOLK_ATTR_ALIGNED(16) int32_t output[4]; __VOLK_ATTR_ALIGNED(16) int32_t output[4];
for (unsigned int i = 0; i < sse_iters; i++) for (unsigned int i = 0; i < sse_iters; i++)
{
//fmod = numer - tquot * denom; tquot = numer/denom truncated
//associated_chip_index = 2 + round(fmod(tcode_half_chips - 2*d_very_early_late_spc_chips, code_length_half_chips));
fmod_num = _mm_sub_ps (tcode_half_chips_array, d_very_early_late_spc_chips_Multiplied_by_2);
tquot = _mm_div_ps (fmod_num, code_length_half_chips_array);
tquot = _mm_round_ps (tquot, (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) );
fmod_result = _mm_sub_ps (fmod_num, _mm_mul_ps (tquot, code_length_half_chips_array));
associated_chip_index_array = _mm_round_ps (fmod_result, (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC));
associated_chip_index_array = _mm_add_ps(twos, associated_chip_index_array);
associated_chip_index_array_int = _mm_cvtps_epi32 (associated_chip_index_array);
_mm_storeu_si128 ((__m128i*)output, associated_chip_index_array_int);
//d_very_early_code[i] = d_ca_code[associated_chip_index];
*d_very_early_code++ = d_ca_code[output[0]];
*d_very_early_code++ = d_ca_code[output[1]];
*d_very_early_code++ = d_ca_code[output[2]];
*d_very_early_code++ = d_ca_code[output[3]];
//tcode_half_chips = tcode_half_chips + code_phase_step_half_chips;
tcode_half_chips_array = _mm_add_ps (tcode_half_chips_array, code_phase_step_half_chips_array);
}
if (num_points%4!=0)
{
__VOLK_ATTR_ALIGNED(16) float tcode_half_chips_stored[4];
_mm_storeu_ps ((float*)tcode_half_chips_stored, tcode_half_chips_array);
int associated_chip_index;
float tcode_half_chips = tcode_half_chips_stored[0];
float d_very_early_late_spc_chips_multiplied_by_2 = 2*d_very_early_late_spc_chips;
for (unsigned int i = 0; i < num_points%4; i++)
{ {
associated_chip_index = 2 + round(fmod(tcode_half_chips - d_very_early_late_spc_chips_multiplied_by_2, code_length_half_chips)); //fmod = numer - tquot * denom; tquot = numer/denom truncated
d_very_early_code[i] = d_ca_code[associated_chip_index]; //associated_chip_index = 2 + round(fmod(tcode_half_chips - 2*d_very_early_late_spc_chips, code_length_half_chips));
tcode_half_chips = tcode_half_chips + code_phase_step_half_chips; fmod_num = _mm_sub_ps (tcode_half_chips_array, d_very_early_late_spc_chips_Multiplied_by_2);
tquot = _mm_div_ps (fmod_num, code_length_half_chips_array);
tquot = _mm_round_ps (tquot, (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) );
fmod_result = _mm_sub_ps (fmod_num, _mm_mul_ps (tquot, code_length_half_chips_array));
associated_chip_index_array = _mm_round_ps (fmod_result, (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC));
associated_chip_index_array = _mm_add_ps(twos, associated_chip_index_array);
associated_chip_index_array_int = _mm_cvtps_epi32 (associated_chip_index_array);
_mm_storeu_si128 ((__m128i*)output, associated_chip_index_array_int);
//d_very_early_code[i] = d_ca_code[associated_chip_index];
*d_very_early_code++ = d_ca_code[output[0]];
*d_very_early_code++ = d_ca_code[output[1]];
*d_very_early_code++ = d_ca_code[output[2]];
*d_very_early_code++ = d_ca_code[output[3]];
//tcode_half_chips = tcode_half_chips + code_phase_step_half_chips;
tcode_half_chips_array = _mm_add_ps (tcode_half_chips_array, code_phase_step_half_chips_array);
}
if (num_points % 4 != 0)
{
__VOLK_ATTR_ALIGNED(16) float tcode_half_chips_stored[4];
_mm_storeu_ps ((float*)tcode_half_chips_stored, tcode_half_chips_array);
int associated_chip_index;
float tcode_half_chips = tcode_half_chips_stored[0];
float d_very_early_late_spc_chips_multiplied_by_2 = 2*d_very_early_late_spc_chips;
for (unsigned int i = 0; i < num_points%4; i++)
{
associated_chip_index = 2 + round(fmod(tcode_half_chips - d_very_early_late_spc_chips_multiplied_by_2, code_length_half_chips));
d_very_early_code[i] = d_ca_code[associated_chip_index];
tcode_half_chips = tcode_half_chips + code_phase_step_half_chips;
}
} }
}
} }
#endif /* LV_HAVE_SSE4_1 */ #endif /* LV_HAVE_SSE4_1 */
#ifdef LV_HAVE_GENERIC #ifdef LV_HAVE_GENERIC
/*!
 \brief Regenerates the local very early (VE) code: for each output sample it derives the
        associated chip index from the current code phase (in half chips) and copies the
        corresponding sample of the reference code
 \param d_very_early_code Output vector holding the regenerated very early code
 \param d_very_early_late_spc_chips Very-early/late spacing, in chips
 \param code_length_half_chips Code length, in half chips (modulus of the fmod)
 \param code_phase_step_half_chips Code phase increment per output sample, in half chips
 \param tcode_half_chips_input Initial code phase, in half chips
 \param d_ca_code Reference code vector (indexed with a +2 offset)
 \param num_points Number of output samples to generate
 */
static inline void volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_generic(lv_32fc_t* d_very_early_code, const float d_very_early_late_spc_chips, const float code_length_half_chips, const float code_phase_step_half_chips, const float tcode_half_chips_input, const lv_32fc_t* d_ca_code, unsigned int num_points)
{
    /* Generic (non-SIMD) regeneration of the very early local code.
     * The code phase advances by a fixed step per sample; the chip index is the
     * phase shifted back by twice the VE/L spacing, wrapped to the code length,
     * rounded, and offset by 2 to index the reference code. */
    const float early_late_offset = 2 * d_very_early_late_spc_chips;  // 2 * spacing, hoisted out of the loop
    float phase = tcode_half_chips_input;                             // running code phase, in half chips
    for (unsigned int n = 0; n < num_points; n++)
        {
            const int chip_index = 2 + round(fmod(phase - early_late_offset, code_length_half_chips));
            d_very_early_code[n] = d_ca_code[chip_index];
            phase += code_phase_step_half_chips;
        }
}
#endif /* LV_HAVE_GENERIC */ #endif /* LV_HAVE_GENERIC */
@ -159,23 +141,14 @@ static inline void volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_generic(lv_3
#ifdef LV_HAVE_SSE4_1 #ifdef LV_HAVE_SSE4_1
#include <smmintrin.h> #include <smmintrin.h>
/*!
 \brief Regenerates the local very early (VE) code (SSE4.1, aligned): for each output
        sample it derives the associated chip index from the current code phase
        (in half chips) and copies the corresponding sample of the reference code
 \param d_very_early_code Output vector holding the regenerated very early code
 \param d_very_early_late_spc_chips Very-early/late spacing, in chips
 \param code_length_half_chips Code length, in half chips (modulus of the fmod)
 \param code_phase_step_half_chips Code phase increment per output sample, in half chips
 \param tcode_half_chips_input Initial code phase, in half chips
 \param d_ca_code Reference code vector (indexed with a +2 offset)
 \param num_points Number of output samples to generate
 */
static inline void volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_a_sse4_1(lv_32fc_t* d_very_early_code, const float d_very_early_late_spc_chips, const float code_length_half_chips, const float code_phase_step_half_chips, const float tcode_half_chips_input, const lv_32fc_t* d_ca_code, unsigned int num_points){ static inline void volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_a_sse4_1(lv_32fc_t* d_very_early_code, const float d_very_early_late_spc_chips, const float code_length_half_chips, const float code_phase_step_half_chips, const float tcode_half_chips_input, const lv_32fc_t* d_ca_code, unsigned int num_points)
{
// float* pointer1 = (float*)&d_very_early_late_spc_chips;
// *pointer1 = 1;
// float* pointer2 = (float*)&code_length_half_chips;
// *pointer2 = 6;
// float* pointer3 = (float*)&code_phase_step_half_chips;
// *pointer3 = 7;
// float* pointer4 = (float*)&tcode_half_chips_input;
// *pointer4 = 8;
const unsigned int sse_iters = num_points / 4; const unsigned int sse_iters = num_points / 4;
__m128 tquot, fmod_num, fmod_result, associated_chip_index_array; __m128 tquot, fmod_num, fmod_result, associated_chip_index_array;
@ -190,77 +163,68 @@ static inline void volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_a_sse4_1(lv_
__VOLK_ATTR_ALIGNED(16) int32_t output[4]; __VOLK_ATTR_ALIGNED(16) int32_t output[4];
for (unsigned int i = 0; i < sse_iters; i++) for (unsigned int i = 0; i < sse_iters; i++)
{
//fmod = numer - tquot * denom; tquot = numer/denom truncated
//associated_chip_index = 2 + round(fmod(tcode_half_chips - 2*d_very_early_late_spc_chips, code_length_half_chips));
fmod_num = _mm_sub_ps (tcode_half_chips_array, d_very_early_late_spc_chips_Multiplied_by_2);
tquot = _mm_div_ps (fmod_num, code_length_half_chips_array);
tquot = _mm_round_ps (tquot, (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) );
fmod_result = _mm_sub_ps (fmod_num, _mm_mul_ps (tquot, code_length_half_chips_array));
associated_chip_index_array = _mm_round_ps (fmod_result, (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC));
associated_chip_index_array = _mm_add_ps(twos, associated_chip_index_array);
associated_chip_index_array_int = _mm_cvtps_epi32 (associated_chip_index_array);
_mm_store_si128 ((__m128i*)output, associated_chip_index_array_int);
//d_very_early_code[i] = d_ca_code[associated_chip_index];
*d_very_early_code++ = d_ca_code[output[0]];
*d_very_early_code++ = d_ca_code[output[1]];
*d_very_early_code++ = d_ca_code[output[2]];
*d_very_early_code++ = d_ca_code[output[3]];
//tcode_half_chips = tcode_half_chips + code_phase_step_half_chips;
tcode_half_chips_array = _mm_add_ps (tcode_half_chips_array, code_phase_step_half_chips_array);
}
if (num_points%4!=0)
{
__VOLK_ATTR_ALIGNED(16) float tcode_half_chips_stored[4];
_mm_storeu_ps ((float*)tcode_half_chips_stored, tcode_half_chips_array);
int associated_chip_index;
float tcode_half_chips = tcode_half_chips_stored[0];
float d_very_early_late_spc_chips_multiplied_by_2 = 2*d_very_early_late_spc_chips;
for (unsigned int i = 0; i < num_points%4; i++)
{ {
associated_chip_index = 2 + round(fmod(tcode_half_chips - d_very_early_late_spc_chips_multiplied_by_2, code_length_half_chips)); //fmod = numer - tquot * denom; tquot = numer/denom truncated
d_very_early_code[i] = d_ca_code[associated_chip_index]; //associated_chip_index = 2 + round(fmod(tcode_half_chips - 2*d_very_early_late_spc_chips, code_length_half_chips));
tcode_half_chips = tcode_half_chips + code_phase_step_half_chips; fmod_num = _mm_sub_ps (tcode_half_chips_array, d_very_early_late_spc_chips_Multiplied_by_2);
tquot = _mm_div_ps (fmod_num, code_length_half_chips_array);
tquot = _mm_round_ps (tquot, (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) );
fmod_result = _mm_sub_ps (fmod_num, _mm_mul_ps (tquot, code_length_half_chips_array));
associated_chip_index_array = _mm_round_ps (fmod_result, (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC));
associated_chip_index_array = _mm_add_ps(twos, associated_chip_index_array);
associated_chip_index_array_int = _mm_cvtps_epi32 (associated_chip_index_array);
_mm_store_si128 ((__m128i*)output, associated_chip_index_array_int);
//d_very_early_code[i] = d_ca_code[associated_chip_index];
*d_very_early_code++ = d_ca_code[output[0]];
*d_very_early_code++ = d_ca_code[output[1]];
*d_very_early_code++ = d_ca_code[output[2]];
*d_very_early_code++ = d_ca_code[output[3]];
//tcode_half_chips = tcode_half_chips + code_phase_step_half_chips;
tcode_half_chips_array = _mm_add_ps (tcode_half_chips_array, code_phase_step_half_chips_array);
}
if (num_points % 4 != 0)
{
__VOLK_ATTR_ALIGNED(16) float tcode_half_chips_stored[4];
_mm_storeu_ps ((float*)tcode_half_chips_stored, tcode_half_chips_array);
int associated_chip_index;
float tcode_half_chips = tcode_half_chips_stored[0];
float d_very_early_late_spc_chips_multiplied_by_2 = 2*d_very_early_late_spc_chips;
for (unsigned int i = 0; i < num_points%4; i++)
{
associated_chip_index = 2 + round(fmod(tcode_half_chips - d_very_early_late_spc_chips_multiplied_by_2, code_length_half_chips));
d_very_early_code[i] = d_ca_code[associated_chip_index];
tcode_half_chips = tcode_half_chips + code_phase_step_half_chips;
}
} }
}
} }
#endif /* LV_HAVE_SSE4_1 */ #endif /* LV_HAVE_SSE4_1 */
#ifdef LV_HAVE_GENERIC #ifdef LV_HAVE_GENERIC
/*!
 \brief Regenerates the local very early (VE) code (generic, aligned entry point):
        for each output sample it derives the associated chip index from the current
        code phase (in half chips) and copies the corresponding sample of the reference code
 \param d_very_early_code Output vector holding the regenerated very early code
 \param d_very_early_late_spc_chips Very-early/late spacing, in chips
 \param code_length_half_chips Code length, in half chips (modulus of the fmod)
 \param code_phase_step_half_chips Code phase increment per output sample, in half chips
 \param tcode_half_chips_input Initial code phase, in half chips
 \param d_ca_code Reference code vector (indexed with a +2 offset)
 \param num_points Number of output samples to generate
 */
static inline void volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_a_generic(lv_32fc_t* d_very_early_code, const float d_very_early_late_spc_chips, const float code_length_half_chips, const float code_phase_step_half_chips, const float tcode_half_chips_input, const lv_32fc_t* d_ca_code, unsigned int num_points)
{
    /* Aligned generic implementation: identical algorithm to the unaligned generic
     * kernel (no alignment-specific work is needed in plain C). Walks the code
     * phase sample by sample and looks up the reference code at
     * 2 + round(fmod(phase - 2*spacing, code_length)). */
    const float twice_spacing = 2 * d_very_early_late_spc_chips;  // loop-invariant offset
    float half_chip_phase = tcode_half_chips_input;               // current phase, half chips
    unsigned int n = 0;
    while (n < num_points)
        {
            const int ref_index = 2 + round(fmod(half_chip_phase - twice_spacing, code_length_half_chips));
            d_very_early_code[n] = d_ca_code[ref_index];
            half_chip_phase += code_phase_step_half_chips;
            ++n;
        }
}
#endif /* LV_HAVE_GENERIC */ #endif /* LV_HAVE_GENERIC */

View File

@ -2,8 +2,8 @@
* \file volk_gnsssdr_32fc_x5_cw_vepl_corr_32fc_x5 * \file volk_gnsssdr_32fc_x5_cw_vepl_corr_32fc_x5
* \brief Volk protokernel: performs the carrier wipe-off mixing and the Early, Prompt and Late correlation with 64 bits vectors * \brief Volk protokernel: performs the carrier wipe-off mixing and the Early, Prompt and Late correlation with 64 bits vectors
* \authors <ul> * \authors <ul>
* <li>Javier Arribas, 2011. jarribas(at)cttc.es * <li> Javier Arribas, 2011. jarribas(at)cttc.es
* <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com * <li> Andres Cecilia, 2014. a.cecilia.luque(at)gmail.com
* </ul> * </ul>
* *
* Volk protokernel that performs the carrier wipe-off mixing and the * Volk protokernel that performs the carrier wipe-off mixing and the
@ -31,7 +31,7 @@
* GNSS-SDR is free software: you can redistribute it and/or modify * GNSS-SDR is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by * it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or * the Free Software Foundation, either version 3 of the License, or
* at your option) any later version. * (at your option) any later version.
* *
* GNSS-SDR is distributed in the hope that it will be useful, * GNSS-SDR is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of * but WITHOUT ANY WARRANTY; without even the implied warranty of
@ -60,7 +60,7 @@
#include <pmmintrin.h> #include <pmmintrin.h>
/*! /*!
\brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation
\param input The input signal input \param input The input signal input
\param carrier The carrier signal input \param carrier The carrier signal input
@ -71,10 +71,10 @@
\param P_out Early correlation output \param P_out Early correlation output
\param L_out Early correlation output \param L_out Early correlation output
\param num_points The number of complex values in vectors \param num_points The number of complex values in vectors
*/ */
static inline void volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_u_sse3(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, unsigned int num_points) static inline void volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_u_sse3(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, unsigned int num_points)
{ {
unsigned int number = 0; unsigned int number = 0;
const unsigned int halfPoints = num_points / 2; const unsigned int halfPoints = num_points / 2;
lv_32fc_t dotProduct_E; lv_32fc_t dotProduct_E;
@ -100,90 +100,90 @@ static inline void volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_u_sse3(lv_32fc_t* E_
const lv_32fc_t* _L_code = L_code; const lv_32fc_t* _L_code = L_code;
for(;number < halfPoints; number++) for(;number < halfPoints; number++)
{ {
// carrier wipe-off (vector point-to-point product) // carrier wipe-off (vector point-to-point product)
x = _mm_loadu_ps((float*)_input); // Load the ar + ai, br + bi as ar,ai,br,bi x = _mm_loadu_ps((float*)_input); // Load the ar + ai, br + bi as ar,ai,br,bi
y = _mm_loadu_ps((float*)_carrier); // Load the cr + ci, dr + di as cr,ci,dr,di y = _mm_loadu_ps((float*)_carrier); // Load the cr + ci, dr + di as cr,ci,dr,di
yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
//_mm_storeu_ps((float*)_input_BB,z); // Store the results back into the _input_BB container //_mm_storeu_ps((float*)_input_BB,z); // Store the results back into the _input_BB container
// correlation E,P,L (3x vector scalar product) // correlation E,P,L (3x vector scalar product)
// Early // Early
//x = _mm_load_ps((float*)_input_BB); // Load the ar + ai, br + bi as ar,ai,br,bi //x = _mm_load_ps((float*)_input_BB); // Load the ar + ai, br + bi as ar,ai,br,bi
x = z; x = z;
y = _mm_load_ps((float*)_E_code); // Load the cr + ci, dr + di as cr,ci,dr,di y = _mm_load_ps((float*)_E_code); // Load the cr + ci, dr + di as cr,ci,dr,di
yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
z_E = _mm_add_ps(z_E, z); // Add the complex multiplication results together z_E = _mm_add_ps(z_E, z); // Add the complex multiplication results together
// Prompt // Prompt
//x = _mm_load_ps((float*)_input_BB); // Load the ar + ai, br + bi as ar,ai,br,bi //x = _mm_load_ps((float*)_input_BB); // Load the ar + ai, br + bi as ar,ai,br,bi
y = _mm_load_ps((float*)_P_code); // Load the cr + ci, dr + di as cr,ci,dr,di y = _mm_load_ps((float*)_P_code); // Load the cr + ci, dr + di as cr,ci,dr,di
yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
z_P = _mm_add_ps(z_P, z); // Add the complex multiplication results together z_P = _mm_add_ps(z_P, z); // Add the complex multiplication results together
// Late // Late
//x = _mm_load_ps((float*)_input_BB); // Load the ar + ai, br + bi as ar,ai,br,bi //x = _mm_load_ps((float*)_input_BB); // Load the ar + ai, br + bi as ar,ai,br,bi
y = _mm_load_ps((float*)_L_code); // Load the cr + ci, dr + di as cr,ci,dr,di y = _mm_load_ps((float*)_L_code); // Load the cr + ci, dr + di as cr,ci,dr,di
yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
z_L = _mm_add_ps(z_L, z); // Add the complex multiplication results together z_L = _mm_add_ps(z_L, z); // Add the complex multiplication results together
/*pointer increment*/ /*pointer increment*/
_carrier += 2; _carrier += 2;
_input += 2; _input += 2;
//_input_BB += 2; //_input_BB += 2;
_E_code += 2; _E_code += 2;
_P_code += 2; _P_code += 2;
_L_code +=2; _L_code +=2;
} }
__VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_E[2]; __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_E[2];
__VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_P[2]; __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_P[2];
@ -199,12 +199,12 @@ static inline void volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_u_sse3(lv_32fc_t* E_
dotProduct_L += ( dotProductVector_L[0] + dotProductVector_L[1] ); dotProduct_L += ( dotProductVector_L[0] + dotProductVector_L[1] );
if((num_points % 2) != 0) if((num_points % 2) != 0)
{ {
//_input_BB = (*_input) * (*_carrier); //_input_BB = (*_input) * (*_carrier);
dotProduct_E += (*_input) * (*_E_code)*(*_carrier); dotProduct_E += (*_input) * (*_E_code)*(*_carrier);
dotProduct_P += (*_input) * (*_P_code)*(*_carrier); dotProduct_P += (*_input) * (*_P_code)*(*_carrier);
dotProduct_L += (*_input) * (*_L_code)*(*_carrier); dotProduct_L += (*_input) * (*_L_code)*(*_carrier);
} }
*E_out = dotProduct_E; *E_out = dotProduct_E;
*P_out = dotProduct_P; *P_out = dotProduct_P;
@ -237,14 +237,14 @@ static inline void volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_generic(lv_32fc_t* E
*L_out = 0; *L_out = 0;
// perform Early, Prompt and Late correlation // perform Early, Prompt and Late correlation
for(unsigned int i=0; i < num_points; ++i) for(unsigned int i=0; i < num_points; ++i)
{ {
//Perform the carrier wipe-off //Perform the carrier wipe-off
bb_signal_sample = input[i] * carrier[i]; bb_signal_sample = input[i] * carrier[i];
// Now get early, late, and prompt values for each // Now get early, late, and prompt values for each
*E_out += bb_signal_sample * E_code[i]; *E_out += bb_signal_sample * E_code[i];
*P_out += bb_signal_sample * P_code[i]; *P_out += bb_signal_sample * P_code[i];
*L_out += bb_signal_sample * L_code[i]; *L_out += bb_signal_sample * L_code[i];
} }
} }
#endif /* LV_HAVE_GENERIC */ #endif /* LV_HAVE_GENERIC */
@ -277,7 +277,7 @@ static inline void volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_generic(lv_32fc_t* E
*/ */
static inline void volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_a_sse3(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, unsigned int num_points) static inline void volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_a_sse3(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, unsigned int num_points)
{ {
unsigned int number = 0; unsigned int number = 0;
const unsigned int halfPoints = num_points / 2; const unsigned int halfPoints = num_points / 2;
lv_32fc_t dotProduct_E; lv_32fc_t dotProduct_E;
@ -303,90 +303,90 @@ static inline void volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_a_sse3(lv_32fc_t* E_
const lv_32fc_t* _L_code = L_code; const lv_32fc_t* _L_code = L_code;
for(;number < halfPoints; number++) for(;number < halfPoints; number++)
{ {
// carrier wipe-off (vector point-to-point product) // carrier wipe-off (vector point-to-point product)
x = _mm_load_ps((float*)_input); // Load the ar + ai, br + bi as ar,ai,br,bi x = _mm_load_ps((float*)_input); // Load the ar + ai, br + bi as ar,ai,br,bi
y = _mm_load_ps((float*)_carrier); // Load the cr + ci, dr + di as cr,ci,dr,di y = _mm_load_ps((float*)_carrier); // Load the cr + ci, dr + di as cr,ci,dr,di
yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
//_mm_storeu_ps((float*)_input_BB,z); // Store the results back into the _input_BB container //_mm_storeu_ps((float*)_input_BB,z); // Store the results back into the _input_BB container
// correlation E,P,L (3x vector scalar product) // correlation E,P,L (3x vector scalar product)
// Early // Early
//x = _mm_load_ps((float*)_input_BB); // Load the ar + ai, br + bi as ar,ai,br,bi //x = _mm_load_ps((float*)_input_BB); // Load the ar + ai, br + bi as ar,ai,br,bi
x = z; x = z;
y = _mm_load_ps((float*)_E_code); // Load the cr + ci, dr + di as cr,ci,dr,di y = _mm_load_ps((float*)_E_code); // Load the cr + ci, dr + di as cr,ci,dr,di
yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
z_E = _mm_add_ps(z_E, z); // Add the complex multiplication results together z_E = _mm_add_ps(z_E, z); // Add the complex multiplication results together
// Prompt // Prompt
//x = _mm_load_ps((float*)_input_BB); // Load the ar + ai, br + bi as ar,ai,br,bi //x = _mm_load_ps((float*)_input_BB); // Load the ar + ai, br + bi as ar,ai,br,bi
y = _mm_load_ps((float*)_P_code); // Load the cr + ci, dr + di as cr,ci,dr,di y = _mm_load_ps((float*)_P_code); // Load the cr + ci, dr + di as cr,ci,dr,di
yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
z_P = _mm_add_ps(z_P, z); // Add the complex multiplication results together z_P = _mm_add_ps(z_P, z); // Add the complex multiplication results together
// Late // Late
//x = _mm_load_ps((float*)_input_BB); // Load the ar + ai, br + bi as ar,ai,br,bi //x = _mm_load_ps((float*)_input_BB); // Load the ar + ai, br + bi as ar,ai,br,bi
y = _mm_load_ps((float*)_L_code); // Load the cr + ci, dr + di as cr,ci,dr,di y = _mm_load_ps((float*)_L_code); // Load the cr + ci, dr + di as cr,ci,dr,di
yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
z_L = _mm_add_ps(z_L, z); // Add the complex multiplication results together z_L = _mm_add_ps(z_L, z); // Add the complex multiplication results together
/*pointer increment*/ /*pointer increment*/
_carrier += 2; _carrier += 2;
_input += 2; _input += 2;
//_input_BB += 2; //_input_BB += 2;
_E_code += 2; _E_code += 2;
_P_code += 2; _P_code += 2;
_L_code +=2; _L_code +=2;
} }
__VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_E[2]; __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_E[2];
__VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_P[2]; __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_P[2];
@ -402,12 +402,12 @@ static inline void volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_a_sse3(lv_32fc_t* E_
dotProduct_L += ( dotProductVector_L[0] + dotProductVector_L[1] ); dotProduct_L += ( dotProductVector_L[0] + dotProductVector_L[1] );
if((num_points % 2) != 0) if((num_points % 2) != 0)
{ {
//_input_BB = (*_input) * (*_carrier); //_input_BB = (*_input) * (*_carrier);
dotProduct_E += (*_input) * (*_E_code)*(*_carrier); dotProduct_E += (*_input) * (*_E_code)*(*_carrier);
dotProduct_P += (*_input) * (*_P_code)*(*_carrier); dotProduct_P += (*_input) * (*_P_code)*(*_carrier);
dotProduct_L += (*_input) * (*_L_code)*(*_carrier); dotProduct_L += (*_input) * (*_L_code)*(*_carrier);
} }
*E_out = dotProduct_E; *E_out = dotProduct_E;
*P_out = dotProduct_P; *P_out = dotProduct_P;
@ -440,14 +440,14 @@ static inline void volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_a_generic(lv_32fc_t*
*L_out = 0; *L_out = 0;
// perform Early, Prompt and Late correlation // perform Early, Prompt and Late correlation
for(unsigned int i=0; i < num_points; ++i) for(unsigned int i=0; i < num_points; ++i)
{ {
//Perform the carrier wipe-off //Perform the carrier wipe-off
bb_signal_sample = input[i] * carrier[i]; bb_signal_sample = input[i] * carrier[i];
// Now get early, late, and prompt values for each // Now get early, late, and prompt values for each
*E_out += bb_signal_sample * E_code[i]; *E_out += bb_signal_sample * E_code[i];
*P_out += bb_signal_sample * P_code[i]; *P_out += bb_signal_sample * P_code[i];
*L_out += bb_signal_sample * L_code[i]; *L_out += bb_signal_sample * L_code[i];
} }
} }
#endif /* LV_HAVE_GENERIC */ #endif /* LV_HAVE_GENERIC */

View File

@ -2,8 +2,8 @@
* \file volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5 * \file volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5
* \brief Volk protokernel: performs the carrier wipe-off mixing and the VE, Early, Prompt, Late and VL correlation with 64 bits vectors * \brief Volk protokernel: performs the carrier wipe-off mixing and the VE, Early, Prompt, Late and VL correlation with 64 bits vectors
* \authors <ul> * \authors <ul>
* <li>Javier Arribas, 2011. jarribas(at)cttc.es * <li> Javier Arribas, 2011. jarribas(at)cttc.es
* <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com * <li> Andres Cecilia, 2014. a.cecilia.luque(at)gmail.com
* </ul> * </ul>
* *
* Volk protokernel that performs the carrier wipe-off mixing and the * Volk protokernel that performs the carrier wipe-off mixing and the
@ -35,7 +35,7 @@
* GNSS-SDR is free software: you can redistribute it and/or modify * GNSS-SDR is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by * it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or * the Free Software Foundation, either version 3 of the License, or
* at your option) any later version. * (at your option) any later version.
* *
* GNSS-SDR is distributed in the hope that it will be useful, * GNSS-SDR is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of * but WITHOUT ANY WARRANTY; without even the implied warranty of
@ -77,7 +77,7 @@
*/ */
static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_u_avx(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* VE_code, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, const lv_32fc_t* VL_code, unsigned int num_points) static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_u_avx(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* VE_code, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, const lv_32fc_t* VL_code, unsigned int num_points)
{ {
unsigned int number = 0; unsigned int number = 0;
const unsigned int halfPoints = num_points / 4; const unsigned int halfPoints = num_points / 4;
lv_32fc_t dotProduct_VE; lv_32fc_t dotProduct_VE;
@ -106,93 +106,93 @@ static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_u_avx(lv_32fc_t* VE
const lv_32fc_t* _VL_code = VL_code; const lv_32fc_t* _VL_code = VL_code;
for(;number < halfPoints; number++) for(;number < halfPoints; number++)
{ {
// carrier wipe-off (vector point-to-point product) // carrier wipe-off (vector point-to-point product)
x = _mm256_loadu_ps((float*)_input); // Load the ar + ai, br + bi as ar,ai,br,bi x = _mm256_loadu_ps((float*)_input); // Load the ar + ai, br + bi as ar,ai,br,bi
y = _mm256_loadu_ps((float*)_carrier); // Load the cr + ci, dr + di as cr,ci,dr,di y = _mm256_loadu_ps((float*)_carrier); // Load the cr + ci, dr + di as cr,ci,dr,di
yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr
yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di
tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
bb_signal_sample = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di bb_signal_sample = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
bb_signal_sample_shuffled = _mm256_shuffle_ps(bb_signal_sample,bb_signal_sample,0xB1); // Re-arrange bb_signal_sample to be ai,ar,bi,br bb_signal_sample_shuffled = _mm256_shuffle_ps(bb_signal_sample,bb_signal_sample,0xB1); // Re-arrange bb_signal_sample to be ai,ar,bi,br
// correlation VE,E,P,L,VL (5x vector scalar product) // correlation VE,E,P,L,VL (5x vector scalar product)
// VE // VE
y = _mm256_loadu_ps((float*)_VE_code); // Load the cr + ci, dr + di as cr,ci,dr,di y = _mm256_loadu_ps((float*)_VE_code); // Load the cr + ci, dr + di as cr,ci,dr,di
yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr
yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di
tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
z_VE = _mm256_add_ps(z_VE, z); // Add the complex multiplication results together z_VE = _mm256_add_ps(z_VE, z); // Add the complex multiplication results together
// Early // Early
y = _mm256_loadu_ps((float*)_E_code); // Load the cr + ci, dr + di as cr,ci,dr,di y = _mm256_loadu_ps((float*)_E_code); // Load the cr + ci, dr + di as cr,ci,dr,di
yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr
yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di
tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
z_E = _mm256_add_ps(z_E, z); // Add the complex multiplication results together z_E = _mm256_add_ps(z_E, z); // Add the complex multiplication results together
// Prompt // Prompt
y = _mm256_loadu_ps((float*)_P_code); // Load the cr + ci, dr + di as cr,ci,dr,di y = _mm256_loadu_ps((float*)_P_code); // Load the cr + ci, dr + di as cr,ci,dr,di
yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr
yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di
tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
z_P = _mm256_add_ps(z_P, z); // Add the complex multiplication results together z_P = _mm256_add_ps(z_P, z); // Add the complex multiplication results together
// Late // Late
y = _mm256_loadu_ps((float*)_L_code); // Load the cr + ci, dr + di as cr,ci,dr,di y = _mm256_loadu_ps((float*)_L_code); // Load the cr + ci, dr + di as cr,ci,dr,di
yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr
yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di
tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
z_L = _mm256_add_ps(z_L, z); // Add the complex multiplication results together z_L = _mm256_add_ps(z_L, z); // Add the complex multiplication results together
// VL // VL
y = _mm256_loadu_ps((float*)_VL_code); // Load the cr + ci, dr + di as cr,ci,dr,di y = _mm256_loadu_ps((float*)_VL_code); // Load the cr + ci, dr + di as cr,ci,dr,di
yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr
yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di
tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
z_VL = _mm256_add_ps(z_VL, z); // Add the complex multiplication results together z_VL = _mm256_add_ps(z_VL, z); // Add the complex multiplication results together
/*pointer increment*/ /*pointer increment*/
_carrier += 4; _carrier += 4;
_input += 4; _input += 4;
_VE_code += 4; _VE_code += 4;
_E_code += 4; _E_code += 4;
_P_code += 4; _P_code += 4;
_L_code += 4; _L_code += 4;
_VL_code += 4; _VL_code += 4;
} }
__VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_VE[4]; __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_VE[4];
__VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_E[4]; __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_E[4];
@ -213,13 +213,13 @@ static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_u_avx(lv_32fc_t* VE
dotProduct_VL = ( dotProductVector_VL[0] + dotProductVector_VL[1] + dotProductVector_VL[2] + dotProductVector_VL[3] ); dotProduct_VL = ( dotProductVector_VL[0] + dotProductVector_VL[1] + dotProductVector_VL[2] + dotProductVector_VL[3] );
for (unsigned int i = 0; i<(num_points % 4); ++i) for (unsigned int i = 0; i<(num_points % 4); ++i)
{ {
dotProduct_VE += (*_input) * (*_VE_code++) * (*_carrier); dotProduct_VE += (*_input) * (*_VE_code++) * (*_carrier);
dotProduct_E += (*_input) * (*_E_code++) * (*_carrier); dotProduct_E += (*_input) * (*_E_code++) * (*_carrier);
dotProduct_P += (*_input) * (*_P_code++) * (*_carrier); dotProduct_P += (*_input) * (*_P_code++) * (*_carrier);
dotProduct_L += (*_input) * (*_L_code++) * (*_carrier); dotProduct_L += (*_input) * (*_L_code++) * (*_carrier);
dotProduct_VL += (*_input++) * (*_VL_code++) * (*_carrier++); dotProduct_VL += (*_input++) * (*_VL_code++) * (*_carrier++);
} }
*VE_out = dotProduct_VE; *VE_out = dotProduct_VE;
*E_out = dotProduct_E; *E_out = dotProduct_E;
@ -231,7 +231,7 @@ static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_u_avx(lv_32fc_t* VE
#ifdef LV_HAVE_SSE3 #ifdef LV_HAVE_SSE3
#include <pmmintrin.h> #include <pmmintrin.h>
/*! /*!
\brief Performs the carrier wipe-off mixing and the VE, Early, Prompt, Late and VL correlation \brief Performs the carrier wipe-off mixing and the VE, Early, Prompt, Late and VL correlation
\param input The input signal input \param input The input signal input
\param carrier The carrier signal input \param carrier The carrier signal input
@ -246,10 +246,10 @@ static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_u_avx(lv_32fc_t* VE
\param L_out Early correlation output \param L_out Early correlation output
\param VL_out VL correlation output \param VL_out VL correlation output
\param num_points The number of complex values in vectors \param num_points The number of complex values in vectors
*/ */
static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_u_sse3(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* VE_code, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, const lv_32fc_t* VL_code, unsigned int num_points) static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_u_sse3(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* VE_code, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, const lv_32fc_t* VL_code, unsigned int num_points)
{ {
unsigned int number = 0; unsigned int number = 0;
const unsigned int halfPoints = num_points / 2; const unsigned int halfPoints = num_points / 2;
lv_32fc_t dotProduct_VE; lv_32fc_t dotProduct_VE;
@ -278,94 +278,94 @@ static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_u_sse3(lv_32fc_t* V
const lv_32fc_t* _VL_code = VL_code; const lv_32fc_t* _VL_code = VL_code;
for(;number < halfPoints; number++) for(;number < halfPoints; number++)
{ {
// carrier wipe-off (vector point-to-point product) // carrier wipe-off (vector point-to-point product)
x = _mm_loadu_ps((float*)_input); // Load the ar + ai, br + bi as ar,ai,br,bi x = _mm_loadu_ps((float*)_input); // Load the ar + ai, br + bi as ar,ai,br,bi
y = _mm_loadu_ps((float*)_carrier); // Load the cr + ci, dr + di as cr,ci,dr,di y = _mm_loadu_ps((float*)_carrier); // Load the cr + ci, dr + di as cr,ci,dr,di
yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
bb_signal_sample = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di bb_signal_sample = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
bb_signal_sample_shuffled = _mm_shuffle_ps(bb_signal_sample,bb_signal_sample,0xB1); // Re-arrange bb_signal_sample to be ai,ar,bi,br bb_signal_sample_shuffled = _mm_shuffle_ps(bb_signal_sample,bb_signal_sample,0xB1); // Re-arrange bb_signal_sample to be ai,ar,bi,br
// correlation VE,E,P,L,VL (5x vector scalar product) // correlation VE,E,P,L,VL (5x vector scalar product)
// VE // VE
y = _mm_loadu_ps((float*)_VE_code); // Load the cr + ci, dr + di as cr,ci,dr,di y = _mm_loadu_ps((float*)_VE_code); // Load the cr + ci, dr + di as cr,ci,dr,di
yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
z_VE = _mm_add_ps(z_VE, z); // Add the complex multiplication results together z_VE = _mm_add_ps(z_VE, z); // Add the complex multiplication results together
// Early // Early
y = _mm_loadu_ps((float*)_E_code); // Load the cr + ci, dr + di as cr,ci,dr,di y = _mm_loadu_ps((float*)_E_code); // Load the cr + ci, dr + di as cr,ci,dr,di
yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
z_E = _mm_add_ps(z_E, z); // Add the complex multiplication results together z_E = _mm_add_ps(z_E, z); // Add the complex multiplication results together
// Prompt // Prompt
y = _mm_loadu_ps((float*)_P_code); // Load the cr + ci, dr + di as cr,ci,dr,di y = _mm_loadu_ps((float*)_P_code); // Load the cr + ci, dr + di as cr,ci,dr,di
yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
z_P = _mm_add_ps(z_P, z); // Add the complex multiplication results together z_P = _mm_add_ps(z_P, z); // Add the complex multiplication results together
// Late // Late
y = _mm_loadu_ps((float*)_L_code); // Load the cr + ci, dr + di as cr,ci,dr,di y = _mm_loadu_ps((float*)_L_code); // Load the cr + ci, dr + di as cr,ci,dr,di
yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
z_L = _mm_add_ps(z_L, z); // Add the complex multiplication results together z_L = _mm_add_ps(z_L, z); // Add the complex multiplication results together
// VL // VL
//x = _mm_load_ps((float*)_input_BB); // Load the ar + ai, br + bi as ar,ai,br,bi //x = _mm_load_ps((float*)_input_BB); // Load the ar + ai, br + bi as ar,ai,br,bi
y = _mm_loadu_ps((float*)_VL_code); // Load the cr + ci, dr + di as cr,ci,dr,di y = _mm_loadu_ps((float*)_VL_code); // Load the cr + ci, dr + di as cr,ci,dr,di
yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
z_VL = _mm_add_ps(z_VL, z); // Add the complex multiplication results together z_VL = _mm_add_ps(z_VL, z); // Add the complex multiplication results together
/*pointer increment*/ /*pointer increment*/
_carrier += 2; _carrier += 2;
_input += 2; _input += 2;
_VE_code += 2; _VE_code += 2;
_E_code += 2; _E_code += 2;
_P_code += 2; _P_code += 2;
_L_code +=2; _L_code +=2;
_VL_code +=2; _VL_code +=2;
} }
__VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_VE[2]; __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_VE[2];
__VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_E[2]; __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_E[2];
@ -386,13 +386,13 @@ static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_u_sse3(lv_32fc_t* V
dotProduct_VL = ( dotProductVector_VL[0] + dotProductVector_VL[1] ); dotProduct_VL = ( dotProductVector_VL[0] + dotProductVector_VL[1] );
if((num_points % 2) != 0) if((num_points % 2) != 0)
{ {
dotProduct_VE += (*_input) * (*_VE_code)*(*_carrier); dotProduct_VE += (*_input) * (*_VE_code)*(*_carrier);
dotProduct_E += (*_input) * (*_E_code)*(*_carrier); dotProduct_E += (*_input) * (*_E_code)*(*_carrier);
dotProduct_P += (*_input) * (*_P_code)*(*_carrier); dotProduct_P += (*_input) * (*_P_code)*(*_carrier);
dotProduct_L += (*_input) * (*_L_code)*(*_carrier); dotProduct_L += (*_input) * (*_L_code)*(*_carrier);
dotProduct_VL += (*_input) * (*_VL_code)*(*_carrier); dotProduct_VL += (*_input) * (*_VL_code)*(*_carrier);
} }
*VE_out = dotProduct_VE; *VE_out = dotProduct_VE;
*E_out = dotProduct_E; *E_out = dotProduct_E;
@ -432,16 +432,16 @@ static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_generic(lv_32fc_t*
*VL_out = 0; *VL_out = 0;
// perform Early, Prompt and Late correlation // perform Early, Prompt and Late correlation
for(unsigned int i=0; i < num_points; ++i) for(unsigned int i=0; i < num_points; ++i)
{ {
//Perform the carrier wipe-off //Perform the carrier wipe-off
bb_signal_sample = input[i] * carrier[i]; bb_signal_sample = input[i] * carrier[i];
// Now get early, late, and prompt values for each // Now get early, late, and prompt values for each
*VE_out += bb_signal_sample * VE_code[i]; *VE_out += bb_signal_sample * VE_code[i];
*E_out += bb_signal_sample * E_code[i]; *E_out += bb_signal_sample * E_code[i];
*P_out += bb_signal_sample * P_code[i]; *P_out += bb_signal_sample * P_code[i];
*L_out += bb_signal_sample * L_code[i]; *L_out += bb_signal_sample * L_code[i];
*VL_out += bb_signal_sample * VL_code[i]; *VL_out += bb_signal_sample * VL_code[i];
} }
} }
#endif /* LV_HAVE_GENERIC */ #endif /* LV_HAVE_GENERIC */
@ -478,7 +478,7 @@ static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_generic(lv_32fc_t*
*/ */
static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_a_avx(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* VE_code, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, const lv_32fc_t* VL_code, unsigned int num_points) static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_a_avx(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* VE_code, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, const lv_32fc_t* VL_code, unsigned int num_points)
{ {
unsigned int number = 0; unsigned int number = 0;
const unsigned int halfPoints = num_points / 4; const unsigned int halfPoints = num_points / 4;
lv_32fc_t dotProduct_VE; lv_32fc_t dotProduct_VE;
@ -507,93 +507,93 @@ static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_a_avx(lv_32fc_t* VE
const lv_32fc_t* _VL_code = VL_code; const lv_32fc_t* _VL_code = VL_code;
for(;number < halfPoints; number++) for(;number < halfPoints; number++)
{ {
// carrier wipe-off (vector point-to-point product) // carrier wipe-off (vector point-to-point product)
x = _mm256_load_ps((float*)_input); // Load the ar + ai, br + bi as ar,ai,br,bi x = _mm256_load_ps((float*)_input); // Load the ar + ai, br + bi as ar,ai,br,bi
y = _mm256_load_ps((float*)_carrier); // Load the cr + ci, dr + di as cr,ci,dr,di y = _mm256_load_ps((float*)_carrier); // Load the cr + ci, dr + di as cr,ci,dr,di
yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr
yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di
tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
bb_signal_sample = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di bb_signal_sample = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
bb_signal_sample_shuffled = _mm256_shuffle_ps(bb_signal_sample,bb_signal_sample,0xB1); // Re-arrange bb_signal_sample to be ai,ar,bi,br bb_signal_sample_shuffled = _mm256_shuffle_ps(bb_signal_sample,bb_signal_sample,0xB1); // Re-arrange bb_signal_sample to be ai,ar,bi,br
// correlation VE,E,P,L,VL (5x vector scalar product) // correlation VE,E,P,L,VL (5x vector scalar product)
// VE // VE
y = _mm256_load_ps((float*)_VE_code); // Load the cr + ci, dr + di as cr,ci,dr,di y = _mm256_load_ps((float*)_VE_code); // Load the cr + ci, dr + di as cr,ci,dr,di
yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr
yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di
tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
z_VE = _mm256_add_ps(z_VE, z); // Add the complex multiplication results together z_VE = _mm256_add_ps(z_VE, z); // Add the complex multiplication results together
// Early // Early
y = _mm256_load_ps((float*)_E_code); // Load the cr + ci, dr + di as cr,ci,dr,di y = _mm256_load_ps((float*)_E_code); // Load the cr + ci, dr + di as cr,ci,dr,di
yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr
yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di
tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
z_E = _mm256_add_ps(z_E, z); // Add the complex multiplication results together z_E = _mm256_add_ps(z_E, z); // Add the complex multiplication results together
// Prompt // Prompt
y = _mm256_load_ps((float*)_P_code); // Load the cr + ci, dr + di as cr,ci,dr,di y = _mm256_load_ps((float*)_P_code); // Load the cr + ci, dr + di as cr,ci,dr,di
yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr
yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di
tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
z_P = _mm256_add_ps(z_P, z); // Add the complex multiplication results together z_P = _mm256_add_ps(z_P, z); // Add the complex multiplication results together
// Late // Late
y = _mm256_load_ps((float*)_L_code); // Load the cr + ci, dr + di as cr,ci,dr,di y = _mm256_load_ps((float*)_L_code); // Load the cr + ci, dr + di as cr,ci,dr,di
yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr
yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di
tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
z_L = _mm256_add_ps(z_L, z); // Add the complex multiplication results together z_L = _mm256_add_ps(z_L, z); // Add the complex multiplication results together
// VL // VL
y = _mm256_load_ps((float*)_VL_code); // Load the cr + ci, dr + di as cr,ci,dr,di y = _mm256_load_ps((float*)_VL_code); // Load the cr + ci, dr + di as cr,ci,dr,di
yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr
yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di
tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
z_VL = _mm256_add_ps(z_VL, z); // Add the complex multiplication results together z_VL = _mm256_add_ps(z_VL, z); // Add the complex multiplication results together
/*pointer increment*/ /*pointer increment*/
_carrier += 4; _carrier += 4;
_input += 4; _input += 4;
_VE_code += 4; _VE_code += 4;
_E_code += 4; _E_code += 4;
_P_code += 4; _P_code += 4;
_L_code += 4; _L_code += 4;
_VL_code += 4; _VL_code += 4;
} }
__VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_VE[4]; __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_VE[4];
__VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_E[4]; __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_E[4];
@ -614,13 +614,13 @@ static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_a_avx(lv_32fc_t* VE
dotProduct_VL = ( dotProductVector_VL[0] + dotProductVector_VL[1] + dotProductVector_VL[2] + dotProductVector_VL[3] ); dotProduct_VL = ( dotProductVector_VL[0] + dotProductVector_VL[1] + dotProductVector_VL[2] + dotProductVector_VL[3] );
for (unsigned int i = 0; i<(num_points % 4); ++i) for (unsigned int i = 0; i<(num_points % 4); ++i)
{ {
dotProduct_VE += (*_input) * (*_VE_code++) * (*_carrier); dotProduct_VE += (*_input) * (*_VE_code++) * (*_carrier);
dotProduct_E += (*_input) * (*_E_code++) * (*_carrier); dotProduct_E += (*_input) * (*_E_code++) * (*_carrier);
dotProduct_P += (*_input) * (*_P_code++) * (*_carrier); dotProduct_P += (*_input) * (*_P_code++) * (*_carrier);
dotProduct_L += (*_input) * (*_L_code++) * (*_carrier); dotProduct_L += (*_input) * (*_L_code++) * (*_carrier);
dotProduct_VL += (*_input++) * (*_VL_code++) * (*_carrier++); dotProduct_VL += (*_input++) * (*_VL_code++) * (*_carrier++);
} }
*VE_out = dotProduct_VE; *VE_out = dotProduct_VE;
*E_out = dotProduct_E; *E_out = dotProduct_E;
@ -650,7 +650,7 @@ static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_a_avx(lv_32fc_t* VE
*/ */
static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_a_sse3(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* VE_code, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, const lv_32fc_t* VL_code, unsigned int num_points) static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_a_sse3(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* VE_code, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, const lv_32fc_t* VL_code, unsigned int num_points)
{ {
unsigned int number = 0; unsigned int number = 0;
const unsigned int halfPoints = num_points / 2; const unsigned int halfPoints = num_points / 2;
lv_32fc_t dotProduct_VE; lv_32fc_t dotProduct_VE;
@ -679,94 +679,94 @@ static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_a_sse3(lv_32fc_t* V
const lv_32fc_t* _VL_code = VL_code; const lv_32fc_t* _VL_code = VL_code;
for(;number < halfPoints; number++) for(;number < halfPoints; number++)
{ {
// carrier wipe-off (vector point-to-point product) // carrier wipe-off (vector point-to-point product)
x = _mm_load_ps((float*)_input); // Load the ar + ai, br + bi as ar,ai,br,bi x = _mm_load_ps((float*)_input); // Load the ar + ai, br + bi as ar,ai,br,bi
y = _mm_load_ps((float*)_carrier); // Load the cr + ci, dr + di as cr,ci,dr,di y = _mm_load_ps((float*)_carrier); // Load the cr + ci, dr + di as cr,ci,dr,di
yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
bb_signal_sample = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di bb_signal_sample = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
bb_signal_sample_shuffled = _mm_shuffle_ps(bb_signal_sample,bb_signal_sample,0xB1); // Re-arrange bb_signal_sample to be ai,ar,bi,br bb_signal_sample_shuffled = _mm_shuffle_ps(bb_signal_sample,bb_signal_sample,0xB1); // Re-arrange bb_signal_sample to be ai,ar,bi,br
// correlation VE,E,P,L,VL (5x vector scalar product) // correlation VE,E,P,L,VL (5x vector scalar product)
// VE // VE
y = _mm_load_ps((float*)_VE_code); // Load the cr + ci, dr + di as cr,ci,dr,di y = _mm_load_ps((float*)_VE_code); // Load the cr + ci, dr + di as cr,ci,dr,di
yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
z_VE = _mm_add_ps(z_VE, z); // Add the complex multiplication results together z_VE = _mm_add_ps(z_VE, z); // Add the complex multiplication results together
// Early // Early
y = _mm_load_ps((float*)_E_code); // Load the cr + ci, dr + di as cr,ci,dr,di y = _mm_load_ps((float*)_E_code); // Load the cr + ci, dr + di as cr,ci,dr,di
yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
z_E = _mm_add_ps(z_E, z); // Add the complex multiplication results together z_E = _mm_add_ps(z_E, z); // Add the complex multiplication results together
// Prompt // Prompt
y = _mm_load_ps((float*)_P_code); // Load the cr + ci, dr + di as cr,ci,dr,di y = _mm_load_ps((float*)_P_code); // Load the cr + ci, dr + di as cr,ci,dr,di
yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
z_P = _mm_add_ps(z_P, z); // Add the complex multiplication results together z_P = _mm_add_ps(z_P, z); // Add the complex multiplication results together
// Late // Late
y = _mm_load_ps((float*)_L_code); // Load the cr + ci, dr + di as cr,ci,dr,di y = _mm_load_ps((float*)_L_code); // Load the cr + ci, dr + di as cr,ci,dr,di
yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
z_L = _mm_add_ps(z_L, z); // Add the complex multiplication results together z_L = _mm_add_ps(z_L, z); // Add the complex multiplication results together
// VL // VL
//x = _mm_load_ps((float*)_input_BB); // Load the ar + ai, br + bi as ar,ai,br,bi //x = _mm_load_ps((float*)_input_BB); // Load the ar + ai, br + bi as ar,ai,br,bi
y = _mm_load_ps((float*)_VL_code); // Load the cr + ci, dr + di as cr,ci,dr,di y = _mm_load_ps((float*)_VL_code); // Load the cr + ci, dr + di as cr,ci,dr,di
yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
z_VL = _mm_add_ps(z_VL, z); // Add the complex multiplication results together z_VL = _mm_add_ps(z_VL, z); // Add the complex multiplication results together
/*pointer increment*/ /*pointer increment*/
_carrier += 2; _carrier += 2;
_input += 2; _input += 2;
_VE_code += 2; _VE_code += 2;
_E_code += 2; _E_code += 2;
_P_code += 2; _P_code += 2;
_L_code +=2; _L_code +=2;
_VL_code +=2; _VL_code +=2;
} }
__VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_VE[2]; __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_VE[2];
__VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_E[2]; __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_E[2];
@ -787,13 +787,13 @@ static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_a_sse3(lv_32fc_t* V
dotProduct_VL = ( dotProductVector_VL[0] + dotProductVector_VL[1] ); dotProduct_VL = ( dotProductVector_VL[0] + dotProductVector_VL[1] );
if((num_points % 2) != 0) if((num_points % 2) != 0)
{ {
dotProduct_VE += (*_input) * (*_VE_code)*(*_carrier); dotProduct_VE += (*_input) * (*_VE_code)*(*_carrier);
dotProduct_E += (*_input) * (*_E_code)*(*_carrier); dotProduct_E += (*_input) * (*_E_code)*(*_carrier);
dotProduct_P += (*_input) * (*_P_code)*(*_carrier); dotProduct_P += (*_input) * (*_P_code)*(*_carrier);
dotProduct_L += (*_input) * (*_L_code)*(*_carrier); dotProduct_L += (*_input) * (*_L_code)*(*_carrier);
dotProduct_VL += (*_input) * (*_VL_code)*(*_carrier); dotProduct_VL += (*_input) * (*_VL_code)*(*_carrier);
} }
*VE_out = dotProduct_VE; *VE_out = dotProduct_VE;
*E_out = dotProduct_E; *E_out = dotProduct_E;
@ -833,16 +833,16 @@ static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_a_generic(lv_32fc_t
*VL_out = 0; *VL_out = 0;
// perform Early, Prompt and Late correlation // perform Early, Prompt and Late correlation
for(unsigned int i=0; i < num_points; ++i) for(unsigned int i=0; i < num_points; ++i)
{ {
//Perform the carrier wipe-off //Perform the carrier wipe-off
bb_signal_sample = input[i] * carrier[i]; bb_signal_sample = input[i] * carrier[i];
// Now get early, late, and prompt values for each // Now get early, late, and prompt values for each
*VE_out += bb_signal_sample * VE_code[i]; *VE_out += bb_signal_sample * VE_code[i];
*E_out += bb_signal_sample * E_code[i]; *E_out += bb_signal_sample * E_code[i];
*P_out += bb_signal_sample * P_code[i]; *P_out += bb_signal_sample * P_code[i];
*L_out += bb_signal_sample * L_code[i]; *L_out += bb_signal_sample * L_code[i];
*VL_out += bb_signal_sample * VL_code[i]; *VL_out += bb_signal_sample * VL_code[i];
} }
} }
#endif /* LV_HAVE_GENERIC */ #endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_gnsssdr_volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_a_H */ #endif /* INCLUDED_gnsssdr_volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_a_H */

View File

@ -2,7 +2,7 @@
* \file volk_gnsssdr_64f_accumulator_64f.h * \file volk_gnsssdr_64f_accumulator_64f.h
* \brief Volk protokernel: 64 bits (double) scalar accumulator * \brief Volk protokernel: 64 bits (double) scalar accumulator
* \authors <ul> * \authors <ul>
* <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com * <li> Andres Cecilia, 2014. a.cecilia.luque(at)gmail.com
* </ul> * </ul>
* *
* Volk protokernel that implements an accumulator of char values * Volk protokernel that implements an accumulator of char values
@ -19,7 +19,7 @@
* GNSS-SDR is free software: you can redistribute it and/or modify * GNSS-SDR is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by * it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or * the Free Software Foundation, either version 3 of the License, or
* at your option) any later version. * (at your option) any later version.
* *
* GNSS-SDR is distributed in the hope that it will be useful, * GNSS-SDR is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of * but WITHOUT ANY WARRANTY; without even the implied warranty of

View File

@ -2,7 +2,7 @@
* \file volk_gnsssdr_8i_accumulator_s8i.h * \file volk_gnsssdr_8i_accumulator_s8i.h
* \brief Volk protokernel: 8 bits (char) scalar accumulator * \brief Volk protokernel: 8 bits (char) scalar accumulator
* \authors <ul> * \authors <ul>
* <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com * <li> Andres Cecilia, 2014. a.cecilia.luque(at)gmail.com
* </ul> * </ul>
* *
* Volk protokernel that implements an accumulator of char values * Volk protokernel that implements an accumulator of char values
@ -19,7 +19,7 @@
* GNSS-SDR is free software: you can redistribute it and/or modify * GNSS-SDR is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by * it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or * the Free Software Foundation, either version 3 of the License, or
* at your option) any later version. * (at your option) any later version.
* *
* GNSS-SDR is distributed in the hope that it will be useful, * GNSS-SDR is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of * but WITHOUT ANY WARRANTY; without even the implied warranty of
@ -47,7 +47,8 @@
\param inputBuffer The buffer of data to be accumulated \param inputBuffer The buffer of data to be accumulated
\param num_points The number of values in inputBuffer to be accumulated \param num_points The number of values in inputBuffer to be accumulated
*/ */
static inline void volk_gnsssdr_8i_accumulator_s8i_u_sse3(char* result, const char* inputBuffer, unsigned int num_points){ static inline void volk_gnsssdr_8i_accumulator_s8i_u_sse3(char* result, const char* inputBuffer, unsigned int num_points)
{
char returnValue = 0; char returnValue = 0;
const unsigned int sse_iters = num_points / 16; const unsigned int sse_iters = num_points / 16;
@ -57,20 +58,23 @@ static inline void volk_gnsssdr_8i_accumulator_s8i_u_sse3(char* result, const ch
__m128i accumulator = _mm_setzero_si128(); __m128i accumulator = _mm_setzero_si128();
__m128i aVal = _mm_setzero_si128(); __m128i aVal = _mm_setzero_si128();
for(unsigned int number = 0; number < sse_iters; number++){ for(unsigned int number = 0; number < sse_iters; number++)
aVal = _mm_lddqu_si128((__m128i*)aPtr); {
accumulator = _mm_add_epi8(accumulator, aVal); aVal = _mm_lddqu_si128((__m128i*)aPtr);
aPtr += 16; accumulator = _mm_add_epi8(accumulator, aVal);
} aPtr += 16;
}
_mm_storeu_si128((__m128i*)tempBuffer,accumulator); _mm_storeu_si128((__m128i*)tempBuffer,accumulator);
for(unsigned int i = 0; i<16; ++i){ for(unsigned int i = 0; i<16; ++i)
returnValue += tempBuffer[i]; {
} returnValue += tempBuffer[i];
}
for(unsigned int i = 0; i<(num_points % 16); ++i){ for(unsigned int i = 0; i<(num_points % 16); ++i)
returnValue += (*aPtr++); {
} returnValue += (*aPtr++);
}
*result = returnValue; *result = returnValue;
} }
@ -83,13 +87,15 @@ static inline void volk_gnsssdr_8i_accumulator_s8i_u_sse3(char* result, const ch
\param inputBuffer The buffer of data to be accumulated \param inputBuffer The buffer of data to be accumulated
\param num_points The number of values in inputBuffer to be accumulated \param num_points The number of values in inputBuffer to be accumulated
*/ */
static inline void volk_gnsssdr_8i_accumulator_s8i_generic(char* result, const char* inputBuffer, unsigned int num_points){ static inline void volk_gnsssdr_8i_accumulator_s8i_generic(char* result, const char* inputBuffer, unsigned int num_points)
{
const char* aPtr = inputBuffer; const char* aPtr = inputBuffer;
char returnValue = 0; char returnValue = 0;
for(unsigned int number = 0;number < num_points; number++){ for(unsigned int number = 0;number < num_points; number++)
returnValue += (*aPtr++); {
} returnValue += (*aPtr++);
}
*result = returnValue; *result = returnValue;
} }
#endif /* LV_HAVE_GENERIC */ #endif /* LV_HAVE_GENERIC */
@ -112,7 +118,8 @@ static inline void volk_gnsssdr_8i_accumulator_s8i_generic(char* result, const c
\param inputBuffer The buffer of data to be accumulated \param inputBuffer The buffer of data to be accumulated
\param num_points The number of values in inputBuffer to be accumulated \param num_points The number of values in inputBuffer to be accumulated
*/ */
static inline void volk_gnsssdr_8i_accumulator_s8i_a_sse3(char* result, const char* inputBuffer, unsigned int num_points){ static inline void volk_gnsssdr_8i_accumulator_s8i_a_sse3(char* result, const char* inputBuffer, unsigned int num_points)
{
char returnValue = 0; char returnValue = 0;
const unsigned int sse_iters = num_points / 16; const unsigned int sse_iters = num_points / 16;
@ -122,20 +129,22 @@ static inline void volk_gnsssdr_8i_accumulator_s8i_a_sse3(char* result, const ch
__m128i accumulator = _mm_setzero_si128(); __m128i accumulator = _mm_setzero_si128();
__m128i aVal = _mm_setzero_si128(); __m128i aVal = _mm_setzero_si128();
for(unsigned int number = 0; number < sse_iters; number++){ for(unsigned int number = 0; number < sse_iters; number++)
aVal = _mm_load_si128((__m128i*)aPtr); {
accumulator = _mm_add_epi8(accumulator, aVal); aVal = _mm_load_si128((__m128i*)aPtr);
aPtr += 16; accumulator = _mm_add_epi8(accumulator, aVal);
} aPtr += 16;
}
_mm_store_si128((__m128i*)tempBuffer,accumulator); _mm_store_si128((__m128i*)tempBuffer,accumulator);
for(unsigned int i = 0; i<16; ++i){ for(unsigned int i = 0; i<16; ++i){
returnValue += tempBuffer[i]; returnValue += tempBuffer[i];
} }
for(unsigned int i = 0; i<(num_points % 16); ++i){ for(unsigned int i = 0; i<(num_points % 16); ++i)
returnValue += (*aPtr++); {
} returnValue += (*aPtr++);
}
*result = returnValue; *result = returnValue;
} }
@ -148,13 +157,15 @@ static inline void volk_gnsssdr_8i_accumulator_s8i_a_sse3(char* result, const ch
\param inputBuffer The buffer of data to be accumulated \param inputBuffer The buffer of data to be accumulated
\param num_points The number of values in inputBuffer to be accumulated \param num_points The number of values in inputBuffer to be accumulated
*/ */
static inline void volk_gnsssdr_8i_accumulator_s8i_a_generic(char* result, const char* inputBuffer, unsigned int num_points){ static inline void volk_gnsssdr_8i_accumulator_s8i_a_generic(char* result, const char* inputBuffer, unsigned int num_points)
{
const char* aPtr = inputBuffer; const char* aPtr = inputBuffer;
char returnValue = 0; char returnValue = 0;
for(unsigned int number = 0;number < num_points; number++){ for(unsigned int number = 0;number < num_points; number++)
returnValue += (*aPtr++); {
} returnValue += (*aPtr++);
}
*result = returnValue; *result = returnValue;
} }
#endif /* LV_HAVE_GENERIC */ #endif /* LV_HAVE_GENERIC */
@ -167,8 +178,9 @@ static inline void volk_gnsssdr_8i_accumulator_s8i_a_generic(char* result, const
\param num_points The number of values in inputBuffer to be accumulated \param num_points The number of values in inputBuffer to be accumulated
*/ */
extern void volk_gnsssdr_8i_accumulator_s8i_a_orc_impl(short* result, const char* inputBuffer, unsigned int num_points); extern void volk_gnsssdr_8i_accumulator_s8i_a_orc_impl(short* result, const char* inputBuffer, unsigned int num_points);
static inline void volk_gnsssdr_8i_accumulator_s8i_u_orc(char* result, const char* inputBuffer, unsigned int num_points){
static inline void volk_gnsssdr_8i_accumulator_s8i_u_orc(char* result, const char* inputBuffer, unsigned int num_points)
{
short res = 0; short res = 0;
char* resc = (char*)&res; char* resc = (char*)&res;
resc++; resc++;

View File

@ -2,7 +2,7 @@
* \file volk_gnsssdr_8i_index_max_16u.h * \file volk_gnsssdr_8i_index_max_16u.h
* \brief Volk protokernel: calculates the index of the maximum value in a group of 8 bits (char) scalars * \brief Volk protokernel: calculates the index of the maximum value in a group of 8 bits (char) scalars
* \authors <ul> * \authors <ul>
* <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com * <li> Andres Cecilia, 2014. a.cecilia.luque(at)gmail.com
* </ul> * </ul>
* *
* Volk protokernel that returns the index of the maximum value of a group of 8 bits (char) scalars * Volk protokernel that returns the index of the maximum value of a group of 8 bits (char) scalars
@ -19,7 +19,7 @@
* GNSS-SDR is free software: you can redistribute it and/or modify * GNSS-SDR is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by * it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or * the Free Software Foundation, either version 3 of the License, or
* at your option) any later version. * (at your option) any later version.
* *
* GNSS-SDR is distributed in the hope that it will be useful, * GNSS-SDR is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of * but WITHOUT ANY WARRANTY; without even the implied warranty of

View File

@ -2,7 +2,7 @@
* \file volk_gnsssdr_8i_max_s8i.h * \file volk_gnsssdr_8i_max_s8i.h
* \brief Volk protokernel: calculates the maximum value in a group of 8 bits (char) scalars * \brief Volk protokernel: calculates the maximum value in a group of 8 bits (char) scalars
* \authors <ul> * \authors <ul>
* <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com * <li> Andres Cecilia, 2014. a.cecilia.luque(at)gmail.com
* </ul> * </ul>
* *
* Volk protokernel that returns the maximum value of a group of 8 bits (char) scalars * Volk protokernel that returns the maximum value of a group of 8 bits (char) scalars
@ -19,7 +19,7 @@
* GNSS-SDR is free software: you can redistribute it and/or modify * GNSS-SDR is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by * it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or * the Free Software Foundation, either version 3 of the License, or
* at your option) any later version. * (at your option) any later version.
* *
* GNSS-SDR is distributed in the hope that it will be useful, * GNSS-SDR is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of * but WITHOUT ANY WARRANTY; without even the implied warranty of
@ -47,44 +47,46 @@
\param src0 The buffer of data to be analysed \param src0 The buffer of data to be analysed
\param num_points The number of values in src0 to be analysed \param num_points The number of values in src0 to be analysed
*/ */
static inline void volk_gnsssdr_8i_max_s8i_u_sse4_1(char target, const char* src0, unsigned int num_points) { static inline void volk_gnsssdr_8i_max_s8i_u_sse4_1(char target, const char* src0, unsigned int num_points)
if(num_points > 0){ {
const unsigned int sse_iters = num_points / 16; if(num_points > 0)
char* inputPtr = (char*)src0;
char max = src0[0];
__VOLK_ATTR_ALIGNED(16) char maxValuesBuffer[16];
__m128i maxValues, compareResults, currentValues;
maxValues = _mm_set1_epi8(max);
for(unsigned int number = 0; number < sse_iters; number++)
{ {
currentValues = _mm_loadu_si128((__m128i*)inputPtr); const unsigned int sse_iters = num_points / 16;
compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
maxValues = _mm_blendv_epi8(currentValues, maxValues, compareResults);
inputPtr += 16;
}
_mm_storeu_si128((__m128i*)maxValuesBuffer, maxValues); char* inputPtr = (char*)src0;
char max = src0[0];
__VOLK_ATTR_ALIGNED(16) char maxValuesBuffer[16];
__m128i maxValues, compareResults, currentValues;
for(unsigned int i = 0; i<16; ++i) maxValues = _mm_set1_epi8(max);
{
if(maxValuesBuffer[i] > max)
{
max = maxValuesBuffer[i];
}
}
for(unsigned int i = 0; i<(num_points % 16); ++i) for(unsigned int number = 0; number < sse_iters; number++)
{ {
if(src0[i] > max) currentValues = _mm_loadu_si128((__m128i*)inputPtr);
{ compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
max = src0[i]; maxValues = _mm_blendv_epi8(currentValues, maxValues, compareResults);
} inputPtr += 16;
}
_mm_storeu_si128((__m128i*)maxValuesBuffer, maxValues);
for(unsigned int i = 0; i<16; ++i)
{
if(maxValuesBuffer[i] > max)
{
max = maxValuesBuffer[i];
}
}
for(unsigned int i = 0; i<(num_points % 16); ++i)
{
if(src0[i] > max)
{
max = src0[i];
}
}
target = max;
} }
target = max;
}
} }
#endif /*LV_HAVE_SSE4_1*/ #endif /*LV_HAVE_SSE4_1*/
@ -97,55 +99,57 @@ static inline void volk_gnsssdr_8i_max_s8i_u_sse4_1(char target, const char* src
\param src0 The buffer of data to be analysed \param src0 The buffer of data to be analysed
\param num_points The number of values in src0 to be analysed \param num_points The number of values in src0 to be analysed
*/ */
static inline void volk_gnsssdr_8i_max_s8i_u_sse2(char target, const char* src0, unsigned int num_points) { static inline void volk_gnsssdr_8i_max_s8i_u_sse2(char target, const char* src0, unsigned int num_points)
if(num_points > 0){ {
const unsigned int sse_iters = num_points / 16; if(num_points > 0)
char* inputPtr = (char*)src0;
char max = src0[0];
unsigned short mask;
__VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16];
__m128i maxValues, compareResults, currentValues;
maxValues = _mm_set1_epi8(max);
for(unsigned int number = 0; number < sse_iters; number++)
{ {
currentValues = _mm_loadu_si128((__m128i*)inputPtr); const unsigned int sse_iters = num_points / 16;
compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
mask = _mm_movemask_epi8(compareResults);
if (mask != 0xFFFF) char* inputPtr = (char*)src0;
{ char max = src0[0];
_mm_storeu_si128((__m128i*)&currentValuesBuffer, currentValues); unsigned short mask;
mask = ~mask; __VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16];
int i = 0; __m128i maxValues, compareResults, currentValues;
while (mask > 0)
maxValues = _mm_set1_epi8(max);
for(unsigned int number = 0; number < sse_iters; number++)
{ {
if ((mask & 1) == 1) currentValues = _mm_loadu_si128((__m128i*)inputPtr);
{ compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
if(currentValuesBuffer[i] > max) mask = _mm_movemask_epi8(compareResults);
{
max = currentValuesBuffer[i];
}
}
i++;
mask >>= 1;
}
maxValues = _mm_set1_epi8(max);
}
inputPtr += 16;
}
for(unsigned int i = 0; i<(num_points % 16); ++i) if (mask != 0xFFFF)
{ {
if(src0[i] > max) _mm_storeu_si128((__m128i*)&currentValuesBuffer, currentValues);
{ mask = ~mask;
max = src0[i]; int i = 0;
} while (mask > 0)
{
if ((mask & 1) == 1)
{
if(currentValuesBuffer[i] > max)
{
max = currentValuesBuffer[i];
}
}
i++;
mask >>= 1;
}
maxValues = _mm_set1_epi8(max);
}
inputPtr += 16;
}
for(unsigned int i = 0; i<(num_points % 16); ++i)
{
if(src0[i] > max)
{
max = src0[i];
}
}
target = max;
} }
target = max;
}
} }
#endif /*LV_HAVE_SSE2*/ #endif /*LV_HAVE_SSE2*/
@ -157,20 +161,21 @@ static inline void volk_gnsssdr_8i_max_s8i_u_sse2(char target, const char* src0,
\param src0 The buffer of data to be analysed \param src0 The buffer of data to be analysed
\param num_points The number of values in src0 to be analysed \param num_points The number of values in src0 to be analysed
*/ */
static inline void volk_gnsssdr_8i_max_s8i_generic(char target, const char* src0, unsigned int num_points) { static inline void volk_gnsssdr_8i_max_s8i_generic(char target, const char* src0, unsigned int num_points)
{
if(num_points > 0) if(num_points > 0)
{
char max = src0[0];
for(unsigned int i = 1; i < num_points; ++i)
{ {
if(src0[i] > max) char max = src0[0];
{
max = src0[i]; for(unsigned int i = 1; i < num_points; ++i)
} {
if(src0[i] > max)
{
max = src0[i];
}
}
target = max;
} }
target = max;
}
} }
#endif /*LV_HAVE_GENERIC*/ #endif /*LV_HAVE_GENERIC*/
@ -193,44 +198,46 @@ static inline void volk_gnsssdr_8i_max_s8i_generic(char target, const char* src0
\param src0 The buffer of data to be analysed \param src0 The buffer of data to be analysed
\param num_points The number of values in src0 to be analysed \param num_points The number of values in src0 to be analysed
*/ */
static inline void volk_gnsssdr_8i_max_s8i_a_sse4_1(char target, const char* src0, unsigned int num_points) { static inline void volk_gnsssdr_8i_max_s8i_a_sse4_1(char target, const char* src0, unsigned int num_points)
if(num_points > 0){ {
const unsigned int sse_iters = num_points / 16; if(num_points > 0)
char* inputPtr = (char*)src0;
char max = src0[0];
__VOLK_ATTR_ALIGNED(16) char maxValuesBuffer[16];
__m128i maxValues, compareResults, currentValues;
maxValues = _mm_set1_epi8(max);
for(unsigned int number = 0; number < sse_iters; number++)
{ {
currentValues = _mm_load_si128((__m128i*)inputPtr); const unsigned int sse_iters = num_points / 16;
compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
maxValues = _mm_blendv_epi8(currentValues, maxValues, compareResults);
inputPtr += 16;
}
_mm_store_si128((__m128i*)maxValuesBuffer, maxValues); char* inputPtr = (char*)src0;
char max = src0[0];
__VOLK_ATTR_ALIGNED(16) char maxValuesBuffer[16];
__m128i maxValues, compareResults, currentValues;
for(unsigned int i = 0; i<16; ++i) maxValues = _mm_set1_epi8(max);
{
if(maxValuesBuffer[i] > max)
{
max = maxValuesBuffer[i];
}
}
for(unsigned int i = 0; i<(num_points % 16); ++i) for(unsigned int number = 0; number < sse_iters; number++)
{ {
if(src0[i] > max) currentValues = _mm_load_si128((__m128i*)inputPtr);
{ compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
max = src0[i]; maxValues = _mm_blendv_epi8(currentValues, maxValues, compareResults);
} inputPtr += 16;
}
_mm_store_si128((__m128i*)maxValuesBuffer, maxValues);
for(unsigned int i = 0; i<16; ++i)
{
if(maxValuesBuffer[i] > max)
{
max = maxValuesBuffer[i];
}
}
for(unsigned int i = 0; i<(num_points % 16); ++i)
{
if(src0[i] > max)
{
max = src0[i];
}
}
target = max;
} }
target = max;
}
} }
#endif /*LV_HAVE_SSE4_1*/ #endif /*LV_HAVE_SSE4_1*/
@ -243,55 +250,57 @@ static inline void volk_gnsssdr_8i_max_s8i_a_sse4_1(char target, const char* src
\param src0 The buffer of data to be analysed \param src0 The buffer of data to be analysed
\param num_points The number of values in src0 to be analysed \param num_points The number of values in src0 to be analysed
*/ */
static inline void volk_gnsssdr_8i_max_s8i_a_sse2(char target, const char* src0, unsigned int num_points) { static inline void volk_gnsssdr_8i_max_s8i_a_sse2(char target, const char* src0, unsigned int num_points)
if(num_points > 0){ {
const unsigned int sse_iters = num_points / 16; if(num_points > 0)
char* inputPtr = (char*)src0;
char max = src0[0];
unsigned short mask;
__VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16];
__m128i maxValues, compareResults, currentValues;
maxValues = _mm_set1_epi8(max);
for(unsigned int number = 0; number < sse_iters; number++)
{ {
currentValues = _mm_load_si128((__m128i*)inputPtr); const unsigned int sse_iters = num_points / 16;
compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
mask = _mm_movemask_epi8(compareResults);
if (mask != 0xFFFF) char* inputPtr = (char*)src0;
{ char max = src0[0];
_mm_store_si128((__m128i*)&currentValuesBuffer, currentValues); unsigned short mask;
mask = ~mask; __VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16];
int i = 0; __m128i maxValues, compareResults, currentValues;
while (mask > 0)
maxValues = _mm_set1_epi8(max);
for(unsigned int number = 0; number < sse_iters; number++)
{ {
if ((mask & 1) == 1) currentValues = _mm_load_si128((__m128i*)inputPtr);
{ compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
if(currentValuesBuffer[i] > max) mask = _mm_movemask_epi8(compareResults);
{
max = currentValuesBuffer[i];
}
}
i++;
mask >>= 1;
}
maxValues = _mm_set1_epi8(max);
}
inputPtr += 16;
}
for(unsigned int i = 0; i<(num_points % 16); ++i) if (mask != 0xFFFF)
{ {
if(src0[i] > max) _mm_store_si128((__m128i*)&currentValuesBuffer, currentValues);
{ mask = ~mask;
max = src0[i]; int i = 0;
} while (mask > 0)
{
if ((mask & 1) == 1)
{
if(currentValuesBuffer[i] > max)
{
max = currentValuesBuffer[i];
}
}
i++;
mask >>= 1;
}
maxValues = _mm_set1_epi8(max);
}
inputPtr += 16;
}
for(unsigned int i = 0; i<(num_points % 16); ++i)
{
if(src0[i] > max)
{
max = src0[i];
}
}
target = max;
} }
target = max;
}
} }
#endif /*LV_HAVE_SSE2*/ #endif /*LV_HAVE_SSE2*/
@ -303,23 +312,23 @@ static inline void volk_gnsssdr_8i_max_s8i_a_sse2(char target, const char* src0,
\param src0 The buffer of data to be analysed \param src0 The buffer of data to be analysed
\param num_points The number of values in src0 to be analysed \param num_points The number of values in src0 to be analysed
*/ */
static inline void volk_gnsssdr_8i_max_s8i_a_generic(char target, const char* src0, unsigned int num_points) { static inline void volk_gnsssdr_8i_max_s8i_a_generic(char target, const char* src0, unsigned int num_points)
{
if(num_points > 0) if(num_points > 0)
{
if(num_points > 0)
{ {
char max = src0[0]; if(num_points > 0)
for(unsigned int i = 1; i < num_points; ++i)
{
if(src0[i] > max)
{ {
max = src0[i]; char max = src0[0];
for(unsigned int i = 1; i < num_points; ++i)
{
if(src0[i] > max)
{
max = src0[i];
}
}
target = max;
} }
}
target = max;
} }
}
} }
#endif /*LV_HAVE_GENERIC*/ #endif /*LV_HAVE_GENERIC*/

View File

@ -2,7 +2,7 @@
* \file volk_gnsssdr_8i_x2_add_8i.h * \file volk_gnsssdr_8i_x2_add_8i.h
* \brief Volk protokernel: adds pairs of 8 bits (char) scalars * \brief Volk protokernel: adds pairs of 8 bits (char) scalars
* \authors <ul> * \authors <ul>
* <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com * <li> Andres Cecilia, 2014. a.cecilia.luque(at)gmail.com
* </ul> * </ul>
* *
* Volk protokernel that adds pairs of 8 bits (char) scalars * Volk protokernel that adds pairs of 8 bits (char) scalars
@ -19,7 +19,7 @@
* GNSS-SDR is free software: you can redistribute it and/or modify * GNSS-SDR is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by * it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or * the Free Software Foundation, either version 3 of the License, or
* at your option) any later version. * (at your option) any later version.
* *
* GNSS-SDR is distributed in the hope that it will be useful, * GNSS-SDR is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of * but WITHOUT ANY WARRANTY; without even the implied warranty of
@ -47,8 +47,8 @@
\param bVector One of the vectors to be added \param bVector One of the vectors to be added
\param num_points The number of values in aVector and bVector to be added together and stored into cVector \param num_points The number of values in aVector and bVector to be added together and stored into cVector
*/ */
static inline void volk_gnsssdr_8i_x2_add_8i_u_sse2(char* cVector, const char* aVector, const char* bVector, unsigned int num_points){ static inline void volk_gnsssdr_8i_x2_add_8i_u_sse2(char* cVector, const char* aVector, const char* bVector, unsigned int num_points)
{
const unsigned int sse_iters = num_points / 16; const unsigned int sse_iters = num_points / 16;
char* cPtr = cVector; char* cPtr = cVector;
@ -57,24 +57,24 @@ static inline void volk_gnsssdr_8i_x2_add_8i_u_sse2(char* cVector, const char* a
__m128i aVal, bVal, cVal; __m128i aVal, bVal, cVal;
for(unsigned int number = 0; number < sse_iters; number++){ for(unsigned int number = 0; number < sse_iters; number++)
{
aVal = _mm_loadu_si128((__m128i*)aPtr);
bVal = _mm_loadu_si128((__m128i*)bPtr);
aVal = _mm_loadu_si128((__m128i*)aPtr); cVal = _mm_add_epi8(aVal, bVal);
bVal = _mm_loadu_si128((__m128i*)bPtr);
cVal = _mm_add_epi8(aVal, bVal); _mm_storeu_si128((__m128i*)cPtr,cVal); // Store the results back into the C container
_mm_storeu_si128((__m128i*)cPtr,cVal); // Store the results back into the C container aPtr += 16;
bPtr += 16;
aPtr += 16; cPtr += 16;
bPtr += 16; }
cPtr += 16;
}
for(unsigned int i = 0; i<(num_points % 16); ++i) for(unsigned int i = 0; i<(num_points % 16); ++i)
{ {
*cPtr++ = (*aPtr++) + (*bPtr++); *cPtr++ = (*aPtr++) + (*bPtr++);
} }
} }
#endif /* LV_HAVE_SSE2 */ #endif /* LV_HAVE_SSE2 */
@ -86,15 +86,17 @@ static inline void volk_gnsssdr_8i_x2_add_8i_u_sse2(char* cVector, const char* a
\param bVector One of the vectors to be added \param bVector One of the vectors to be added
\param num_points The number of values in aVector and bVector to be added together and stored into cVector \param num_points The number of values in aVector and bVector to be added together and stored into cVector
*/ */
static inline void volk_gnsssdr_8i_x2_add_8i_generic(char* cVector, const char* aVector, const char* bVector, unsigned int num_points){ static inline void volk_gnsssdr_8i_x2_add_8i_generic(char* cVector, const char* aVector, const char* bVector, unsigned int num_points)
{
char* cPtr = cVector; char* cPtr = cVector;
const char* aPtr = aVector; const char* aPtr = aVector;
const char* bPtr= bVector; const char* bPtr= bVector;
unsigned int number = 0; unsigned int number = 0;
for(number = 0; number < num_points; number++){ for(number = 0; number < num_points; number++)
*cPtr++ = (*aPtr++) + (*bPtr++); {
} *cPtr++ = (*aPtr++) + (*bPtr++);
}
} }
#endif /* LV_HAVE_GENERIC */ #endif /* LV_HAVE_GENERIC */
@ -116,8 +118,8 @@ static inline void volk_gnsssdr_8i_x2_add_8i_generic(char* cVector, const char*
\param bVector One of the vectors to be added \param bVector One of the vectors to be added
\param num_points The number of values in aVector and bVector to be added together and stored into cVector \param num_points The number of values in aVector and bVector to be added together and stored into cVector
*/ */
static inline void volk_gnsssdr_8i_x2_add_8i_a_sse2(char* cVector, const char* aVector, const char* bVector, unsigned int num_points){ static inline void volk_gnsssdr_8i_x2_add_8i_a_sse2(char* cVector, const char* aVector, const char* bVector, unsigned int num_points)
{
const unsigned int sse_iters = num_points / 16; const unsigned int sse_iters = num_points / 16;
char* cPtr = cVector; char* cPtr = cVector;
@ -126,24 +128,24 @@ static inline void volk_gnsssdr_8i_x2_add_8i_a_sse2(char* cVector, const char* a
__m128i aVal, bVal, cVal; __m128i aVal, bVal, cVal;
for(unsigned int number = 0; number < sse_iters; number++){ for(unsigned int number = 0; number < sse_iters; number++)
{
aVal = _mm_load_si128((__m128i*)aPtr);
bVal = _mm_load_si128((__m128i*)bPtr);
aVal = _mm_load_si128((__m128i*)aPtr); cVal = _mm_add_epi8(aVal, bVal);
bVal = _mm_load_si128((__m128i*)bPtr);
cVal = _mm_add_epi8(aVal, bVal); _mm_store_si128((__m128i*)cPtr,cVal); // Store the results back into the C container
_mm_store_si128((__m128i*)cPtr,cVal); // Store the results back into the C container aPtr += 16;
bPtr += 16;
aPtr += 16; cPtr += 16;
bPtr += 16; }
cPtr += 16;
}
for(unsigned int i = 0; i<(num_points % 16); ++i) for(unsigned int i = 0; i<(num_points % 16); ++i)
{ {
*cPtr++ = (*aPtr++) + (*bPtr++); *cPtr++ = (*aPtr++) + (*bPtr++);
} }
} }
#endif /* LV_HAVE_SSE2 */ #endif /* LV_HAVE_SSE2 */
@ -155,15 +157,17 @@ static inline void volk_gnsssdr_8i_x2_add_8i_a_sse2(char* cVector, const char* a
\param bVector One of the vectors to be added \param bVector One of the vectors to be added
\param num_points The number of values in aVector and bVector to be added together and stored into cVector \param num_points The number of values in aVector and bVector to be added together and stored into cVector
*/ */
static inline void volk_gnsssdr_8i_x2_add_8i_a_generic(char* cVector, const char* aVector, const char* bVector, unsigned int num_points){ static inline void volk_gnsssdr_8i_x2_add_8i_a_generic(char* cVector, const char* aVector, const char* bVector, unsigned int num_points)
{
char* cPtr = cVector; char* cPtr = cVector;
const char* aPtr = aVector; const char* aPtr = aVector;
const char* bPtr= bVector; const char* bPtr= bVector;
unsigned int number = 0; unsigned int number = 0;
for(number = 0; number < num_points; number++){ for(number = 0; number < num_points; number++)
*cPtr++ = (*aPtr++) + (*bPtr++); {
} *cPtr++ = (*aPtr++) + (*bPtr++);
}
} }
#endif /* LV_HAVE_GENERIC */ #endif /* LV_HAVE_GENERIC */
@ -176,7 +180,8 @@ static inline void volk_gnsssdr_8i_x2_add_8i_a_generic(char* cVector, const char
\param num_points The number of values in aVector and bVector to be added together and stored into cVector \param num_points The number of values in aVector and bVector to be added together and stored into cVector
*/ */
extern void volk_gnsssdr_8i_x2_add_8i_a_orc_impl(char* cVector, const char* aVector, const char* bVector, unsigned int num_points); extern void volk_gnsssdr_8i_x2_add_8i_a_orc_impl(char* cVector, const char* aVector, const char* bVector, unsigned int num_points);
static inline void volk_gnsssdr_8i_x2_add_8i_u_orc(char* cVector, const char* aVector, const char* bVector, unsigned int num_points){ static inline void volk_gnsssdr_8i_x2_add_8i_u_orc(char* cVector, const char* aVector, const char* bVector, unsigned int num_points)
{
volk_gnsssdr_8i_x2_add_8i_a_orc_impl(cVector, aVector, bVector, num_points); volk_gnsssdr_8i_x2_add_8i_a_orc_impl(cVector, aVector, bVector, num_points);
} }
#endif /* LV_HAVE_ORC */ #endif /* LV_HAVE_ORC */

View File

@ -2,7 +2,7 @@
* \file volk_gnsssdr_8ic_conjugate_8ic.h * \file volk_gnsssdr_8ic_conjugate_8ic.h
* \brief Volk protokernel: calculates the conjugate of a 16 bits vector * \brief Volk protokernel: calculates the conjugate of a 16 bits vector
* \authors <ul> * \authors <ul>
* <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com * <li> Andres Cecilia, 2014. a.cecilia.luque(at)gmail.com
* </ul> * </ul>
* *
* Volk protokernel that calculates the conjugate of a * Volk protokernel that calculates the conjugate of a
@ -20,7 +20,7 @@
* GNSS-SDR is free software: you can redistribute it and/or modify * GNSS-SDR is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by * it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or * the Free Software Foundation, either version 3 of the License, or
* at your option) any later version. * (at your option) any later version.
* *
* GNSS-SDR is distributed in the hope that it will be useful, * GNSS-SDR is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of * but WITHOUT ANY WARRANTY; without even the implied warranty of
@ -48,7 +48,8 @@
\param aVector Vector to be conjugated \param aVector Vector to be conjugated
\param num_points The number of unsigned char values in aVector to be conjugated and stored into cVector \param num_points The number of unsigned char values in aVector to be conjugated and stored into cVector
*/ */
static inline void volk_gnsssdr_8ic_conjugate_8ic_u_avx(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){ static inline void volk_gnsssdr_8ic_conjugate_8ic_u_avx(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points)
{
const unsigned int sse_iters = num_points / 16; const unsigned int sse_iters = num_points / 16;
lv_8sc_t* c = cVector; lv_8sc_t* c = cVector;
@ -60,25 +61,25 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_u_avx(lv_8sc_t* cVector, const
__m128i conjugator2 = _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1); __m128i conjugator2 = _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1);
for (unsigned int i = 0; i < sse_iters; ++i) for (unsigned int i = 0; i < sse_iters; ++i)
{ {
tmp = _mm256_loadu_ps((float*)a); tmp = _mm256_loadu_ps((float*)a);
tmp = _mm256_xor_ps(tmp, conjugator1); tmp = _mm256_xor_ps(tmp, conjugator1);
tmp128lo = _mm256_castsi256_si128(_mm256_castps_si256(tmp)); tmp128lo = _mm256_castsi256_si128(_mm256_castps_si256(tmp));
tmp128lo = _mm_add_epi8(tmp128lo, conjugator2); tmp128lo = _mm_add_epi8(tmp128lo, conjugator2);
tmp128hi = _mm256_extractf128_si256(_mm256_castps_si256(tmp),1); tmp128hi = _mm256_extractf128_si256(_mm256_castps_si256(tmp),1);
tmp128hi = _mm_add_epi8(tmp128hi, conjugator2); tmp128hi = _mm_add_epi8(tmp128hi, conjugator2);
//tmp = _mm256_set_m128i(tmp128hi , tmp128lo); //not defined in some versions of immintrin.h //tmp = _mm256_set_m128i(tmp128hi , tmp128lo); //not defined in some versions of immintrin.h
tmp = _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(tmp128lo),(tmp128hi),1)); tmp = _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(tmp128lo),(tmp128hi),1));
_mm256_storeu_ps((float*)c, tmp); _mm256_storeu_ps((float*)c, tmp);
a += 16; a += 16;
c += 16; c += 16;
} }
for (unsigned int i = 0; i<(num_points % 16); ++i) for (unsigned int i = 0; i<(num_points % 16); ++i)
{ {
*c++ = lv_conj(*a++); *c++ = lv_conj(*a++);
} }
} }
#endif /* LV_HAVE_AVX */ #endif /* LV_HAVE_AVX */
@ -90,7 +91,8 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_u_avx(lv_8sc_t* cVector, const
\param aVector Vector to be conjugated \param aVector Vector to be conjugated
\param num_points The number of unsigned char values in aVector to be conjugated and stored into cVector \param num_points The number of unsigned char values in aVector to be conjugated and stored into cVector
*/ */
static inline void volk_gnsssdr_8ic_conjugate_8ic_u_ssse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){ static inline void volk_gnsssdr_8ic_conjugate_8ic_u_ssse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points)
{
const unsigned int sse_iters = num_points / 8; const unsigned int sse_iters = num_points / 8;
lv_8sc_t* c = cVector; lv_8sc_t* c = cVector;
@ -100,18 +102,18 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_u_ssse3(lv_8sc_t* cVector, con
__m128i conjugator = _mm_setr_epi8(1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1); __m128i conjugator = _mm_setr_epi8(1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1);
for (unsigned int i = 0; i < sse_iters; ++i) for (unsigned int i = 0; i < sse_iters; ++i)
{ {
tmp = _mm_lddqu_si128((__m128i*)a); tmp = _mm_lddqu_si128((__m128i*)a);
tmp = _mm_sign_epi8(tmp, conjugator); tmp = _mm_sign_epi8(tmp, conjugator);
_mm_storeu_si128((__m128i*)c, tmp); _mm_storeu_si128((__m128i*)c, tmp);
a += 8; a += 8;
c += 8; c += 8;
} }
for (unsigned int i = 0; i<(num_points % 8); ++i) for (unsigned int i = 0; i<(num_points % 8); ++i)
{ {
*c++ = lv_conj(*a++); *c++ = lv_conj(*a++);
} }
} }
#endif /* LV_HAVE_SSSE3 */ #endif /* LV_HAVE_SSSE3 */
@ -124,7 +126,8 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_u_ssse3(lv_8sc_t* cVector, con
\param aVector Vector to be conjugated \param aVector Vector to be conjugated
\param num_points The number of unsigned char values in aVector to be conjugated and stored into cVector \param num_points The number of unsigned char values in aVector to be conjugated and stored into cVector
*/ */
static inline void volk_gnsssdr_8ic_conjugate_8ic_u_sse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){ static inline void volk_gnsssdr_8ic_conjugate_8ic_u_sse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points)
{
const unsigned int sse_iters = num_points / 8; const unsigned int sse_iters = num_points / 8;
lv_8sc_t* c = cVector; lv_8sc_t* c = cVector;
@ -135,19 +138,19 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_u_sse3(lv_8sc_t* cVector, cons
__m128i conjugator2 = _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1); __m128i conjugator2 = _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1);
for (unsigned int i = 0; i < sse_iters; ++i) for (unsigned int i = 0; i < sse_iters; ++i)
{ {
tmp = _mm_lddqu_si128((__m128i*)a); tmp = _mm_lddqu_si128((__m128i*)a);
tmp = _mm_xor_si128(tmp, conjugator1); tmp = _mm_xor_si128(tmp, conjugator1);
tmp = _mm_add_epi8(tmp, conjugator2); tmp = _mm_add_epi8(tmp, conjugator2);
_mm_storeu_si128((__m128i*)c, tmp); _mm_storeu_si128((__m128i*)c, tmp);
a += 8; a += 8;
c += 8; c += 8;
} }
for (unsigned int i = 0; i<(num_points % 8); ++i) for (unsigned int i = 0; i<(num_points % 8); ++i)
{ {
*c++ = lv_conj(*a++); *c++ = lv_conj(*a++);
} }
} }
#endif /* LV_HAVE_SSE3 */ #endif /* LV_HAVE_SSE3 */
@ -159,14 +162,16 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_u_sse3(lv_8sc_t* cVector, cons
\param aVector Vector to be conjugated \param aVector Vector to be conjugated
\param num_points The number of unsigned char values in aVector to be conjugated and stored into cVector \param num_points The number of unsigned char values in aVector to be conjugated and stored into cVector
*/ */
static inline void volk_gnsssdr_8ic_conjugate_8ic_generic(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){ static inline void volk_gnsssdr_8ic_conjugate_8ic_generic(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points)
{
lv_8sc_t* cPtr = cVector; lv_8sc_t* cPtr = cVector;
const lv_8sc_t* aPtr = aVector; const lv_8sc_t* aPtr = aVector;
unsigned int number = 0; unsigned int number = 0;
for(number = 0; number < num_points; number++){ for(number = 0; number < num_points; number++)
*cPtr++ = lv_conj(*aPtr++); {
} *cPtr++ = lv_conj(*aPtr++);
}
} }
#endif /* LV_HAVE_GENERIC */ #endif /* LV_HAVE_GENERIC */
@ -188,7 +193,8 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_generic(lv_8sc_t* cVector, con
\param aVector Vector to be conjugated \param aVector Vector to be conjugated
\param num_points The number of unsigned char values in aVector to be conjugated and stored into cVector \param num_points The number of unsigned char values in aVector to be conjugated and stored into cVector
*/ */
static inline void volk_gnsssdr_8ic_conjugate_8ic_a_avx(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){ static inline void volk_gnsssdr_8ic_conjugate_8ic_a_avx(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points)
{
const unsigned int sse_iters = num_points / 16; const unsigned int sse_iters = num_points / 16;
lv_8sc_t* c = cVector; lv_8sc_t* c = cVector;
@ -200,25 +206,25 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_a_avx(lv_8sc_t* cVector, const
__m128i conjugator2 = _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1); __m128i conjugator2 = _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1);
for (unsigned int i = 0; i < sse_iters; ++i) for (unsigned int i = 0; i < sse_iters; ++i)
{ {
tmp = _mm256_load_ps((float*)a); tmp = _mm256_load_ps((float*)a);
tmp = _mm256_xor_ps(tmp, conjugator1); tmp = _mm256_xor_ps(tmp, conjugator1);
tmp128lo = _mm256_castsi256_si128(_mm256_castps_si256(tmp)); tmp128lo = _mm256_castsi256_si128(_mm256_castps_si256(tmp));
tmp128lo = _mm_add_epi8(tmp128lo, conjugator2); tmp128lo = _mm_add_epi8(tmp128lo, conjugator2);
tmp128hi = _mm256_extractf128_si256(_mm256_castps_si256(tmp),1); tmp128hi = _mm256_extractf128_si256(_mm256_castps_si256(tmp),1);
tmp128hi = _mm_add_epi8(tmp128hi, conjugator2); tmp128hi = _mm_add_epi8(tmp128hi, conjugator2);
//tmp = _mm256_set_m128i(tmp128hi , tmp128lo); //not defined in some versions of immintrin.h //tmp = _mm256_set_m128i(tmp128hi , tmp128lo); //not defined in some versions of immintrin.h
tmp = _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(tmp128lo),(tmp128hi),1)); tmp = _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(tmp128lo),(tmp128hi),1));
_mm256_store_ps((float*)c, tmp); _mm256_store_ps((float*)c, tmp);
a += 16; a += 16;
c += 16; c += 16;
} }
for (unsigned int i = 0; i<(num_points % 16); ++i) for (unsigned int i = 0; i<(num_points % 16); ++i)
{ {
*c++ = lv_conj(*a++); *c++ = lv_conj(*a++);
} }
} }
#endif /* LV_HAVE_AVX */ #endif /* LV_HAVE_AVX */
@ -230,7 +236,8 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_a_avx(lv_8sc_t* cVector, const
\param aVector Vector to be conjugated \param aVector Vector to be conjugated
\param num_points The number of unsigned char values in aVector to be conjugated and stored into cVector \param num_points The number of unsigned char values in aVector to be conjugated and stored into cVector
*/ */
static inline void volk_gnsssdr_8ic_conjugate_8ic_a_ssse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){ static inline void volk_gnsssdr_8ic_conjugate_8ic_a_ssse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points)
{
const unsigned int sse_iters = num_points / 8; const unsigned int sse_iters = num_points / 8;
lv_8sc_t* c = cVector; lv_8sc_t* c = cVector;
@ -240,18 +247,18 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_a_ssse3(lv_8sc_t* cVector, con
__m128i conjugator = _mm_setr_epi8(1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1); __m128i conjugator = _mm_setr_epi8(1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1);
for (unsigned int i = 0; i < sse_iters; ++i) for (unsigned int i = 0; i < sse_iters; ++i)
{ {
tmp = _mm_load_si128((__m128i*)a); tmp = _mm_load_si128((__m128i*)a);
tmp = _mm_sign_epi8(tmp, conjugator); tmp = _mm_sign_epi8(tmp, conjugator);
_mm_store_si128((__m128i*)c, tmp); _mm_store_si128((__m128i*)c, tmp);
a += 8; a += 8;
c += 8; c += 8;
} }
for (unsigned int i = 0; i<(num_points % 8); ++i) for (unsigned int i = 0; i<(num_points % 8); ++i)
{ {
*c++ = lv_conj(*a++); *c++ = lv_conj(*a++);
} }
} }
#endif /* LV_HAVE_SSSE3 */ #endif /* LV_HAVE_SSSE3 */
@ -264,7 +271,8 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_a_ssse3(lv_8sc_t* cVector, con
\param aVector Vector to be conjugated \param aVector Vector to be conjugated
\param num_points The number of unsigned char values in aVector to be conjugated and stored into cVector \param num_points The number of unsigned char values in aVector to be conjugated and stored into cVector
*/ */
static inline void volk_gnsssdr_8ic_conjugate_8ic_a_sse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){ static inline void volk_gnsssdr_8ic_conjugate_8ic_a_sse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points)
{
const unsigned int sse_iters = num_points / 8; const unsigned int sse_iters = num_points / 8;
lv_8sc_t* c = cVector; lv_8sc_t* c = cVector;
@ -275,19 +283,19 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_a_sse3(lv_8sc_t* cVector, cons
__m128i conjugator2 = _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1); __m128i conjugator2 = _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1);
for (unsigned int i = 0; i < sse_iters; ++i) for (unsigned int i = 0; i < sse_iters; ++i)
{ {
tmp = _mm_load_si128((__m128i*)a); tmp = _mm_load_si128((__m128i*)a);
tmp = _mm_xor_si128(tmp, conjugator1); tmp = _mm_xor_si128(tmp, conjugator1);
tmp = _mm_add_epi8(tmp, conjugator2); tmp = _mm_add_epi8(tmp, conjugator2);
_mm_store_si128((__m128i*)c, tmp); _mm_store_si128((__m128i*)c, tmp);
a += 8; a += 8;
c += 8; c += 8;
} }
for (unsigned int i = 0; i<(num_points % 8); ++i) for (unsigned int i = 0; i<(num_points % 8); ++i)
{ {
*c++ = lv_conj(*a++); *c++ = lv_conj(*a++);
} }
} }
#endif /* LV_HAVE_SSE3 */ #endif /* LV_HAVE_SSE3 */
@ -299,14 +307,16 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_a_sse3(lv_8sc_t* cVector, cons
\param aVector Vector to be conjugated \param aVector Vector to be conjugated
\param num_points The number of unsigned char values in aVector to be conjugated and stored into cVector \param num_points The number of unsigned char values in aVector to be conjugated and stored into cVector
*/ */
static inline void volk_gnsssdr_8ic_conjugate_8ic_a_generic(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){ static inline void volk_gnsssdr_8ic_conjugate_8ic_a_generic(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points)
{
lv_8sc_t* cPtr = cVector; lv_8sc_t* cPtr = cVector;
const lv_8sc_t* aPtr = aVector; const lv_8sc_t* aPtr = aVector;
unsigned int number = 0; unsigned int number = 0;
for(number = 0; number < num_points; number++){ for(number = 0; number < num_points; number++)
*cPtr++ = lv_conj(*aPtr++); {
} *cPtr++ = lv_conj(*aPtr++);
}
} }
#endif /* LV_HAVE_GENERIC */ #endif /* LV_HAVE_GENERIC */
@ -318,7 +328,8 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_a_generic(lv_8sc_t* cVector, c
\param num_points The number of unsigned char values in aVector to be conjugated and stored into cVector \param num_points The number of unsigned char values in aVector to be conjugated and stored into cVector
*/ */
extern void volk_gnsssdr_8ic_conjugate_8ic_a_orc_impl(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points); extern void volk_gnsssdr_8ic_conjugate_8ic_a_orc_impl(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points);
static inline void volk_gnsssdr_8ic_conjugate_8ic_u_orc(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){ static inline void volk_gnsssdr_8ic_conjugate_8ic_u_orc(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points)
{
volk_gnsssdr_8ic_conjugate_8ic_a_orc_impl(cVector, aVector, num_points); volk_gnsssdr_8ic_conjugate_8ic_a_orc_impl(cVector, aVector, num_points);
} }
#endif /* LV_HAVE_ORC */ #endif /* LV_HAVE_ORC */

View File

@ -2,7 +2,7 @@
* \file volk_gnsssdr_8ic_magnitude_squared_8i.h * \file volk_gnsssdr_8ic_magnitude_squared_8i.h
* \brief Volk protokernel: calculates the magnitude squared of a 16 bits vector * \brief Volk protokernel: calculates the magnitude squared of a 16 bits vector
* \authors <ul> * \authors <ul>
* <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com * <li> Andres Cecilia, 2014. a.cecilia.luque(at)gmail.com
* </ul> * </ul>
* *
* Volk protokernel that calculates the magnitude squared of a * Volk protokernel that calculates the magnitude squared of a
@ -21,7 +21,7 @@
* GNSS-SDR is free software: you can redistribute it and/or modify * GNSS-SDR is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by * it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or * the Free Software Foundation, either version 3 of the License, or
* at your option) any later version. * (at your option) any later version.
* *
* GNSS-SDR is distributed in the hope that it will be useful, * GNSS-SDR is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of * but WITHOUT ANY WARRANTY; without even the implied warranty of
@ -49,8 +49,8 @@
\param magnitudeVector The vector containing the real output values \param magnitudeVector The vector containing the real output values
\param num_points The number of complex values in complexVector to be calculated and stored into cVector \param num_points The number of complex values in complexVector to be calculated and stored into cVector
*/ */
static inline void volk_gnsssdr_8ic_magnitude_squared_8i_u_sse3(char* magnitudeVector, const lv_8sc_t* complexVector, unsigned int num_points){ static inline void volk_gnsssdr_8ic_magnitude_squared_8i_u_sse3(char* magnitudeVector, const lv_8sc_t* complexVector, unsigned int num_points)
{
const unsigned int sse_iters = num_points / 16; const unsigned int sse_iters = num_points / 16;
const char* complexVectorPtr = (char*)complexVector; const char* complexVectorPtr = (char*)complexVector;
@ -65,40 +65,38 @@ static inline void volk_gnsssdr_8ic_magnitude_squared_8i_u_sse3(char* magnitudeV
maskb = _mm_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); maskb = _mm_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
for(unsigned int number = 0;number < sse_iters; number++) for(unsigned int number = 0;number < sse_iters; number++)
{ {
avector = _mm_lddqu_si128((__m128i*)complexVectorPtr); avector = _mm_lddqu_si128((__m128i*)complexVectorPtr);
avectorlo = _mm_unpacklo_epi8 (avector, zero); avectorlo = _mm_unpacklo_epi8 (avector, zero);
avectorhi = _mm_unpackhi_epi8 (avector, zero); avectorhi = _mm_unpackhi_epi8 (avector, zero);
avectorlomult = _mm_mullo_epi16 (avectorlo, avectorlo); avectorlomult = _mm_mullo_epi16 (avectorlo, avectorlo);
avectorhimult = _mm_mullo_epi16 (avectorhi, avectorhi); avectorhimult = _mm_mullo_epi16 (avectorhi, avectorhi);
aadded = _mm_hadd_epi16 (avectorlomult, avectorhimult); aadded = _mm_hadd_epi16 (avectorlomult, avectorhimult);
complexVectorPtr += 16; complexVectorPtr += 16;
bvector = _mm_lddqu_si128((__m128i*)complexVectorPtr); bvector = _mm_lddqu_si128((__m128i*)complexVectorPtr);
bvectorlo = _mm_unpacklo_epi8 (bvector, zero); bvectorlo = _mm_unpacklo_epi8 (bvector, zero);
bvectorhi = _mm_unpackhi_epi8 (bvector, zero); bvectorhi = _mm_unpackhi_epi8 (bvector, zero);
bvectorlomult = _mm_mullo_epi16 (bvectorlo, bvectorlo); bvectorlomult = _mm_mullo_epi16 (bvectorlo, bvectorlo);
bvectorhimult = _mm_mullo_epi16 (bvectorhi, bvectorhi); bvectorhimult = _mm_mullo_epi16 (bvectorhi, bvectorhi);
badded = _mm_hadd_epi16 (bvectorlomult, bvectorhimult); badded = _mm_hadd_epi16 (bvectorlomult, bvectorhimult);
complexVectorPtr += 16; complexVectorPtr += 16;
result8 = _mm_or_si128(_mm_shuffle_epi8(aadded, maska), _mm_shuffle_epi8(badded, maskb)); result8 = _mm_or_si128(_mm_shuffle_epi8(aadded, maska), _mm_shuffle_epi8(badded, maskb));
_mm_storeu_si128((__m128i*)magnitudeVectorPtr, result8); _mm_storeu_si128((__m128i*)magnitudeVectorPtr, result8);
magnitudeVectorPtr += 16; magnitudeVectorPtr += 16;
}
}
for (unsigned int i = 0; i<(num_points % 16); ++i) for (unsigned int i = 0; i<(num_points % 16); ++i)
{ {
const char valReal = *complexVectorPtr++; const char valReal = *complexVectorPtr++;
const char valImag = *complexVectorPtr++; const char valImag = *complexVectorPtr++;
*magnitudeVectorPtr++ = (valReal * valReal) + (valImag * valImag); *magnitudeVectorPtr++ = (valReal * valReal) + (valImag * valImag);
} }
} }
#endif /* LV_HAVE_SSSE3 */ #endif /* LV_HAVE_SSSE3 */
@ -155,15 +153,17 @@ static inline void volk_gnsssdr_8ic_magnitude_squared_8i_u_sse3(char* magnitudeV
\param magnitudeVector The vector containing the real output values \param magnitudeVector The vector containing the real output values
\param num_points The number of complex values in complexVector to be calculated and stored into cVector \param num_points The number of complex values in complexVector to be calculated and stored into cVector
*/ */
static inline void volk_gnsssdr_8ic_magnitude_squared_8i_generic(char* magnitudeVector, const lv_8sc_t* complexVector, unsigned int num_points){ static inline void volk_gnsssdr_8ic_magnitude_squared_8i_generic(char* magnitudeVector, const lv_8sc_t* complexVector, unsigned int num_points)
{
const char* complexVectorPtr = (char*)complexVector; const char* complexVectorPtr = (char*)complexVector;
char* magnitudeVectorPtr = magnitudeVector; char* magnitudeVectorPtr = magnitudeVector;
for(unsigned int number = 0; number < num_points; number++){ for(unsigned int number = 0; number < num_points; number++)
const char real = *complexVectorPtr++; {
const char imag = *complexVectorPtr++; const char real = *complexVectorPtr++;
*magnitudeVectorPtr++ = (real*real) + (imag*imag); const char imag = *complexVectorPtr++;
} *magnitudeVectorPtr++ = (real*real) + (imag*imag);
}
} }
#endif /* LV_HAVE_GENERIC */ #endif /* LV_HAVE_GENERIC */
@ -185,8 +185,8 @@ static inline void volk_gnsssdr_8ic_magnitude_squared_8i_generic(char* magnitude
\param magnitudeVector The vector containing the real output values \param magnitudeVector The vector containing the real output values
\param num_points The number of complex values in complexVector to be calculated and stored into cVector \param num_points The number of complex values in complexVector to be calculated and stored into cVector
*/ */
static inline void volk_gnsssdr_8ic_magnitude_squared_8i_a_sse3(char* magnitudeVector, const lv_8sc_t* complexVector, unsigned int num_points){ static inline void volk_gnsssdr_8ic_magnitude_squared_8i_a_sse3(char* magnitudeVector, const lv_8sc_t* complexVector, unsigned int num_points)
{
const unsigned int sse_iters = num_points / 16; const unsigned int sse_iters = num_points / 16;
const char* complexVectorPtr = (char*)complexVector; const char* complexVectorPtr = (char*)complexVector;
@ -201,40 +201,38 @@ static inline void volk_gnsssdr_8ic_magnitude_squared_8i_a_sse3(char* magnitudeV
maskb = _mm_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); maskb = _mm_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
for(unsigned int number = 0;number < sse_iters; number++) for(unsigned int number = 0;number < sse_iters; number++)
{ {
avector = _mm_load_si128((__m128i*)complexVectorPtr); avector = _mm_load_si128((__m128i*)complexVectorPtr);
avectorlo = _mm_unpacklo_epi8 (avector, zero); avectorlo = _mm_unpacklo_epi8 (avector, zero);
avectorhi = _mm_unpackhi_epi8 (avector, zero); avectorhi = _mm_unpackhi_epi8 (avector, zero);
avectorlomult = _mm_mullo_epi16 (avectorlo, avectorlo); avectorlomult = _mm_mullo_epi16 (avectorlo, avectorlo);
avectorhimult = _mm_mullo_epi16 (avectorhi, avectorhi); avectorhimult = _mm_mullo_epi16 (avectorhi, avectorhi);
aadded = _mm_hadd_epi16 (avectorlomult, avectorhimult); aadded = _mm_hadd_epi16 (avectorlomult, avectorhimult);
complexVectorPtr += 16; complexVectorPtr += 16;
bvector = _mm_load_si128((__m128i*)complexVectorPtr); bvector = _mm_load_si128((__m128i*)complexVectorPtr);
bvectorlo = _mm_unpacklo_epi8 (bvector, zero); bvectorlo = _mm_unpacklo_epi8 (bvector, zero);
bvectorhi = _mm_unpackhi_epi8 (bvector, zero); bvectorhi = _mm_unpackhi_epi8 (bvector, zero);
bvectorlomult = _mm_mullo_epi16 (bvectorlo, bvectorlo); bvectorlomult = _mm_mullo_epi16 (bvectorlo, bvectorlo);
bvectorhimult = _mm_mullo_epi16 (bvectorhi, bvectorhi); bvectorhimult = _mm_mullo_epi16 (bvectorhi, bvectorhi);
badded = _mm_hadd_epi16 (bvectorlomult, bvectorhimult); badded = _mm_hadd_epi16 (bvectorlomult, bvectorhimult);
complexVectorPtr += 16; complexVectorPtr += 16;
result8 = _mm_or_si128(_mm_shuffle_epi8(aadded, maska), _mm_shuffle_epi8(badded, maskb)); result8 = _mm_or_si128(_mm_shuffle_epi8(aadded, maska), _mm_shuffle_epi8(badded, maskb));
_mm_store_si128((__m128i*)magnitudeVectorPtr, result8); _mm_store_si128((__m128i*)magnitudeVectorPtr, result8);
magnitudeVectorPtr += 16; magnitudeVectorPtr += 16;
}
}
for (unsigned int i = 0; i<(num_points % 16); ++i) for (unsigned int i = 0; i<(num_points % 16); ++i)
{ {
const char valReal = *complexVectorPtr++; const char valReal = *complexVectorPtr++;
const char valImag = *complexVectorPtr++; const char valImag = *complexVectorPtr++;
*magnitudeVectorPtr++ = (valReal * valReal) + (valImag * valImag); *magnitudeVectorPtr++ = (valReal * valReal) + (valImag * valImag);
} }
} }
#endif /* LV_HAVE_SSSE3 */ #endif /* LV_HAVE_SSSE3 */
@ -291,15 +289,17 @@ static inline void volk_gnsssdr_8ic_magnitude_squared_8i_a_sse3(char* magnitudeV
\param magnitudeVector The vector containing the real output values \param magnitudeVector The vector containing the real output values
\param num_points The number of complex values in complexVector to be calculated and stored into cVector \param num_points The number of complex values in complexVector to be calculated and stored into cVector
*/ */
static inline void volk_gnsssdr_8ic_magnitude_squared_8i_a_generic(char* magnitudeVector, const lv_8sc_t* complexVector, unsigned int num_points){ static inline void volk_gnsssdr_8ic_magnitude_squared_8i_a_generic(char* magnitudeVector, const lv_8sc_t* complexVector, unsigned int num_points)
{
const char* complexVectorPtr = (char*)complexVector; const char* complexVectorPtr = (char*)complexVector;
char* magnitudeVectorPtr = magnitudeVector; char* magnitudeVectorPtr = magnitudeVector;
for(unsigned int number = 0; number < num_points; number++){ for(unsigned int number = 0; number < num_points; number++)
const char real = *complexVectorPtr++; {
const char imag = *complexVectorPtr++; const char real = *complexVectorPtr++;
*magnitudeVectorPtr++ = (real*real) + (imag*imag); const char imag = *complexVectorPtr++;
} *magnitudeVectorPtr++ = (real*real) + (imag*imag);
}
} }
#endif /* LV_HAVE_GENERIC */ #endif /* LV_HAVE_GENERIC */
@ -311,7 +311,8 @@ static inline void volk_gnsssdr_8ic_magnitude_squared_8i_a_generic(char* magnitu
\param num_points The number of complex values in complexVector to be calculated and stored into cVector \param num_points The number of complex values in complexVector to be calculated and stored into cVector
*/ */
extern void volk_gnsssdr_8ic_magnitude_squared_8i_a_orc_impl(char* magnitudeVector, const lv_8sc_t* complexVector, unsigned int num_points); extern void volk_gnsssdr_8ic_magnitude_squared_8i_a_orc_impl(char* magnitudeVector, const lv_8sc_t* complexVector, unsigned int num_points);
static inline void volk_gnsssdr_8ic_magnitude_squared_8i_u_orc(char* magnitudeVector, const lv_8sc_t* complexVector, unsigned int num_points){ static inline void volk_gnsssdr_8ic_magnitude_squared_8i_u_orc(char* magnitudeVector, const lv_8sc_t* complexVector, unsigned int num_points)
{
volk_gnsssdr_8ic_magnitude_squared_8i_a_orc_impl(magnitudeVector, complexVector, num_points); volk_gnsssdr_8ic_magnitude_squared_8i_a_orc_impl(magnitudeVector, complexVector, num_points);
} }
#endif /* LV_HAVE_ORC */ #endif /* LV_HAVE_ORC */

View File

@ -2,7 +2,7 @@
* \file volk_gnsssdr_8ic_s8ic_multiply_8ic.h * \file volk_gnsssdr_8ic_s8ic_multiply_8ic.h
* \brief Volk protokernel: multiplies a group of 16 bits vectors by one constant vector * \brief Volk protokernel: multiplies a group of 16 bits vectors by one constant vector
* \authors <ul> * \authors <ul>
* <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com * <li> Andres Cecilia, 2014. a.cecilia.luque(at)gmail.com
* </ul> * </ul>
* *
* Volk protokernel that multiplies a group of 16 bits vectors * Volk protokernel that multiplies a group of 16 bits vectors
@ -20,7 +20,7 @@
* GNSS-SDR is free software: you can redistribute it and/or modify * GNSS-SDR is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by * it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or * the Free Software Foundation, either version 3 of the License, or
* at your option) any later version. * (at your option) any later version.
* *
* GNSS-SDR is distributed in the hope that it will be useful, * GNSS-SDR is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of * but WITHOUT ANY WARRANTY; without even the implied warranty of
@ -50,8 +50,8 @@
\param scalar The complex scalar to multiply aVector \param scalar The complex scalar to multiply aVector
\param num_points The number of complex values in aVector to be multiplied by sacalar and stored into cVector \param num_points The number of complex values in aVector to be multiplied by sacalar and stored into cVector
*/ */
static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_u_sse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t scalar, unsigned int num_points){ static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_u_sse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t scalar, unsigned int num_points)
{
const unsigned int sse_iters = num_points / 8; const unsigned int sse_iters = num_points / 8;
__m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc; __m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc;
@ -66,37 +66,37 @@ static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_u_sse3(lv_8sc_t* cVector,
imagy = _mm_and_si128 (imagy, mult1); imagy = _mm_and_si128 (imagy, mult1);
realy = _mm_and_si128 (y, mult1); realy = _mm_and_si128 (y, mult1);
for(unsigned int number = 0;number < sse_iters; number++){ for(unsigned int number = 0;number < sse_iters; number++)
{
x = _mm_lddqu_si128((__m128i*)a);
x = _mm_lddqu_si128((__m128i*)a); imagx = _mm_srli_si128 (x, 1);
imagx = _mm_and_si128 (imagx, mult1);
realx = _mm_and_si128 (x, mult1);
imagx = _mm_srli_si128 (x, 1); realx_mult_realy = _mm_mullo_epi16 (realx, realy);
imagx = _mm_and_si128 (imagx, mult1); imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
realx = _mm_and_si128 (x, mult1); realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
realx_mult_realy = _mm_mullo_epi16 (realx, realy); realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); realc = _mm_and_si128 (realc, mult1);
realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); imagc = _mm_and_si128 (imagc, mult1);
imagc = _mm_slli_si128 (imagc, 1);
realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); totalc = _mm_or_si128 (realc, imagc);
realc = _mm_and_si128 (realc, mult1);
imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
imagc = _mm_and_si128 (imagc, mult1);
imagc = _mm_slli_si128 (imagc, 1);
totalc = _mm_or_si128 (realc, imagc); _mm_storeu_si128((__m128i*)c, totalc);
_mm_storeu_si128((__m128i*)c, totalc); a += 8;
c += 8;
a += 8; }
c += 8;
}
for (unsigned int i = 0; i<(num_points % 8); ++i) for (unsigned int i = 0; i<(num_points % 8); ++i)
{ {
*c++ = (*a++) * scalar; *c++ = (*a++) * scalar;
} }
} }
#endif /* LV_HAVE_SSE3 */ #endif /* LV_HAVE_SSE3 */
@ -109,8 +109,8 @@ static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_u_sse3(lv_8sc_t* cVector,
\param scalar The complex scalar to multiply aVector \param scalar The complex scalar to multiply aVector
\param num_points The number of complex values in aVector to be multiplied by sacalar and stored into cVector \param num_points The number of complex values in aVector to be multiplied by sacalar and stored into cVector
*/ */
static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_generic(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t scalar, unsigned int num_points){ static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_generic(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t scalar, unsigned int num_points)
{
/*lv_8sc_t* cPtr = cVector; /*lv_8sc_t* cPtr = cVector;
const lv_8sc_t* aPtr = aVector; const lv_8sc_t* aPtr = aVector;
@ -124,17 +124,18 @@ static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_generic(lv_8sc_t* cVector,
unsigned int number = num_points; unsigned int number = num_points;
// unwrap loop // unwrap loop
while (number >= 8){ while (number >= 8)
*cPtr++ = (*aPtr++) * scalar; {
*cPtr++ = (*aPtr++) * scalar; *cPtr++ = (*aPtr++) * scalar;
*cPtr++ = (*aPtr++) * scalar; *cPtr++ = (*aPtr++) * scalar;
*cPtr++ = (*aPtr++) * scalar; *cPtr++ = (*aPtr++) * scalar;
*cPtr++ = (*aPtr++) * scalar; *cPtr++ = (*aPtr++) * scalar;
*cPtr++ = (*aPtr++) * scalar; *cPtr++ = (*aPtr++) * scalar;
*cPtr++ = (*aPtr++) * scalar; *cPtr++ = (*aPtr++) * scalar;
*cPtr++ = (*aPtr++) * scalar; *cPtr++ = (*aPtr++) * scalar;
number -= 8; *cPtr++ = (*aPtr++) * scalar;
} number -= 8;
}
// clean up any remaining // clean up any remaining
while (number-- > 0) while (number-- > 0)
@ -162,8 +163,8 @@ static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_generic(lv_8sc_t* cVector,
\param scalar The complex scalar to multiply aVector \param scalar The complex scalar to multiply aVector
\param num_points The number of complex values in aVector to be multiplied by sacalar and stored into cVector \param num_points The number of complex values in aVector to be multiplied by sacalar and stored into cVector
*/ */
static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_a_sse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t scalar, unsigned int num_points){ static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_a_sse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t scalar, unsigned int num_points)
{
const unsigned int sse_iters = num_points / 8; const unsigned int sse_iters = num_points / 8;
__m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc; __m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc;
@ -178,37 +179,37 @@ static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_a_sse3(lv_8sc_t* cVector,
imagy = _mm_and_si128 (imagy, mult1); imagy = _mm_and_si128 (imagy, mult1);
realy = _mm_and_si128 (y, mult1); realy = _mm_and_si128 (y, mult1);
for(unsigned int number = 0;number < sse_iters; number++){ for(unsigned int number = 0;number < sse_iters; number++)
{
x = _mm_load_si128((__m128i*)a);
x = _mm_load_si128((__m128i*)a); imagx = _mm_srli_si128 (x, 1);
imagx = _mm_and_si128 (imagx, mult1);
realx = _mm_and_si128 (x, mult1);
imagx = _mm_srli_si128 (x, 1); realx_mult_realy = _mm_mullo_epi16 (realx, realy);
imagx = _mm_and_si128 (imagx, mult1); imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
realx = _mm_and_si128 (x, mult1); realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
realx_mult_realy = _mm_mullo_epi16 (realx, realy); realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); realc = _mm_and_si128 (realc, mult1);
realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); imagc = _mm_and_si128 (imagc, mult1);
imagc = _mm_slli_si128 (imagc, 1);
realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); totalc = _mm_or_si128 (realc, imagc);
realc = _mm_and_si128 (realc, mult1);
imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
imagc = _mm_and_si128 (imagc, mult1);
imagc = _mm_slli_si128 (imagc, 1);
totalc = _mm_or_si128 (realc, imagc); _mm_store_si128((__m128i*)c, totalc);
_mm_store_si128((__m128i*)c, totalc); a += 8;
c += 8;
a += 8; }
c += 8;
}
for (unsigned int i = 0; i<(num_points % 8); ++i) for (unsigned int i = 0; i<(num_points % 8); ++i)
{ {
*c++ = (*a++) * scalar; *c++ = (*a++) * scalar;
} }
} }
#endif /* LV_HAVE_SSE3 */ #endif /* LV_HAVE_SSE3 */
@ -221,8 +222,8 @@ static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_a_sse3(lv_8sc_t* cVector,
\param scalar The complex scalar to multiply aVector \param scalar The complex scalar to multiply aVector
\param num_points The number of complex values in aVector to be multiplied by sacalar and stored into cVector \param num_points The number of complex values in aVector to be multiplied by sacalar and stored into cVector
*/ */
static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_a_generic(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t scalar, unsigned int num_points){ static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_a_generic(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t scalar, unsigned int num_points)
{
/*lv_8sc_t* cPtr = cVector; /*lv_8sc_t* cPtr = cVector;
const lv_8sc_t* aPtr = aVector; const lv_8sc_t* aPtr = aVector;
@ -237,15 +238,15 @@ static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_a_generic(lv_8sc_t* cVecto
// unwrap loop // unwrap loop
while (number >= 8){ while (number >= 8){
*cPtr++ = (*aPtr++) * scalar; *cPtr++ = (*aPtr++) * scalar;
*cPtr++ = (*aPtr++) * scalar; *cPtr++ = (*aPtr++) * scalar;
*cPtr++ = (*aPtr++) * scalar; *cPtr++ = (*aPtr++) * scalar;
*cPtr++ = (*aPtr++) * scalar; *cPtr++ = (*aPtr++) * scalar;
*cPtr++ = (*aPtr++) * scalar; *cPtr++ = (*aPtr++) * scalar;
*cPtr++ = (*aPtr++) * scalar; *cPtr++ = (*aPtr++) * scalar;
*cPtr++ = (*aPtr++) * scalar; *cPtr++ = (*aPtr++) * scalar;
*cPtr++ = (*aPtr++) * scalar; *cPtr++ = (*aPtr++) * scalar;
number -= 8; number -= 8;
} }
// clean up any remaining // clean up any remaining
@ -263,7 +264,8 @@ static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_a_generic(lv_8sc_t* cVecto
\param num_points The number of complex values in aVector to be multiplied by sacalar and stored into cVector \param num_points The number of complex values in aVector to be multiplied by sacalar and stored into cVector
*/ */
extern void volk_gnsssdr_8ic_s8ic_multiply_8ic_a_orc_impl(lv_8sc_t* cVector, const lv_8sc_t* aVector, const char scalarreal, const char scalarimag, unsigned int num_points); extern void volk_gnsssdr_8ic_s8ic_multiply_8ic_a_orc_impl(lv_8sc_t* cVector, const lv_8sc_t* aVector, const char scalarreal, const char scalarimag, unsigned int num_points);
static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_u_orc(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t scalar, unsigned int num_points){ static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_u_orc(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t scalar, unsigned int num_points)
{
volk_gnsssdr_8ic_s8ic_multiply_8ic_a_orc_impl(cVector, aVector, lv_creal(scalar), lv_cimag(scalar), num_points); volk_gnsssdr_8ic_s8ic_multiply_8ic_a_orc_impl(cVector, aVector, lv_creal(scalar), lv_cimag(scalar), num_points);
} }
#endif /* LV_HAVE_ORC */ #endif /* LV_HAVE_ORC */

View File

@ -2,7 +2,7 @@
* \file volk_gnsssdr_8ic_x2_dot_prod_8ic.h * \file volk_gnsssdr_8ic_x2_dot_prod_8ic.h
* \brief Volk protokernel: multiplies two 16 bits vectors and accumulates them * \brief Volk protokernel: multiplies two 16 bits vectors and accumulates them
* \authors <ul> * \authors <ul>
* <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com * <li> Andres Cecilia, 2014. a.cecilia.luque(at)gmail.com
* </ul> * </ul>
* *
* Volk protokernel that multiplies two 16 bits vectors (8 bits the real part * Volk protokernel that multiplies two 16 bits vectors (8 bits the real part
@ -20,7 +20,7 @@
* GNSS-SDR is free software: you can redistribute it and/or modify * GNSS-SDR is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by * it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or * the Free Software Foundation, either version 3 of the License, or
* at your option) any later version. * (at your option) any later version.
* *
* GNSS-SDR is distributed in the hope that it will be useful, * GNSS-SDR is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of * but WITHOUT ANY WARRANTY; without even the implied warranty of
@ -49,8 +49,8 @@
\param bVector One of the vectors to be multiplied and accumulated \param bVector One of the vectors to be multiplied and accumulated
\param num_points The number of complex values in aVector and bVector to be multiplied together, accumulated and stored into cVector \param num_points The number of complex values in aVector and bVector to be multiplied together, accumulated and stored into cVector
*/ */
static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_generic(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points) { static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_generic(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points)
{
/*lv_8sc_t* cPtr = result; /*lv_8sc_t* cPtr = result;
const lv_8sc_t* aPtr = input; const lv_8sc_t* aPtr = input;
const lv_8sc_t* bPtr = taps; const lv_8sc_t* bPtr = taps;
@ -69,23 +69,25 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_generic(lv_8sc_t* result, co
char sum1[2] = {0,0}; char sum1[2] = {0,0};
unsigned int i = 0; unsigned int i = 0;
for(i = 0; i < n_2_ccomplex_blocks; ++i) { for(i = 0; i < n_2_ccomplex_blocks; ++i)
sum0[0] += in[0] * tp[0] - in[1] * tp[1]; {
sum0[1] += in[0] * tp[1] + in[1] * tp[0]; sum0[0] += in[0] * tp[0] - in[1] * tp[1];
sum1[0] += in[2] * tp[2] - in[3] * tp[3]; sum0[1] += in[0] * tp[1] + in[1] * tp[0];
sum1[1] += in[2] * tp[3] + in[3] * tp[2]; sum1[0] += in[2] * tp[2] - in[3] * tp[3];
sum1[1] += in[2] * tp[3] + in[3] * tp[2];
in += 4; in += 4;
tp += 4; tp += 4;
} }
res[0] = sum0[0] + sum1[0]; res[0] = sum0[0] + sum1[0];
res[1] = sum0[1] + sum1[1]; res[1] = sum0[1] + sum1[1];
// Cleanup if we had an odd number of points // Cleanup if we had an odd number of points
for(i = 0; i < isodd; ++i) { for(i = 0; i < isodd; ++i)
*result += input[num_points - 1] * taps[num_points - 1]; {
} *result += input[num_points - 1] * taps[num_points - 1];
}
} }
#endif /*LV_HAVE_GENERIC*/ #endif /*LV_HAVE_GENERIC*/
@ -99,8 +101,8 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_generic(lv_8sc_t* result, co
\param bVector One of the vectors to be multiplied and accumulated \param bVector One of the vectors to be multiplied and accumulated
\param num_points The number of complex values in aVector and bVector to be multiplied together, accumulated and stored into cVector \param num_points The number of complex values in aVector and bVector to be multiplied together, accumulated and stored into cVector
*/ */
static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_sse2(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points) { static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_sse2(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points)
{
lv_8sc_t dotProduct; lv_8sc_t dotProduct;
memset(&dotProduct, 0x0, 2*sizeof(char)); memset(&dotProduct, 0x0, 2*sizeof(char));
@ -110,61 +112,61 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_sse2(lv_8sc_t* result, con
const unsigned int sse_iters = num_points/8; const unsigned int sse_iters = num_points/8;
if (sse_iters>0) if (sse_iters>0)
{
__m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc, realcacc, imagcacc;
mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
realcacc = _mm_setzero_si128();
imagcacc = _mm_setzero_si128();
for(unsigned int number = 0; number < sse_iters; number++){
x = _mm_loadu_si128((__m128i*)a);
y = _mm_loadu_si128((__m128i*)b);
imagx = _mm_srli_si128 (x, 1);
imagx = _mm_and_si128 (imagx, mult1);
realx = _mm_and_si128 (x, mult1);
imagy = _mm_srli_si128 (y, 1);
imagy = _mm_and_si128 (imagy, mult1);
realy = _mm_and_si128 (y, mult1);
realx_mult_realy = _mm_mullo_epi16 (realx, realy);
imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
realcacc = _mm_add_epi16 (realcacc, realc);
imagcacc = _mm_add_epi16 (imagcacc, imagc);
a += 8;
b += 8;
}
realcacc = _mm_and_si128 (realcacc, mult1);
imagcacc = _mm_and_si128 (imagcacc, mult1);
imagcacc = _mm_slli_si128 (imagcacc, 1);
totalc = _mm_or_si128 (realcacc, imagcacc);
__VOLK_ATTR_ALIGNED(16) lv_8sc_t dotProductVector[8];
_mm_storeu_si128((__m128i*)dotProductVector,totalc); // Store the results back into the dot product vector
for (int i = 0; i<8; ++i)
{ {
dotProduct += dotProductVector[i]; __m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc, realcacc, imagcacc;
mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
realcacc = _mm_setzero_si128();
imagcacc = _mm_setzero_si128();
for(unsigned int number = 0; number < sse_iters; number++)
{
x = _mm_loadu_si128((__m128i*)a);
y = _mm_loadu_si128((__m128i*)b);
imagx = _mm_srli_si128 (x, 1);
imagx = _mm_and_si128 (imagx, mult1);
realx = _mm_and_si128 (x, mult1);
imagy = _mm_srli_si128 (y, 1);
imagy = _mm_and_si128 (imagy, mult1);
realy = _mm_and_si128 (y, mult1);
realx_mult_realy = _mm_mullo_epi16 (realx, realy);
imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
realcacc = _mm_add_epi16 (realcacc, realc);
imagcacc = _mm_add_epi16 (imagcacc, imagc);
a += 8;
b += 8;
}
realcacc = _mm_and_si128 (realcacc, mult1);
imagcacc = _mm_and_si128 (imagcacc, mult1);
imagcacc = _mm_slli_si128 (imagcacc, 1);
totalc = _mm_or_si128 (realcacc, imagcacc);
__VOLK_ATTR_ALIGNED(16) lv_8sc_t dotProductVector[8];
_mm_storeu_si128((__m128i*)dotProductVector,totalc); // Store the results back into the dot product vector
for (int i = 0; i<8; ++i)
{
dotProduct += dotProductVector[i];
}
} }
}
for (unsigned int i = 0; i<(num_points % 8); ++i) for (unsigned int i = 0; i<(num_points % 8); ++i)
{ {
dotProduct += (*a++) * (*b++); dotProduct += (*a++) * (*b++);
} }
*result = dotProduct; *result = dotProduct;
} }
@ -180,8 +182,8 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_sse2(lv_8sc_t* result, con
\param bVector One of the vectors to be multiplied and accumulated \param bVector One of the vectors to be multiplied and accumulated
\param num_points The number of complex values in aVector and bVector to be multiplied together, accumulated and stored into cVector \param num_points The number of complex values in aVector and bVector to be multiplied together, accumulated and stored into cVector
*/ */
static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_sse4_1(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points) { static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_sse4_1(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points)
{
lv_8sc_t dotProduct; lv_8sc_t dotProduct;
memset(&dotProduct, 0x0, 2*sizeof(char)); memset(&dotProduct, 0x0, 2*sizeof(char));
@ -191,59 +193,59 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_sse4_1(lv_8sc_t* result, c
const unsigned int sse_iters = num_points/8; const unsigned int sse_iters = num_points/8;
if (sse_iters>0) if (sse_iters>0)
{
__m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc, realcacc, imagcacc;
mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
realcacc = _mm_setzero_si128();
imagcacc = _mm_setzero_si128();
for(unsigned int number = 0; number < sse_iters; number++){
x = _mm_lddqu_si128((__m128i*)a);
y = _mm_lddqu_si128((__m128i*)b);
imagx = _mm_srli_si128 (x, 1);
imagx = _mm_and_si128 (imagx, mult1);
realx = _mm_and_si128 (x, mult1);
imagy = _mm_srli_si128 (y, 1);
imagy = _mm_and_si128 (imagy, mult1);
realy = _mm_and_si128 (y, mult1);
realx_mult_realy = _mm_mullo_epi16 (realx, realy);
imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
realcacc = _mm_add_epi16 (realcacc, realc);
imagcacc = _mm_add_epi16 (imagcacc, imagc);
a += 8;
b += 8;
}
imagcacc = _mm_slli_si128 (imagcacc, 1);
totalc = _mm_blendv_epi8 (imagcacc, realcacc, mult1);
__VOLK_ATTR_ALIGNED(16) lv_8sc_t dotProductVector[8];
_mm_storeu_si128((__m128i*)dotProductVector,totalc); // Store the results back into the dot product vector
for (unsigned int i = 0; i<8; ++i)
{ {
dotProduct += dotProductVector[i]; __m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc, realcacc, imagcacc;
mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
realcacc = _mm_setzero_si128();
imagcacc = _mm_setzero_si128();
for(unsigned int number = 0; number < sse_iters; number++)
{
x = _mm_lddqu_si128((__m128i*)a);
y = _mm_lddqu_si128((__m128i*)b);
imagx = _mm_srli_si128 (x, 1);
imagx = _mm_and_si128 (imagx, mult1);
realx = _mm_and_si128 (x, mult1);
imagy = _mm_srli_si128 (y, 1);
imagy = _mm_and_si128 (imagy, mult1);
realy = _mm_and_si128 (y, mult1);
realx_mult_realy = _mm_mullo_epi16 (realx, realy);
imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
realcacc = _mm_add_epi16 (realcacc, realc);
imagcacc = _mm_add_epi16 (imagcacc, imagc);
a += 8;
b += 8;
}
imagcacc = _mm_slli_si128 (imagcacc, 1);
totalc = _mm_blendv_epi8 (imagcacc, realcacc, mult1);
__VOLK_ATTR_ALIGNED(16) lv_8sc_t dotProductVector[8];
_mm_storeu_si128((__m128i*)dotProductVector,totalc); // Store the results back into the dot product vector
for (unsigned int i = 0; i<8; ++i)
{
dotProduct += dotProductVector[i];
}
} }
}
for (unsigned int i = 0; i<(num_points % 8); ++i) for (unsigned int i = 0; i<(num_points % 8); ++i)
{ {
dotProduct += (*a++) * (*b++); dotProduct += (*a++) * (*b++);
} }
*result = dotProduct; *result = dotProduct;
} }
@ -270,8 +272,8 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_sse4_1(lv_8sc_t* result, c
\param bVector One of the vectors to be multiplied and accumulated \param bVector One of the vectors to be multiplied and accumulated
\param num_points The number of complex values in aVector and bVector to be multiplied together, accumulated and stored into cVector \param num_points The number of complex values in aVector and bVector to be multiplied together, accumulated and stored into cVector
*/ */
static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_generic(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points) { static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_generic(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points)
{
/*lv_8sc_t* cPtr = result; /*lv_8sc_t* cPtr = result;
const lv_8sc_t* aPtr = input; const lv_8sc_t* aPtr = input;
const lv_8sc_t* bPtr = taps; const lv_8sc_t* bPtr = taps;
@ -290,23 +292,25 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_generic(lv_8sc_t* result,
char sum1[2] = {0,0}; char sum1[2] = {0,0};
unsigned int i = 0; unsigned int i = 0;
for(i = 0; i < n_2_ccomplex_blocks; ++i) { for(i = 0; i < n_2_ccomplex_blocks; ++i)
sum0[0] += in[0] * tp[0] - in[1] * tp[1]; {
sum0[1] += in[0] * tp[1] + in[1] * tp[0]; sum0[0] += in[0] * tp[0] - in[1] * tp[1];
sum1[0] += in[2] * tp[2] - in[3] * tp[3]; sum0[1] += in[0] * tp[1] + in[1] * tp[0];
sum1[1] += in[2] * tp[3] + in[3] * tp[2]; sum1[0] += in[2] * tp[2] - in[3] * tp[3];
sum1[1] += in[2] * tp[3] + in[3] * tp[2];
in += 4; in += 4;
tp += 4; tp += 4;
} }
res[0] = sum0[0] + sum1[0]; res[0] = sum0[0] + sum1[0];
res[1] = sum0[1] + sum1[1]; res[1] = sum0[1] + sum1[1];
// Cleanup if we had an odd number of points // Cleanup if we had an odd number of points
for(i = 0; i < isodd; ++i) { for(i = 0; i < isodd; ++i)
*result += input[num_points - 1] * taps[num_points - 1]; {
} *result += input[num_points - 1] * taps[num_points - 1];
}
} }
#endif /*LV_HAVE_GENERIC*/ #endif /*LV_HAVE_GENERIC*/
@ -320,8 +324,8 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_generic(lv_8sc_t* result,
\param bVector One of the vectors to be multiplied and accumulated \param bVector One of the vectors to be multiplied and accumulated
\param num_points The number of complex values in aVector and bVector to be multiplied together, accumulated and stored into cVector \param num_points The number of complex values in aVector and bVector to be multiplied together, accumulated and stored into cVector
*/ */
static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_sse2(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points) { static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_sse2(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points)
{
lv_8sc_t dotProduct; lv_8sc_t dotProduct;
memset(&dotProduct, 0x0, 2*sizeof(char)); memset(&dotProduct, 0x0, 2*sizeof(char));
@ -331,61 +335,61 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_sse2(lv_8sc_t* result, con
const unsigned int sse_iters = num_points/8; const unsigned int sse_iters = num_points/8;
if (sse_iters>0) if (sse_iters>0)
{
__m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc, realcacc, imagcacc;
mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
realcacc = _mm_setzero_si128();
imagcacc = _mm_setzero_si128();
for(unsigned int number = 0; number < sse_iters; number++){
x = _mm_load_si128((__m128i*)a);
y = _mm_load_si128((__m128i*)b);
imagx = _mm_srli_si128 (x, 1);
imagx = _mm_and_si128 (imagx, mult1);
realx = _mm_and_si128 (x, mult1);
imagy = _mm_srli_si128 (y, 1);
imagy = _mm_and_si128 (imagy, mult1);
realy = _mm_and_si128 (y, mult1);
realx_mult_realy = _mm_mullo_epi16 (realx, realy);
imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
realcacc = _mm_add_epi16 (realcacc, realc);
imagcacc = _mm_add_epi16 (imagcacc, imagc);
a += 8;
b += 8;
}
realcacc = _mm_and_si128 (realcacc, mult1);
imagcacc = _mm_and_si128 (imagcacc, mult1);
imagcacc = _mm_slli_si128 (imagcacc, 1);
totalc = _mm_or_si128 (realcacc, imagcacc);
__VOLK_ATTR_ALIGNED(16) lv_8sc_t dotProductVector[8];
_mm_store_si128((__m128i*)dotProductVector,totalc); // Store the results back into the dot product vector
for (unsigned int i = 0; i<8; ++i)
{ {
dotProduct += dotProductVector[i]; __m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc, realcacc, imagcacc;
mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
realcacc = _mm_setzero_si128();
imagcacc = _mm_setzero_si128();
for(unsigned int number = 0; number < sse_iters; number++)
{
x = _mm_load_si128((__m128i*)a);
y = _mm_load_si128((__m128i*)b);
imagx = _mm_srli_si128 (x, 1);
imagx = _mm_and_si128 (imagx, mult1);
realx = _mm_and_si128 (x, mult1);
imagy = _mm_srli_si128 (y, 1);
imagy = _mm_and_si128 (imagy, mult1);
realy = _mm_and_si128 (y, mult1);
realx_mult_realy = _mm_mullo_epi16 (realx, realy);
imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
realcacc = _mm_add_epi16 (realcacc, realc);
imagcacc = _mm_add_epi16 (imagcacc, imagc);
a += 8;
b += 8;
}
realcacc = _mm_and_si128 (realcacc, mult1);
imagcacc = _mm_and_si128 (imagcacc, mult1);
imagcacc = _mm_slli_si128 (imagcacc, 1);
totalc = _mm_or_si128 (realcacc, imagcacc);
__VOLK_ATTR_ALIGNED(16) lv_8sc_t dotProductVector[8];
_mm_store_si128((__m128i*)dotProductVector,totalc); // Store the results back into the dot product vector
for (unsigned int i = 0; i<8; ++i)
{
dotProduct += dotProductVector[i];
}
} }
}
for (unsigned int i = 0; i<(num_points % 8); ++i) for (unsigned int i = 0; i<(num_points % 8); ++i)
{ {
dotProduct += (*a++) * (*b++); dotProduct += (*a++) * (*b++);
} }
*result = dotProduct; *result = dotProduct;
} }
@ -401,8 +405,8 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_sse2(lv_8sc_t* result, con
\param bVector One of the vectors to be multiplied and accumulated \param bVector One of the vectors to be multiplied and accumulated
\param num_points The number of complex values in aVector and bVector to be multiplied together, accumulated and stored into cVector \param num_points The number of complex values in aVector and bVector to be multiplied together, accumulated and stored into cVector
*/ */
static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_sse4_1(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points) { static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_sse4_1(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points)
{
lv_8sc_t dotProduct; lv_8sc_t dotProduct;
memset(&dotProduct, 0x0, 2*sizeof(char)); memset(&dotProduct, 0x0, 2*sizeof(char));
@ -412,59 +416,59 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_sse4_1(lv_8sc_t* result, c
const unsigned int sse_iters = num_points/8; const unsigned int sse_iters = num_points/8;
if (sse_iters>0) if (sse_iters>0)
{
__m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc, realcacc, imagcacc;
mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
realcacc = _mm_setzero_si128();
imagcacc = _mm_setzero_si128();
for(unsigned int number = 0; number < sse_iters; number++){
x = _mm_load_si128((__m128i*)a);
y = _mm_load_si128((__m128i*)b);
imagx = _mm_srli_si128 (x, 1);
imagx = _mm_and_si128 (imagx, mult1);
realx = _mm_and_si128 (x, mult1);
imagy = _mm_srli_si128 (y, 1);
imagy = _mm_and_si128 (imagy, mult1);
realy = _mm_and_si128 (y, mult1);
realx_mult_realy = _mm_mullo_epi16 (realx, realy);
imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
realcacc = _mm_add_epi16 (realcacc, realc);
imagcacc = _mm_add_epi16 (imagcacc, imagc);
a += 8;
b += 8;
}
imagcacc = _mm_slli_si128 (imagcacc, 1);
totalc = _mm_blendv_epi8 (imagcacc, realcacc, mult1);
__VOLK_ATTR_ALIGNED(16) lv_8sc_t dotProductVector[8];
_mm_store_si128((__m128i*)dotProductVector,totalc); // Store the results back into the dot product vector
for (unsigned int i = 0; i<8; ++i)
{ {
dotProduct += dotProductVector[i]; __m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc, realcacc, imagcacc;
mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
realcacc = _mm_setzero_si128();
imagcacc = _mm_setzero_si128();
for(unsigned int number = 0; number < sse_iters; number++)
{
x = _mm_load_si128((__m128i*)a);
y = _mm_load_si128((__m128i*)b);
imagx = _mm_srli_si128 (x, 1);
imagx = _mm_and_si128 (imagx, mult1);
realx = _mm_and_si128 (x, mult1);
imagy = _mm_srli_si128 (y, 1);
imagy = _mm_and_si128 (imagy, mult1);
realy = _mm_and_si128 (y, mult1);
realx_mult_realy = _mm_mullo_epi16 (realx, realy);
imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
realcacc = _mm_add_epi16 (realcacc, realc);
imagcacc = _mm_add_epi16 (imagcacc, imagc);
a += 8;
b += 8;
}
imagcacc = _mm_slli_si128 (imagcacc, 1);
totalc = _mm_blendv_epi8 (imagcacc, realcacc, mult1);
__VOLK_ATTR_ALIGNED(16) lv_8sc_t dotProductVector[8];
_mm_store_si128((__m128i*)dotProductVector,totalc); // Store the results back into the dot product vector
for (unsigned int i = 0; i<8; ++i)
{
dotProduct += dotProductVector[i];
}
} }
}
for (unsigned int i = 0; i<(num_points % 8); ++i) for (unsigned int i = 0; i<(num_points % 8); ++i)
{ {
dotProduct += (*a++) * (*b++); dotProduct += (*a++) * (*b++);
} }
*result = dotProduct; *result = dotProduct;
} }
@ -480,8 +484,8 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_sse4_1(lv_8sc_t* result, c
\param num_points The number of complex values in aVector and bVector to be multiplied together, accumulated and stored into cVector \param num_points The number of complex values in aVector and bVector to be multiplied together, accumulated and stored into cVector
*/ */
extern void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_orc_impl(short* resRealShort, short* resImagShort, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points); extern void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_orc_impl(short* resRealShort, short* resImagShort, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points);
static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_orc(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points){ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_orc(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points)
{
short resReal = 0; short resReal = 0;
char* resRealChar = (char*)&resReal; char* resRealChar = (char*)&resReal;
resRealChar++; resRealChar++;

View File

@ -2,7 +2,7 @@
* \file volk_gnsssdr_8ic_x2_multiply_8ic.h * \file volk_gnsssdr_8ic_x2_multiply_8ic.h
* \brief Volk protokernel: multiplies two 16 bits vectors * \brief Volk protokernel: multiplies two 16 bits vectors
* \authors <ul> * \authors <ul>
* <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com * <li> Andres Cecilia, 2014. a.cecilia.luque(at)gmail.com
* </ul> * </ul>
* *
* Volk protokernel that multiplies two 16 bits vectors (8 bits the real part * Volk protokernel that multiplies two 16 bits vectors (8 bits the real part
@ -20,7 +20,7 @@
* GNSS-SDR is free software: you can redistribute it and/or modify * GNSS-SDR is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by * it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or * the Free Software Foundation, either version 3 of the License, or
* at your option) any later version. * (at your option) any later version.
* *
* GNSS-SDR is distributed in the hope that it will be useful, * GNSS-SDR is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of * but WITHOUT ANY WARRANTY; without even the implied warranty of
@ -49,8 +49,8 @@
\param bVector One of the vectors to be multiplied \param bVector One of the vectors to be multiplied
\param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
*/ */
static inline void volk_gnsssdr_8ic_x2_multiply_8ic_u_sse2(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ static inline void volk_gnsssdr_8ic_x2_multiply_8ic_u_sse2(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points)
{
const unsigned int sse_iters = num_points / 8; const unsigned int sse_iters = num_points / 8;
__m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc; __m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc;
@ -60,43 +60,43 @@ static inline void volk_gnsssdr_8ic_x2_multiply_8ic_u_sse2(lv_8sc_t* cVector, co
mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
for(unsigned int number = 0;number < sse_iters; number++){ for(unsigned int number = 0;number < sse_iters; number++)
{
x = _mm_loadu_si128((__m128i*)a);
y = _mm_loadu_si128((__m128i*)b);
x = _mm_loadu_si128((__m128i*)a); imagx = _mm_srli_si128 (x, 1);
y = _mm_loadu_si128((__m128i*)b); imagx = _mm_and_si128 (imagx, mult1);
realx = _mm_and_si128 (x, mult1);
imagx = _mm_srli_si128 (x, 1); imagy = _mm_srli_si128 (y, 1);
imagx = _mm_and_si128 (imagx, mult1); imagy = _mm_and_si128 (imagy, mult1);
realx = _mm_and_si128 (x, mult1); realy = _mm_and_si128 (y, mult1);
imagy = _mm_srli_si128 (y, 1); realx_mult_realy = _mm_mullo_epi16 (realx, realy);
imagy = _mm_and_si128 (imagy, mult1); imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
realy = _mm_and_si128 (y, mult1); realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
realx_mult_realy = _mm_mullo_epi16 (realx, realy); realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); realc = _mm_and_si128 (realc, mult1);
realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); imagc = _mm_and_si128 (imagc, mult1);
imagc = _mm_slli_si128 (imagc, 1);
realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); totalc = _mm_or_si128 (realc, imagc);
realc = _mm_and_si128 (realc, mult1);
imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
imagc = _mm_and_si128 (imagc, mult1);
imagc = _mm_slli_si128 (imagc, 1);
totalc = _mm_or_si128 (realc, imagc); _mm_storeu_si128((__m128i*)c, totalc);
_mm_storeu_si128((__m128i*)c, totalc); a += 8;
b += 8;
a += 8; c += 8;
b += 8; }
c += 8;
}
for (unsigned int i = 0; i<(num_points % 8); ++i) for (unsigned int i = 0; i<(num_points % 8); ++i)
{ {
*c++ = (*a++) * (*b++); *c++ = (*a++) * (*b++);
} }
} }
#endif /* LV_HAVE_SSE2 */ #endif /* LV_HAVE_SSE2 */
@ -109,8 +109,8 @@ static inline void volk_gnsssdr_8ic_x2_multiply_8ic_u_sse2(lv_8sc_t* cVector, co
\param bVector One of the vectors to be multiplied \param bVector One of the vectors to be multiplied
\param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
*/ */
static inline void volk_gnsssdr_8ic_x2_multiply_8ic_u_sse4_1(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ static inline void volk_gnsssdr_8ic_x2_multiply_8ic_u_sse4_1(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points)
{
const unsigned int sse_iters = num_points / 8; const unsigned int sse_iters = num_points / 8;
__m128i x, y, zero; __m128i x, y, zero;
@ -122,41 +122,41 @@ static inline void volk_gnsssdr_8ic_x2_multiply_8ic_u_sse4_1(lv_8sc_t* cVector,
zero = _mm_setzero_si128(); zero = _mm_setzero_si128();
mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
for(unsigned int number = 0;number < sse_iters; number++){ for(unsigned int number = 0;number < sse_iters; number++)
{
x = _mm_lddqu_si128((__m128i*)a);
y = _mm_lddqu_si128((__m128i*)b);
x = _mm_lddqu_si128((__m128i*)a); imagx = _mm_srli_si128 (x, 1);
y = _mm_lddqu_si128((__m128i*)b); imagx = _mm_and_si128 (imagx, mult1);
realx = _mm_and_si128 (x, mult1);
imagx = _mm_srli_si128 (x, 1); imagy = _mm_srli_si128 (y, 1);
imagx = _mm_and_si128 (imagx, mult1); imagy = _mm_and_si128 (imagy, mult1);
realx = _mm_and_si128 (x, mult1); realy = _mm_and_si128 (y, mult1);
imagy = _mm_srli_si128 (y, 1); realx_mult_realy = _mm_mullo_epi16 (realx, realy);
imagy = _mm_and_si128 (imagy, mult1); imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
realy = _mm_and_si128 (y, mult1); realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
realx_mult_realy = _mm_mullo_epi16 (realx, realy); realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); imagc = _mm_slli_si128 (imagc, 1);
imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); totalc = _mm_blendv_epi8 (imagc, realc, mult1);
imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
imagc = _mm_slli_si128 (imagc, 1);
totalc = _mm_blendv_epi8 (imagc, realc, mult1); _mm_storeu_si128((__m128i*)c, totalc);
_mm_storeu_si128((__m128i*)c, totalc); a += 8;
b += 8;
a += 8; c += 8;
b += 8; }
c += 8;
}
for (unsigned int i = 0; i<(num_points % 8); ++i) for (unsigned int i = 0; i<(num_points % 8); ++i)
{ {
*c++ = (*a++) * (*b++); *c++ = (*a++) * (*b++);
} }
} }
#endif /* LV_HAVE_SSE4_1 */ #endif /* LV_HAVE_SSE4_1 */
@ -168,14 +168,16 @@ static inline void volk_gnsssdr_8ic_x2_multiply_8ic_u_sse4_1(lv_8sc_t* cVector,
\param bVector One of the vectors to be multiplied \param bVector One of the vectors to be multiplied
\param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
*/ */
static inline void volk_gnsssdr_8ic_x2_multiply_8ic_generic(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ static inline void volk_gnsssdr_8ic_x2_multiply_8ic_generic(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points)
{
lv_8sc_t* cPtr = cVector; lv_8sc_t* cPtr = cVector;
const lv_8sc_t* aPtr = aVector; const lv_8sc_t* aPtr = aVector;
const lv_8sc_t* bPtr = bVector; const lv_8sc_t* bPtr = bVector;
for(unsigned int number = 0; number < num_points; number++){ for(unsigned int number = 0; number < num_points; number++)
*cPtr++ = (*aPtr++) * (*bPtr++); {
} *cPtr++ = (*aPtr++) * (*bPtr++);
}
} }
#endif /* LV_HAVE_GENERIC */ #endif /* LV_HAVE_GENERIC */
@ -198,8 +200,8 @@ static inline void volk_gnsssdr_8ic_x2_multiply_8ic_generic(lv_8sc_t* cVector, c
\param bVector One of the vectors to be multiplied \param bVector One of the vectors to be multiplied
\param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
*/ */
static inline void volk_gnsssdr_8ic_x2_multiply_8ic_a_sse2(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ static inline void volk_gnsssdr_8ic_x2_multiply_8ic_a_sse2(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points)
{
const unsigned int sse_iters = num_points / 8; const unsigned int sse_iters = num_points / 8;
__m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc; __m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc;
@ -209,43 +211,43 @@ static inline void volk_gnsssdr_8ic_x2_multiply_8ic_a_sse2(lv_8sc_t* cVector, co
mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
for(unsigned int number = 0;number < sse_iters; number++){ for(unsigned int number = 0;number < sse_iters; number++)
{
x = _mm_load_si128((__m128i*)a);
y = _mm_load_si128((__m128i*)b);
x = _mm_load_si128((__m128i*)a); imagx = _mm_srli_si128 (x, 1);
y = _mm_load_si128((__m128i*)b); imagx = _mm_and_si128 (imagx, mult1);
realx = _mm_and_si128 (x, mult1);
imagx = _mm_srli_si128 (x, 1); imagy = _mm_srli_si128 (y, 1);
imagx = _mm_and_si128 (imagx, mult1); imagy = _mm_and_si128 (imagy, mult1);
realx = _mm_and_si128 (x, mult1); realy = _mm_and_si128 (y, mult1);
imagy = _mm_srli_si128 (y, 1); realx_mult_realy = _mm_mullo_epi16 (realx, realy);
imagy = _mm_and_si128 (imagy, mult1); imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
realy = _mm_and_si128 (y, mult1); realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
realx_mult_realy = _mm_mullo_epi16 (realx, realy); realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); realc = _mm_and_si128 (realc, mult1);
realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); imagc = _mm_and_si128 (imagc, mult1);
imagc = _mm_slli_si128 (imagc, 1);
realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); totalc = _mm_or_si128 (realc, imagc);
realc = _mm_and_si128 (realc, mult1);
imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
imagc = _mm_and_si128 (imagc, mult1);
imagc = _mm_slli_si128 (imagc, 1);
totalc = _mm_or_si128 (realc, imagc); _mm_store_si128((__m128i*)c, totalc);
_mm_store_si128((__m128i*)c, totalc); a += 8;
b += 8;
a += 8; c += 8;
b += 8; }
c += 8;
}
for (unsigned int i = 0; i<(num_points % 8); ++i) for (unsigned int i = 0; i<(num_points % 8); ++i)
{ {
*c++ = (*a++) * (*b++); *c++ = (*a++) * (*b++);
} }
} }
#endif /* LV_HAVE_SSE2 */ #endif /* LV_HAVE_SSE2 */
@ -258,8 +260,8 @@ static inline void volk_gnsssdr_8ic_x2_multiply_8ic_a_sse2(lv_8sc_t* cVector, co
\param bVector One of the vectors to be multiplied \param bVector One of the vectors to be multiplied
\param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
*/ */
static inline void volk_gnsssdr_8ic_x2_multiply_8ic_a_sse4_1(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ static inline void volk_gnsssdr_8ic_x2_multiply_8ic_a_sse4_1(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points)
{
const unsigned int sse_iters = num_points / 8; const unsigned int sse_iters = num_points / 8;
__m128i x, y, zero; __m128i x, y, zero;
@ -271,41 +273,41 @@ static inline void volk_gnsssdr_8ic_x2_multiply_8ic_a_sse4_1(lv_8sc_t* cVector,
zero = _mm_setzero_si128(); zero = _mm_setzero_si128();
mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
for(unsigned int number = 0;number < sse_iters; number++){ for(unsigned int number = 0;number < sse_iters; number++)
{
x = _mm_load_si128((__m128i*)a);
y = _mm_load_si128((__m128i*)b);
x = _mm_load_si128((__m128i*)a); imagx = _mm_srli_si128 (x, 1);
y = _mm_load_si128((__m128i*)b); imagx = _mm_and_si128 (imagx, mult1);
realx = _mm_and_si128 (x, mult1);
imagx = _mm_srli_si128 (x, 1); imagy = _mm_srli_si128 (y, 1);
imagx = _mm_and_si128 (imagx, mult1); imagy = _mm_and_si128 (imagy, mult1);
realx = _mm_and_si128 (x, mult1); realy = _mm_and_si128 (y, mult1);
imagy = _mm_srli_si128 (y, 1); realx_mult_realy = _mm_mullo_epi16 (realx, realy);
imagy = _mm_and_si128 (imagy, mult1); imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
realy = _mm_and_si128 (y, mult1); realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
realx_mult_realy = _mm_mullo_epi16 (realx, realy); realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); imagc = _mm_slli_si128 (imagc, 1);
imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); totalc = _mm_blendv_epi8 (imagc, realc, mult1);
imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
imagc = _mm_slli_si128 (imagc, 1);
totalc = _mm_blendv_epi8 (imagc, realc, mult1); _mm_store_si128((__m128i*)c, totalc);
_mm_store_si128((__m128i*)c, totalc); a += 8;
b += 8;
a += 8; c += 8;
b += 8; }
c += 8;
}
for (unsigned int i = 0; i<(num_points % 8); ++i) for (unsigned int i = 0; i<(num_points % 8); ++i)
{ {
*c++ = (*a++) * (*b++); *c++ = (*a++) * (*b++);
} }
} }
#endif /* LV_HAVE_SSE4_1 */ #endif /* LV_HAVE_SSE4_1 */
@ -317,14 +319,16 @@ static inline void volk_gnsssdr_8ic_x2_multiply_8ic_a_sse4_1(lv_8sc_t* cVector,
\param bVector One of the vectors to be multiplied \param bVector One of the vectors to be multiplied
\param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
*/ */
static inline void volk_gnsssdr_8ic_x2_multiply_8ic_a_generic(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ static inline void volk_gnsssdr_8ic_x2_multiply_8ic_a_generic(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points)
{
lv_8sc_t* cPtr = cVector; lv_8sc_t* cPtr = cVector;
const lv_8sc_t* aPtr = aVector; const lv_8sc_t* aPtr = aVector;
const lv_8sc_t* bPtr = bVector; const lv_8sc_t* bPtr = bVector;
for(unsigned int number = 0; number < num_points; number++){ for(unsigned int number = 0; number < num_points; number++)
*cPtr++ = (*aPtr++) * (*bPtr++); {
} *cPtr++ = (*aPtr++) * (*bPtr++);
}
} }
#endif /* LV_HAVE_GENERIC */ #endif /* LV_HAVE_GENERIC */
@ -338,7 +342,8 @@ static inline void volk_gnsssdr_8ic_x2_multiply_8ic_a_generic(lv_8sc_t* cVector,
\param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
*/ */
extern void volk_gnsssdr_8ic_x2_multiply_8ic_a_orc_impl(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points); extern void volk_gnsssdr_8ic_x2_multiply_8ic_a_orc_impl(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points);
static inline void volk_gnsssdr_8ic_x2_multiply_8ic_u_orc(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ static inline void volk_gnsssdr_8ic_x2_multiply_8ic_u_orc(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points)
{
volk_gnsssdr_8ic_x2_multiply_8ic_a_orc_impl(cVector, aVector, bVector, num_points); volk_gnsssdr_8ic_x2_multiply_8ic_a_orc_impl(cVector, aVector, bVector, num_points);
} }
#endif /* LV_HAVE_ORC */ #endif /* LV_HAVE_ORC */

View File

@ -2,7 +2,7 @@
* \file volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3.h * \file volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3.h
* \brief Volk protokernel: performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation with 16 bits vectors, and accumulates the results into float32. * \brief Volk protokernel: performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation with 16 bits vectors, and accumulates the results into float32.
* \authors <ul> * \authors <ul>
* <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com * <li> Andres Cecilia, 2014. a.cecilia.luque(at)gmail.com
* </ul> * </ul>
* *
* Volk protokernel that performs the carrier wipe-off mixing and the * Volk protokernel that performs the carrier wipe-off mixing and the
@ -31,7 +31,7 @@
* GNSS-SDR is free software: you can redistribute it and/or modify * GNSS-SDR is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by * it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or * the Free Software Foundation, either version 3 of the License, or
* at your option) any later version. * (at your option) any later version.
* *
* GNSS-SDR is distributed in the hope that it will be useful, * GNSS-SDR is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of * but WITHOUT ANY WARRANTY; without even the implied warranty of
@ -57,6 +57,7 @@
#include <smmintrin.h> #include <smmintrin.h>
#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h" #include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h"
#include "CommonMacros/CommonMacros.h" #include "CommonMacros/CommonMacros.h"
/*! /*!
\brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation
\param input The input signal input \param input The input signal input
@ -101,72 +102,72 @@ static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_u_sse4_1(lv_32fc_t* E
P_code_acc = _mm_setzero_ps(); P_code_acc = _mm_setzero_ps();
if (sse_iters>0) if (sse_iters>0)
{
for(int number = 0;number < sse_iters; number++){
//Perform the carrier wipe-off
x = _mm_lddqu_si128((__m128i*)input_ptr);
y = _mm_lddqu_si128((__m128i*)carrier_ptr);
CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx)
CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy)
CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
//Get early values
y = _mm_lddqu_si128((__m128i*)E_code_ptr);
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
E_code_acc = _mm_add_ps (E_code_acc, output_ps);
//Get prompt values
y = _mm_lddqu_si128((__m128i*)P_code_ptr);
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
P_code_acc = _mm_add_ps (P_code_acc, output_ps);
//Get late values
y = _mm_lddqu_si128((__m128i*)L_code_ptr);
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
L_code_acc = _mm_add_ps (L_code_acc, output_ps);
input_ptr += 8;
carrier_ptr += 8;
E_code_ptr += 8;
P_code_ptr += 8;
L_code_ptr += 8;
}
__VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2];
__VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2];
__VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2];
_mm_storeu_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector
for (int i = 0; i<2; ++i)
{ {
*E_out_ptr += E_dotProductVector[i]; for(int number = 0;number < sse_iters; number++)
*P_out_ptr += P_dotProductVector[i]; {
*L_out_ptr += L_dotProductVector[i]; //Perform the carrier wipe-off
x = _mm_lddqu_si128((__m128i*)input_ptr);
y = _mm_lddqu_si128((__m128i*)carrier_ptr);
CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx)
CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy)
CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
//Get early values
y = _mm_lddqu_si128((__m128i*)E_code_ptr);
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
E_code_acc = _mm_add_ps (E_code_acc, output_ps);
//Get prompt values
y = _mm_lddqu_si128((__m128i*)P_code_ptr);
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
P_code_acc = _mm_add_ps (P_code_acc, output_ps);
//Get late values
y = _mm_lddqu_si128((__m128i*)L_code_ptr);
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
L_code_acc = _mm_add_ps (L_code_acc, output_ps);
input_ptr += 8;
carrier_ptr += 8;
E_code_ptr += 8;
P_code_ptr += 8;
L_code_ptr += 8;
}
__VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2];
__VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2];
__VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2];
_mm_storeu_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector
for (int i = 0; i<2; ++i)
{
*E_out_ptr += E_dotProductVector[i];
*P_out_ptr += P_dotProductVector[i];
*L_out_ptr += L_dotProductVector[i];
}
} }
}
lv_8sc_t bb_signal_sample; lv_8sc_t bb_signal_sample;
for(int i=0; i < num_points%8; ++i) for(int i=0; i < num_points%8; ++i)
{ {
//Perform the carrier wipe-off //Perform the carrier wipe-off
bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
// Now get early, late, and prompt values for each // Now get early, late, and prompt values for each
*E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
*P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
*L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
} }
} }
#endif /* LV_HAVE_SSE4_1 */ #endif /* LV_HAVE_SSE4_1 */
@ -218,75 +219,75 @@ static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_u_sse2(lv_32fc_t* E_o
P_code_acc = _mm_setzero_ps(); P_code_acc = _mm_setzero_ps();
if (sse_iters>0) if (sse_iters>0)
{
for(unsigned int number = 0;number < sse_iters; number++){
//Perform the carrier wipe-off
x = _mm_loadu_si128((__m128i*)input_ptr);
y = _mm_loadu_si128((__m128i*)carrier_ptr);
CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx)
CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy)
CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
//Get early values
y = _mm_loadu_si128((__m128i*)E_code_ptr);
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
E_code_acc = _mm_add_ps (E_code_acc, output_ps_1);
E_code_acc = _mm_add_ps (E_code_acc, output_ps_2);
//Get prompt values
y = _mm_loadu_si128((__m128i*)P_code_ptr);
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
P_code_acc = _mm_add_ps (P_code_acc, output_ps_1);
P_code_acc = _mm_add_ps (P_code_acc, output_ps_2);
//Get late values
y = _mm_loadu_si128((__m128i*)L_code_ptr);
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
L_code_acc = _mm_add_ps (L_code_acc, output_ps_1);
L_code_acc = _mm_add_ps (L_code_acc, output_ps_2);
input_ptr += 8;
carrier_ptr += 8;
E_code_ptr += 8;
P_code_ptr += 8;
L_code_ptr += 8;
}
__VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2];
__VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2];
__VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2];
_mm_storeu_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector
for (unsigned int i = 0; i<2; ++i)
{ {
*E_out_ptr += E_dotProductVector[i]; for(unsigned int number = 0;number < sse_iters; number++)
*P_out_ptr += P_dotProductVector[i]; {
*L_out_ptr += L_dotProductVector[i]; //Perform the carrier wipe-off
x = _mm_loadu_si128((__m128i*)input_ptr);
y = _mm_loadu_si128((__m128i*)carrier_ptr);
CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx)
CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy)
CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
//Get early values
y = _mm_loadu_si128((__m128i*)E_code_ptr);
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
E_code_acc = _mm_add_ps (E_code_acc, output_ps_1);
E_code_acc = _mm_add_ps (E_code_acc, output_ps_2);
//Get prompt values
y = _mm_loadu_si128((__m128i*)P_code_ptr);
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
P_code_acc = _mm_add_ps (P_code_acc, output_ps_1);
P_code_acc = _mm_add_ps (P_code_acc, output_ps_2);
//Get late values
y = _mm_loadu_si128((__m128i*)L_code_ptr);
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
L_code_acc = _mm_add_ps (L_code_acc, output_ps_1);
L_code_acc = _mm_add_ps (L_code_acc, output_ps_2);
input_ptr += 8;
carrier_ptr += 8;
E_code_ptr += 8;
P_code_ptr += 8;
L_code_ptr += 8;
}
__VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2];
__VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2];
__VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2];
_mm_storeu_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector
for (unsigned int i = 0; i<2; ++i)
{
*E_out_ptr += E_dotProductVector[i];
*P_out_ptr += P_dotProductVector[i];
*L_out_ptr += L_dotProductVector[i];
}
} }
}
lv_8sc_t bb_signal_sample; lv_8sc_t bb_signal_sample;
for(unsigned int i=0; i < num_points%8; ++i) for(unsigned int i=0; i < num_points%8; ++i)
{ {
//Perform the carrier wipe-off //Perform the carrier wipe-off
bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
// Now get early, late, and prompt values for each // Now get early, late, and prompt values for each
*E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
*P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
*L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
} }
} }
#endif /* LV_HAVE_SSE2 */ #endif /* LV_HAVE_SSE2 */
@ -314,14 +315,14 @@ static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_generic(lv_32fc_t* E_
*L_out = 0; *L_out = 0;
// perform Early, Prompt and Late correlation // perform Early, Prompt and Late correlation
for(unsigned int i=0; i < num_points; ++i) for(unsigned int i=0; i < num_points; ++i)
{ {
//Perform the carrier wipe-off //Perform the carrier wipe-off
bb_signal_sample = input[i] * carrier[i]; bb_signal_sample = input[i] * carrier[i];
// Now get early, late, and prompt values for each // Now get early, late, and prompt values for each
*E_out += (lv_32fc_t) (bb_signal_sample * E_code[i]); *E_out += (lv_32fc_t) (bb_signal_sample * E_code[i]);
*P_out += (lv_32fc_t) (bb_signal_sample * P_code[i]); *P_out += (lv_32fc_t) (bb_signal_sample * P_code[i]);
*L_out += (lv_32fc_t) (bb_signal_sample * L_code[i]); *L_out += (lv_32fc_t) (bb_signal_sample * L_code[i]);
} }
} }
#endif /* LV_HAVE_GENERIC */ #endif /* LV_HAVE_GENERIC */
@ -386,72 +387,72 @@ static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_a_sse4_1(lv_32fc_t* E
P_code_acc = _mm_setzero_ps(); P_code_acc = _mm_setzero_ps();
if (sse_iters>0) if (sse_iters>0)
{
for(int number = 0;number < sse_iters; number++){
//Perform the carrier wipe-off
x = _mm_load_si128((__m128i*)input_ptr);
y = _mm_load_si128((__m128i*)carrier_ptr);
CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx)
CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy)
CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
//Get early values
y = _mm_load_si128((__m128i*)E_code_ptr);
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
E_code_acc = _mm_add_ps (E_code_acc, output_ps);
//Get prompt values
y = _mm_load_si128((__m128i*)P_code_ptr);
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
P_code_acc = _mm_add_ps (P_code_acc, output_ps);
//Get late values
y = _mm_load_si128((__m128i*)L_code_ptr);
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
L_code_acc = _mm_add_ps (L_code_acc, output_ps);
input_ptr += 8;
carrier_ptr += 8;
E_code_ptr += 8;
P_code_ptr += 8;
L_code_ptr += 8;
}
__VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2];
__VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2];
__VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2];
_mm_store_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector
for (int i = 0; i<2; ++i)
{ {
*E_out_ptr += E_dotProductVector[i]; for(int number = 0;number < sse_iters; number++)
*P_out_ptr += P_dotProductVector[i]; {
*L_out_ptr += L_dotProductVector[i]; //Perform the carrier wipe-off
x = _mm_load_si128((__m128i*)input_ptr);
y = _mm_load_si128((__m128i*)carrier_ptr);
CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx)
CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy)
CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
//Get early values
y = _mm_load_si128((__m128i*)E_code_ptr);
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
E_code_acc = _mm_add_ps (E_code_acc, output_ps);
//Get prompt values
y = _mm_load_si128((__m128i*)P_code_ptr);
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
P_code_acc = _mm_add_ps (P_code_acc, output_ps);
//Get late values
y = _mm_load_si128((__m128i*)L_code_ptr);
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
L_code_acc = _mm_add_ps (L_code_acc, output_ps);
input_ptr += 8;
carrier_ptr += 8;
E_code_ptr += 8;
P_code_ptr += 8;
L_code_ptr += 8;
}
__VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2];
__VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2];
__VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2];
_mm_store_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector
for (int i = 0; i<2; ++i)
{
*E_out_ptr += E_dotProductVector[i];
*P_out_ptr += P_dotProductVector[i];
*L_out_ptr += L_dotProductVector[i];
}
} }
}
lv_8sc_t bb_signal_sample; lv_8sc_t bb_signal_sample;
for(int i=0; i < num_points%8; ++i) for(int i=0; i < num_points%8; ++i)
{ {
//Perform the carrier wipe-off //Perform the carrier wipe-off
bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
// Now get early, late, and prompt values for each // Now get early, late, and prompt values for each
*E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
*P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
*L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
} }
} }
#endif /* LV_HAVE_SSE4_1 */ #endif /* LV_HAVE_SSE4_1 */
@ -503,75 +504,75 @@ static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_a_sse2(lv_32fc_t* E_o
P_code_acc = _mm_setzero_ps(); P_code_acc = _mm_setzero_ps();
if (sse_iters>0) if (sse_iters>0)
{
for(unsigned int number = 0;number < sse_iters; number++){
//Perform the carrier wipe-off
x = _mm_load_si128((__m128i*)input_ptr);
y = _mm_load_si128((__m128i*)carrier_ptr);
CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx)
CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy)
CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
//Get early values
y = _mm_load_si128((__m128i*)E_code_ptr);
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
E_code_acc = _mm_add_ps (E_code_acc, output_ps_1);
E_code_acc = _mm_add_ps (E_code_acc, output_ps_2);
//Get prompt values
y = _mm_load_si128((__m128i*)P_code_ptr);
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
P_code_acc = _mm_add_ps (P_code_acc, output_ps_1);
P_code_acc = _mm_add_ps (P_code_acc, output_ps_2);
//Get late values
y = _mm_load_si128((__m128i*)L_code_ptr);
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
L_code_acc = _mm_add_ps (L_code_acc, output_ps_1);
L_code_acc = _mm_add_ps (L_code_acc, output_ps_2);
input_ptr += 8;
carrier_ptr += 8;
E_code_ptr += 8;
P_code_ptr += 8;
L_code_ptr += 8;
}
__VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2];
__VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2];
__VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2];
_mm_store_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector
for (unsigned int i = 0; i<2; ++i)
{ {
*E_out_ptr += E_dotProductVector[i]; for(unsigned int number = 0;number < sse_iters; number++)
*P_out_ptr += P_dotProductVector[i]; {
*L_out_ptr += L_dotProductVector[i]; //Perform the carrier wipe-off
x = _mm_load_si128((__m128i*)input_ptr);
y = _mm_load_si128((__m128i*)carrier_ptr);
CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx)
CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy)
CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
//Get early values
y = _mm_load_si128((__m128i*)E_code_ptr);
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
E_code_acc = _mm_add_ps (E_code_acc, output_ps_1);
E_code_acc = _mm_add_ps (E_code_acc, output_ps_2);
//Get prompt values
y = _mm_load_si128((__m128i*)P_code_ptr);
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
P_code_acc = _mm_add_ps (P_code_acc, output_ps_1);
P_code_acc = _mm_add_ps (P_code_acc, output_ps_2);
//Get late values
y = _mm_load_si128((__m128i*)L_code_ptr);
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
L_code_acc = _mm_add_ps (L_code_acc, output_ps_1);
L_code_acc = _mm_add_ps (L_code_acc, output_ps_2);
input_ptr += 8;
carrier_ptr += 8;
E_code_ptr += 8;
P_code_ptr += 8;
L_code_ptr += 8;
}
__VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2];
__VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2];
__VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2];
_mm_store_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector
for (unsigned int i = 0; i<2; ++i)
{
*E_out_ptr += E_dotProductVector[i];
*P_out_ptr += P_dotProductVector[i];
*L_out_ptr += L_dotProductVector[i];
}
} }
}
lv_8sc_t bb_signal_sample; lv_8sc_t bb_signal_sample;
for(unsigned int i=0; i < num_points%8; ++i) for(unsigned int i=0; i < num_points%8; ++i)
{ {
//Perform the carrier wipe-off //Perform the carrier wipe-off
bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
// Now get early, late, and prompt values for each // Now get early, late, and prompt values for each
*E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
*P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
*L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
} }
} }
#endif /* LV_HAVE_SSE2 */ #endif /* LV_HAVE_SSE2 */
@ -599,14 +600,14 @@ static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_a_generic(lv_32fc_t*
*L_out = 0; *L_out = 0;
// perform Early, Prompt and Late correlation // perform Early, Prompt and Late correlation
for(unsigned int i=0; i < num_points; ++i) for(unsigned int i=0; i < num_points; ++i)
{ {
//Perform the carrier wipe-off //Perform the carrier wipe-off
bb_signal_sample = input[i] * carrier[i]; bb_signal_sample = input[i] * carrier[i];
// Now get early, late, and prompt values for each // Now get early, late, and prompt values for each
*E_out += (lv_32fc_t) (bb_signal_sample * E_code[i]); *E_out += (lv_32fc_t) (bb_signal_sample * E_code[i]);
*P_out += (lv_32fc_t) (bb_signal_sample * P_code[i]); *P_out += (lv_32fc_t) (bb_signal_sample * P_code[i]);
*L_out += (lv_32fc_t) (bb_signal_sample * L_code[i]); *L_out += (lv_32fc_t) (bb_signal_sample * L_code[i]);
} }
} }
#endif /* LV_HAVE_GENERIC */ #endif /* LV_HAVE_GENERIC */

View File

@ -2,7 +2,7 @@
* \file volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5.h * \file volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5.h
* \brief Volk protokernel: performs the carrier wipe-off mixing and the Very early, Early, Prompt, Late and very late correlation with 16 bits vectors, and accumulates the results into float32. In order to avoid overflow, If input, carrier and XX_code have the same number of bits, they must be values between 3 and 3 (2 bits). * \brief Volk protokernel: performs the carrier wipe-off mixing and the Very early, Early, Prompt, Late and very late correlation with 16 bits vectors, and accumulates the results into float32. In order to avoid overflow, If input, carrier and XX_code have the same number of bits, they must be values between 3 and 3 (2 bits).
* \authors <ul> * \authors <ul>
* <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com * <li> Andres Cecilia, 2014. a.cecilia.luque(at)gmail.com
* </ul> * </ul>
* *
* Volk protokernel that performs the carrier wipe-off mixing and the * Volk protokernel that performs the carrier wipe-off mixing and the
@ -50,7 +50,7 @@
* GNSS-SDR is free software: you can redistribute it and/or modify * GNSS-SDR is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by * it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or * the Free Software Foundation, either version 3 of the License, or
* at your option) any later version. * (at your option) any later version.
* *
* GNSS-SDR is distributed in the hope that it will be useful, * GNSS-SDR is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of * but WITHOUT ANY WARRANTY; without even the implied warranty of
@ -132,96 +132,96 @@ static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_u_sse4_1(lv_32fc_t*
VL_code_acc = _mm_setzero_ps(); VL_code_acc = _mm_setzero_ps();
if (sse_iters>0) if (sse_iters>0)
{
for(int number = 0;number < sse_iters; number++){
//Perform the carrier wipe-off
x = _mm_lddqu_si128((__m128i*)input_ptr);
y = _mm_lddqu_si128((__m128i*)carrier_ptr);
CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx)
CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy)
CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
//Get very early values
y = _mm_lddqu_si128((__m128i*)VE_code_ptr);
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
VE_code_acc = _mm_add_ps (VE_code_acc, output_ps);
//Get early values
y = _mm_lddqu_si128((__m128i*)E_code_ptr);
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
E_code_acc = _mm_add_ps (E_code_acc, output_ps);
//Get prompt values
y = _mm_lddqu_si128((__m128i*)P_code_ptr);
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
P_code_acc = _mm_add_ps (P_code_acc, output_ps);
//Get late values
y = _mm_lddqu_si128((__m128i*)L_code_ptr);
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
L_code_acc = _mm_add_ps (L_code_acc, output_ps);
//Get very late values
y = _mm_lddqu_si128((__m128i*)VL_code_ptr);
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
VL_code_acc = _mm_add_ps (VL_code_acc, output_ps);
input_ptr += 8;
carrier_ptr += 8;
VE_code_ptr += 8;
E_code_ptr += 8;
P_code_ptr += 8;
L_code_ptr += 8;
VL_code_ptr += 8;
}
__VOLK_ATTR_ALIGNED(16) lv_32fc_t VE_dotProductVector[2];
__VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2];
__VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2];
__VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2];
__VOLK_ATTR_ALIGNED(16) lv_32fc_t VL_dotProductVector[2];
_mm_storeu_ps((float*)VE_dotProductVector,VE_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)VL_dotProductVector,VL_code_acc); // Store the results back into the dot product vector
for (int i = 0; i<2; ++i)
{ {
*VE_out_ptr += VE_dotProductVector[i]; for(int number = 0;number < sse_iters; number++)
*E_out_ptr += E_dotProductVector[i]; {
*P_out_ptr += P_dotProductVector[i]; //Perform the carrier wipe-off
*L_out_ptr += L_dotProductVector[i]; x = _mm_lddqu_si128((__m128i*)input_ptr);
*VL_out_ptr += VL_dotProductVector[i]; y = _mm_lddqu_si128((__m128i*)carrier_ptr);
CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx)
CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy)
CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
//Get very early values
y = _mm_lddqu_si128((__m128i*)VE_code_ptr);
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
VE_code_acc = _mm_add_ps (VE_code_acc, output_ps);
//Get early values
y = _mm_lddqu_si128((__m128i*)E_code_ptr);
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
E_code_acc = _mm_add_ps (E_code_acc, output_ps);
//Get prompt values
y = _mm_lddqu_si128((__m128i*)P_code_ptr);
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
P_code_acc = _mm_add_ps (P_code_acc, output_ps);
//Get late values
y = _mm_lddqu_si128((__m128i*)L_code_ptr);
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
L_code_acc = _mm_add_ps (L_code_acc, output_ps);
//Get very late values
y = _mm_lddqu_si128((__m128i*)VL_code_ptr);
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
VL_code_acc = _mm_add_ps (VL_code_acc, output_ps);
input_ptr += 8;
carrier_ptr += 8;
VE_code_ptr += 8;
E_code_ptr += 8;
P_code_ptr += 8;
L_code_ptr += 8;
VL_code_ptr += 8;
}
__VOLK_ATTR_ALIGNED(16) lv_32fc_t VE_dotProductVector[2];
__VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2];
__VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2];
__VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2];
__VOLK_ATTR_ALIGNED(16) lv_32fc_t VL_dotProductVector[2];
_mm_storeu_ps((float*)VE_dotProductVector,VE_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)VL_dotProductVector,VL_code_acc); // Store the results back into the dot product vector
for (int i = 0; i<2; ++i)
{
*VE_out_ptr += VE_dotProductVector[i];
*E_out_ptr += E_dotProductVector[i];
*P_out_ptr += P_dotProductVector[i];
*L_out_ptr += L_dotProductVector[i];
*VL_out_ptr += VL_dotProductVector[i];
}
} }
}
lv_8sc_t bb_signal_sample; lv_8sc_t bb_signal_sample;
for(int i=0; i < num_points%8; ++i) for(int i=0; i < num_points%8; ++i)
{ {
//Perform the carrier wipe-off //Perform the carrier wipe-off
bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
// Now get very early, early, prompt, late and very late values for each // Now get very early, early, prompt, late and very late values for each
*VE_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VE_code_ptr++)); *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VE_code_ptr++));
*E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
*P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
*L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
*VL_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VL_code_ptr++)); *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VL_code_ptr++));
} }
} }
#endif /* LV_HAVE_SSE4_1 */ #endif /* LV_HAVE_SSE4_1 */
@ -285,101 +285,101 @@ static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_u_sse2(lv_32fc_t* VE
VL_code_acc = _mm_setzero_ps(); VL_code_acc = _mm_setzero_ps();
if (sse_iters>0) if (sse_iters>0)
{
for(unsigned int number = 0;number < sse_iters; number++){
//Perform the carrier wipe-off
x = _mm_loadu_si128((__m128i*)input_ptr);
y = _mm_loadu_si128((__m128i*)carrier_ptr);
CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx)
CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy)
CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
//Get very early values
y = _mm_loadu_si128((__m128i*)VE_code_ptr);
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
VE_code_acc = _mm_add_ps (VE_code_acc, output_ps_1);
VE_code_acc = _mm_add_ps (VE_code_acc, output_ps_2);
//Get early values
y = _mm_loadu_si128((__m128i*)E_code_ptr);
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
E_code_acc = _mm_add_ps (E_code_acc, output_ps_1);
E_code_acc = _mm_add_ps (E_code_acc, output_ps_2);
//Get prompt values
y = _mm_loadu_si128((__m128i*)P_code_ptr);
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
P_code_acc = _mm_add_ps (P_code_acc, output_ps_1);
P_code_acc = _mm_add_ps (P_code_acc, output_ps_2);
//Get late values
y = _mm_loadu_si128((__m128i*)L_code_ptr);
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
L_code_acc = _mm_add_ps (L_code_acc, output_ps_1);
L_code_acc = _mm_add_ps (L_code_acc, output_ps_2);
//Get very late values
y = _mm_loadu_si128((__m128i*)VL_code_ptr);
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
VL_code_acc = _mm_add_ps (VL_code_acc, output_ps_1);
VL_code_acc = _mm_add_ps (VL_code_acc, output_ps_2);
input_ptr += 8;
carrier_ptr += 8;
VE_code_ptr += 8;
E_code_ptr += 8;
P_code_ptr += 8;
L_code_ptr += 8;
VL_code_ptr += 8;
}
__VOLK_ATTR_ALIGNED(16) lv_32fc_t VE_dotProductVector[2];
__VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2];
__VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2];
__VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2];
__VOLK_ATTR_ALIGNED(16) lv_32fc_t VL_dotProductVector[2];
_mm_storeu_ps((float*)VE_dotProductVector,VE_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)VL_dotProductVector,VL_code_acc); // Store the results back into the dot product vector
for (unsigned int i = 0; i<2; ++i)
{ {
*VE_out_ptr += VE_dotProductVector[i]; for(unsigned int number = 0;number < sse_iters; number++)
*E_out_ptr += E_dotProductVector[i]; {
*P_out_ptr += P_dotProductVector[i]; //Perform the carrier wipe-off
*L_out_ptr += L_dotProductVector[i]; x = _mm_loadu_si128((__m128i*)input_ptr);
*VL_out_ptr += VL_dotProductVector[i]; y = _mm_loadu_si128((__m128i*)carrier_ptr);
CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx)
CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy)
CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
//Get very early values
y = _mm_loadu_si128((__m128i*)VE_code_ptr);
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
VE_code_acc = _mm_add_ps (VE_code_acc, output_ps_1);
VE_code_acc = _mm_add_ps (VE_code_acc, output_ps_2);
//Get early values
y = _mm_loadu_si128((__m128i*)E_code_ptr);
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
E_code_acc = _mm_add_ps (E_code_acc, output_ps_1);
E_code_acc = _mm_add_ps (E_code_acc, output_ps_2);
//Get prompt values
y = _mm_loadu_si128((__m128i*)P_code_ptr);
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
P_code_acc = _mm_add_ps (P_code_acc, output_ps_1);
P_code_acc = _mm_add_ps (P_code_acc, output_ps_2);
//Get late values
y = _mm_loadu_si128((__m128i*)L_code_ptr);
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
L_code_acc = _mm_add_ps (L_code_acc, output_ps_1);
L_code_acc = _mm_add_ps (L_code_acc, output_ps_2);
//Get very late values
y = _mm_loadu_si128((__m128i*)VL_code_ptr);
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
VL_code_acc = _mm_add_ps (VL_code_acc, output_ps_1);
VL_code_acc = _mm_add_ps (VL_code_acc, output_ps_2);
input_ptr += 8;
carrier_ptr += 8;
VE_code_ptr += 8;
E_code_ptr += 8;
P_code_ptr += 8;
L_code_ptr += 8;
VL_code_ptr += 8;
}
__VOLK_ATTR_ALIGNED(16) lv_32fc_t VE_dotProductVector[2];
__VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2];
__VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2];
__VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2];
__VOLK_ATTR_ALIGNED(16) lv_32fc_t VL_dotProductVector[2];
_mm_storeu_ps((float*)VE_dotProductVector,VE_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)VL_dotProductVector,VL_code_acc); // Store the results back into the dot product vector
for (unsigned int i = 0; i<2; ++i)
{
*VE_out_ptr += VE_dotProductVector[i];
*E_out_ptr += E_dotProductVector[i];
*P_out_ptr += P_dotProductVector[i];
*L_out_ptr += L_dotProductVector[i];
*VL_out_ptr += VL_dotProductVector[i];
}
} }
}
lv_8sc_t bb_signal_sample; lv_8sc_t bb_signal_sample;
for(unsigned int i=0; i < num_points%8; ++i) for(unsigned int i=0; i < num_points%8; ++i)
{ {
//Perform the carrier wipe-off //Perform the carrier wipe-off
bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
// Now get very early, early, prompt, late and very late values for each // Now get very early, early, prompt, late and very late values for each
*VE_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VE_code_ptr++)); *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VE_code_ptr++));
*E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
*P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
*L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
*VL_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VL_code_ptr++)); *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VL_code_ptr++));
} }
} }
#endif /* LV_HAVE_SSE2 */ #endif /* LV_HAVE_SSE2 */
@ -413,16 +413,16 @@ static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_generic(lv_32fc_t* V
*VL_out = 0; *VL_out = 0;
// perform very early, Early, Prompt, Late and very late correlation // perform very early, Early, Prompt, Late and very late correlation
for(unsigned int i=0; i < num_points; ++i) for(unsigned int i=0; i < num_points; ++i)
{ {
//Perform the carrier wipe-off //Perform the carrier wipe-off
bb_signal_sample = input[i] * carrier[i]; bb_signal_sample = input[i] * carrier[i];
*VE_out += (lv_32fc_t) (bb_signal_sample * VE_code[i]); *VE_out += (lv_32fc_t) (bb_signal_sample * VE_code[i]);
*E_out += (lv_32fc_t) (bb_signal_sample * E_code[i]); *E_out += (lv_32fc_t) (bb_signal_sample * E_code[i]);
*P_out += (lv_32fc_t) (bb_signal_sample * P_code[i]); *P_out += (lv_32fc_t) (bb_signal_sample * P_code[i]);
*L_out += (lv_32fc_t) (bb_signal_sample * L_code[i]); *L_out += (lv_32fc_t) (bb_signal_sample * L_code[i]);
*VL_out += (lv_32fc_t) (bb_signal_sample * VL_code[i]); *VL_out += (lv_32fc_t) (bb_signal_sample * VL_code[i]);
} }
} }
#endif /* LV_HAVE_GENERIC */ #endif /* LV_HAVE_GENERIC */
@ -499,96 +499,96 @@ static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_a_sse4_1(lv_32fc_t*
VL_code_acc = _mm_setzero_ps(); VL_code_acc = _mm_setzero_ps();
if (sse_iters>0) if (sse_iters>0)
{
for(int number = 0;number < sse_iters; number++){
//Perform the carrier wipe-off
x = _mm_load_si128((__m128i*)input_ptr);
y = _mm_load_si128((__m128i*)carrier_ptr);
CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx)
CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy)
CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
//Get very early values
y = _mm_load_si128((__m128i*)VE_code_ptr);
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
VE_code_acc = _mm_add_ps (VE_code_acc, output_ps);
//Get early values
y = _mm_load_si128((__m128i*)E_code_ptr);
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
E_code_acc = _mm_add_ps (E_code_acc, output_ps);
//Get prompt values
y = _mm_load_si128((__m128i*)P_code_ptr);
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
P_code_acc = _mm_add_ps (P_code_acc, output_ps);
//Get late values
y = _mm_load_si128((__m128i*)L_code_ptr);
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
L_code_acc = _mm_add_ps (L_code_acc, output_ps);
//Get very late values
y = _mm_load_si128((__m128i*)VL_code_ptr);
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
VL_code_acc = _mm_add_ps (VL_code_acc, output_ps);
input_ptr += 8;
carrier_ptr += 8;
VE_code_ptr += 8;
E_code_ptr += 8;
P_code_ptr += 8;
L_code_ptr += 8;
VL_code_ptr += 8;
}
__VOLK_ATTR_ALIGNED(16) lv_32fc_t VE_dotProductVector[2];
__VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2];
__VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2];
__VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2];
__VOLK_ATTR_ALIGNED(16) lv_32fc_t VL_dotProductVector[2];
_mm_store_ps((float*)VE_dotProductVector,VE_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)VL_dotProductVector,VL_code_acc); // Store the results back into the dot product vector
for (int i = 0; i<2; ++i)
{ {
*VE_out_ptr += VE_dotProductVector[i]; for(int number = 0;number < sse_iters; number++)
*E_out_ptr += E_dotProductVector[i]; {
*P_out_ptr += P_dotProductVector[i]; //Perform the carrier wipe-off
*L_out_ptr += L_dotProductVector[i]; x = _mm_load_si128((__m128i*)input_ptr);
*VL_out_ptr += VL_dotProductVector[i]; y = _mm_load_si128((__m128i*)carrier_ptr);
CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx)
CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy)
CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
//Get very early values
y = _mm_load_si128((__m128i*)VE_code_ptr);
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
VE_code_acc = _mm_add_ps (VE_code_acc, output_ps);
//Get early values
y = _mm_load_si128((__m128i*)E_code_ptr);
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
E_code_acc = _mm_add_ps (E_code_acc, output_ps);
//Get prompt values
y = _mm_load_si128((__m128i*)P_code_ptr);
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
P_code_acc = _mm_add_ps (P_code_acc, output_ps);
//Get late values
y = _mm_load_si128((__m128i*)L_code_ptr);
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
L_code_acc = _mm_add_ps (L_code_acc, output_ps);
//Get very late values
y = _mm_load_si128((__m128i*)VL_code_ptr);
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
VL_code_acc = _mm_add_ps (VL_code_acc, output_ps);
input_ptr += 8;
carrier_ptr += 8;
VE_code_ptr += 8;
E_code_ptr += 8;
P_code_ptr += 8;
L_code_ptr += 8;
VL_code_ptr += 8;
}
__VOLK_ATTR_ALIGNED(16) lv_32fc_t VE_dotProductVector[2];
__VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2];
__VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2];
__VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2];
__VOLK_ATTR_ALIGNED(16) lv_32fc_t VL_dotProductVector[2];
_mm_store_ps((float*)VE_dotProductVector,VE_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)VL_dotProductVector,VL_code_acc); // Store the results back into the dot product vector
for (int i = 0; i<2; ++i)
{
*VE_out_ptr += VE_dotProductVector[i];
*E_out_ptr += E_dotProductVector[i];
*P_out_ptr += P_dotProductVector[i];
*L_out_ptr += L_dotProductVector[i];
*VL_out_ptr += VL_dotProductVector[i];
}
} }
}
lv_8sc_t bb_signal_sample; lv_8sc_t bb_signal_sample;
for(int i=0; i < num_points%8; ++i) for(int i=0; i < num_points%8; ++i)
{ {
//Perform the carrier wipe-off //Perform the carrier wipe-off
bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
// Now get very early, early, prompt, late and very late values for each // Now get very early, early, prompt, late and very late values for each
*VE_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VE_code_ptr++)); *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VE_code_ptr++));
*E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
*P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
*L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
*VL_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VL_code_ptr++)); *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VL_code_ptr++));
} }
} }
#endif /* LV_HAVE_SSE4_1 */ #endif /* LV_HAVE_SSE4_1 */
@ -652,101 +652,101 @@ static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_a_sse2(lv_32fc_t* VE
VL_code_acc = _mm_setzero_ps(); VL_code_acc = _mm_setzero_ps();
if (sse_iters>0) if (sse_iters>0)
{
for(unsigned int number = 0;number < sse_iters; number++){
//Perform the carrier wipe-off
x = _mm_load_si128((__m128i*)input_ptr);
y = _mm_load_si128((__m128i*)carrier_ptr);
CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx)
CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy)
CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
//Get very early values
y = _mm_load_si128((__m128i*)VE_code_ptr);
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
VE_code_acc = _mm_add_ps (VE_code_acc, output_ps_1);
VE_code_acc = _mm_add_ps (VE_code_acc, output_ps_2);
//Get early values
y = _mm_load_si128((__m128i*)E_code_ptr);
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
E_code_acc = _mm_add_ps (E_code_acc, output_ps_1);
E_code_acc = _mm_add_ps (E_code_acc, output_ps_2);
//Get prompt values
y = _mm_load_si128((__m128i*)P_code_ptr);
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
P_code_acc = _mm_add_ps (P_code_acc, output_ps_1);
P_code_acc = _mm_add_ps (P_code_acc, output_ps_2);
//Get late values
y = _mm_load_si128((__m128i*)L_code_ptr);
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
L_code_acc = _mm_add_ps (L_code_acc, output_ps_1);
L_code_acc = _mm_add_ps (L_code_acc, output_ps_2);
//Get very late values
y = _mm_load_si128((__m128i*)VL_code_ptr);
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
VL_code_acc = _mm_add_ps (VL_code_acc, output_ps_1);
VL_code_acc = _mm_add_ps (VL_code_acc, output_ps_2);
input_ptr += 8;
carrier_ptr += 8;
VE_code_ptr += 8;
E_code_ptr += 8;
P_code_ptr += 8;
L_code_ptr += 8;
VL_code_ptr += 8;
}
__VOLK_ATTR_ALIGNED(16) lv_32fc_t VE_dotProductVector[2];
__VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2];
__VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2];
__VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2];
__VOLK_ATTR_ALIGNED(16) lv_32fc_t VL_dotProductVector[2];
_mm_store_ps((float*)VE_dotProductVector,VE_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)VL_dotProductVector,VL_code_acc); // Store the results back into the dot product vector
for (unsigned int i = 0; i<2; ++i)
{ {
*VE_out_ptr += VE_dotProductVector[i]; for(unsigned int number = 0;number < sse_iters; number++)
*E_out_ptr += E_dotProductVector[i]; {
*P_out_ptr += P_dotProductVector[i]; //Perform the carrier wipe-off
*L_out_ptr += L_dotProductVector[i]; x = _mm_load_si128((__m128i*)input_ptr);
*VL_out_ptr += VL_dotProductVector[i]; y = _mm_load_si128((__m128i*)carrier_ptr);
CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx)
CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy)
CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
//Get very early values
y = _mm_load_si128((__m128i*)VE_code_ptr);
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
VE_code_acc = _mm_add_ps (VE_code_acc, output_ps_1);
VE_code_acc = _mm_add_ps (VE_code_acc, output_ps_2);
//Get early values
y = _mm_load_si128((__m128i*)E_code_ptr);
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
E_code_acc = _mm_add_ps (E_code_acc, output_ps_1);
E_code_acc = _mm_add_ps (E_code_acc, output_ps_2);
//Get prompt values
y = _mm_load_si128((__m128i*)P_code_ptr);
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
P_code_acc = _mm_add_ps (P_code_acc, output_ps_1);
P_code_acc = _mm_add_ps (P_code_acc, output_ps_2);
//Get late values
y = _mm_load_si128((__m128i*)L_code_ptr);
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
L_code_acc = _mm_add_ps (L_code_acc, output_ps_1);
L_code_acc = _mm_add_ps (L_code_acc, output_ps_2);
//Get very late values
y = _mm_load_si128((__m128i*)VL_code_ptr);
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
VL_code_acc = _mm_add_ps (VL_code_acc, output_ps_1);
VL_code_acc = _mm_add_ps (VL_code_acc, output_ps_2);
input_ptr += 8;
carrier_ptr += 8;
VE_code_ptr += 8;
E_code_ptr += 8;
P_code_ptr += 8;
L_code_ptr += 8;
VL_code_ptr += 8;
}
__VOLK_ATTR_ALIGNED(16) lv_32fc_t VE_dotProductVector[2];
__VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2];
__VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2];
__VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2];
__VOLK_ATTR_ALIGNED(16) lv_32fc_t VL_dotProductVector[2];
_mm_store_ps((float*)VE_dotProductVector,VE_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)VL_dotProductVector,VL_code_acc); // Store the results back into the dot product vector
for (unsigned int i = 0; i<2; ++i)
{
*VE_out_ptr += VE_dotProductVector[i];
*E_out_ptr += E_dotProductVector[i];
*P_out_ptr += P_dotProductVector[i];
*L_out_ptr += L_dotProductVector[i];
*VL_out_ptr += VL_dotProductVector[i];
}
} }
}
lv_8sc_t bb_signal_sample; lv_8sc_t bb_signal_sample;
for(unsigned int i=0; i < num_points%8; ++i) for(unsigned int i=0; i < num_points%8; ++i)
{ {
//Perform the carrier wipe-off //Perform the carrier wipe-off
bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
// Now get very early, early, prompt, late and very late values for each // Now get very early, early, prompt, late and very late values for each
*VE_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VE_code_ptr++)); *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VE_code_ptr++));
*E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
*P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
*L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
*VL_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VL_code_ptr++)); *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VL_code_ptr++));
} }
} }
#endif /* LV_HAVE_SSE2 */ #endif /* LV_HAVE_SSE2 */
@ -780,16 +780,16 @@ static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_a_generic(lv_32fc_t*
*VL_out = 0; *VL_out = 0;
// perform very early, Early, Prompt, Late and very late correlation // perform very early, Early, Prompt, Late and very late correlation
for(unsigned int i=0; i < num_points; ++i) for(unsigned int i=0; i < num_points; ++i)
{ {
//Perform the carrier wipe-off //Perform the carrier wipe-off
bb_signal_sample = input[i] * carrier[i]; bb_signal_sample = input[i] * carrier[i];
*VE_out += (lv_32fc_t) (bb_signal_sample * VE_code[i]); *VE_out += (lv_32fc_t) (bb_signal_sample * VE_code[i]);
*E_out += (lv_32fc_t) (bb_signal_sample * E_code[i]); *E_out += (lv_32fc_t) (bb_signal_sample * E_code[i]);
*P_out += (lv_32fc_t) (bb_signal_sample * P_code[i]); *P_out += (lv_32fc_t) (bb_signal_sample * P_code[i]);
*L_out += (lv_32fc_t) (bb_signal_sample * L_code[i]); *L_out += (lv_32fc_t) (bb_signal_sample * L_code[i]);
*VL_out += (lv_32fc_t) (bb_signal_sample * VL_code[i]); *VL_out += (lv_32fc_t) (bb_signal_sample * VL_code[i]);
} }
} }
#endif /* LV_HAVE_GENERIC */ #endif /* LV_HAVE_GENERIC */

View File

@ -2,7 +2,7 @@
* \file volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5.h * \file volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5.h
* \brief Volk protokernel: performs the carrier wipe-off mixing and the Very early, Early, Prompt, Late and very late correlation with 16 bits vectors, and accumulates the results into float32. This protokernel is called "safe" because it checks when the inputs have a -128 value, and replaces it with a -127 value. By doing this it avoids malfunctioning, but it lasts more time that the "unsafe" implementation. In order to avoid overflow, "input" and "carrier" must be values between 7 and 7 and "XX_code inputs" must be values between 127 and 127. * \brief Volk protokernel: performs the carrier wipe-off mixing and the Very early, Early, Prompt, Late and very late correlation with 16 bits vectors, and accumulates the results into float32. This protokernel is called "safe" because it checks when the inputs have a -128 value, and replaces it with a -127 value. By doing this it avoids malfunctioning, but it lasts more time that the "unsafe" implementation. In order to avoid overflow, "input" and "carrier" must be values between 7 and 7 and "XX_code inputs" must be values between 127 and 127.
* \authors <ul> * \authors <ul>
* <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com * <li> Andres Cecilia, 2014. a.cecilia.luque(at)gmail.com
* </ul> * </ul>
* *
* Volk protokernel that performs the carrier wipe-off mixing and the * Volk protokernel that performs the carrier wipe-off mixing and the
@ -47,7 +47,7 @@
* GNSS-SDR is free software: you can redistribute it and/or modify * GNSS-SDR is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by * it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or * the Free Software Foundation, either version 3 of the License, or
* at your option) any later version. * (at your option) any later version.
* *
* GNSS-SDR is distributed in the hope that it will be useful, * GNSS-SDR is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of * but WITHOUT ANY WARRANTY; without even the implied warranty of
@ -142,175 +142,175 @@ static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_u_sse4_1(lv_32f
imag_VL_code_acc = _mm_setzero_ps(); imag_VL_code_acc = _mm_setzero_ps();
if (sse_iters>0) if (sse_iters>0)
{
for(int number = 0;number < sse_iters; number++){
//Perform the carrier wipe-off
x = _mm_lddqu_si128((__m128i*)input_ptr);
y = _mm_lddqu_si128((__m128i*)carrier_ptr);
x_abs = _mm_abs_epi8 (x);
CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3(y, x, check_sign_sequence, rearrange_sequence, y_aux, x_abs, real_output, imag_output)
imag_output = _mm_slli_si128 (imag_output, 1);
bb_signal_sample_aux = _mm_blendv_epi8 (imag_output, real_output, mult1);
bb_signal_sample_aux_abs = _mm_abs_epi8 (bb_signal_sample_aux);
//Get very early values
y = _mm_lddqu_si128((__m128i*)VE_code_ptr);
CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
real_VE_code_acc = _mm_add_ps (real_VE_code_acc, real_output_ps);
imag_VE_code_acc = _mm_add_ps (imag_VE_code_acc, imag_output_ps);
//Get early values
y = _mm_lddqu_si128((__m128i*)E_code_ptr);
CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps);
imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps);
//Get prompt values
y = _mm_lddqu_si128((__m128i*)P_code_ptr);
CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps);
imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps);
//Get late values
y = _mm_lddqu_si128((__m128i*)L_code_ptr);
CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps);
imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps);
//Get very late values
y = _mm_lddqu_si128((__m128i*)VL_code_ptr);
CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
real_VL_code_acc = _mm_add_ps (real_VL_code_acc, real_output_ps);
imag_VL_code_acc = _mm_add_ps (imag_VL_code_acc, imag_output_ps);
input_ptr += 8;
carrier_ptr += 8;
VE_code_ptr += 8;
E_code_ptr += 8;
P_code_ptr += 8;
L_code_ptr += 8;
VL_code_ptr += 8;
}
__VOLK_ATTR_ALIGNED(16) float real_VE_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_VE_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float real_VL_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_VL_dotProductVector[4];
_mm_storeu_ps((float*)real_VE_dotProductVector,real_VE_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)imag_VE_dotProductVector,imag_VE_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)real_VL_dotProductVector,real_VL_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)imag_VL_dotProductVector,imag_VL_code_acc); // Store the results back into the dot product vector
for (int i = 0; i<4; ++i)
{ {
VE_out_real += real_VE_dotProductVector[i]; for(int number = 0;number < sse_iters; number++)
VE_out_imag += imag_VE_dotProductVector[i]; {
E_out_real += real_E_dotProductVector[i]; //Perform the carrier wipe-off
E_out_imag += imag_E_dotProductVector[i]; x = _mm_lddqu_si128((__m128i*)input_ptr);
P_out_real += real_P_dotProductVector[i]; y = _mm_lddqu_si128((__m128i*)carrier_ptr);
P_out_imag += imag_P_dotProductVector[i];
L_out_real += real_L_dotProductVector[i]; x_abs = _mm_abs_epi8 (x);
L_out_imag += imag_L_dotProductVector[i];
VL_out_real += real_VL_dotProductVector[i]; CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3(y, x, check_sign_sequence, rearrange_sequence, y_aux, x_abs, real_output, imag_output)
VL_out_imag += imag_VL_dotProductVector[i];
imag_output = _mm_slli_si128 (imag_output, 1);
bb_signal_sample_aux = _mm_blendv_epi8 (imag_output, real_output, mult1);
bb_signal_sample_aux_abs = _mm_abs_epi8 (bb_signal_sample_aux);
//Get very early values
y = _mm_lddqu_si128((__m128i*)VE_code_ptr);
CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
real_VE_code_acc = _mm_add_ps (real_VE_code_acc, real_output_ps);
imag_VE_code_acc = _mm_add_ps (imag_VE_code_acc, imag_output_ps);
//Get early values
y = _mm_lddqu_si128((__m128i*)E_code_ptr);
CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps);
imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps);
//Get prompt values
y = _mm_lddqu_si128((__m128i*)P_code_ptr);
CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps);
imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps);
//Get late values
y = _mm_lddqu_si128((__m128i*)L_code_ptr);
CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps);
imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps);
//Get very late values
y = _mm_lddqu_si128((__m128i*)VL_code_ptr);
CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
real_VL_code_acc = _mm_add_ps (real_VL_code_acc, real_output_ps);
imag_VL_code_acc = _mm_add_ps (imag_VL_code_acc, imag_output_ps);
input_ptr += 8;
carrier_ptr += 8;
VE_code_ptr += 8;
E_code_ptr += 8;
P_code_ptr += 8;
L_code_ptr += 8;
VL_code_ptr += 8;
}
__VOLK_ATTR_ALIGNED(16) float real_VE_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_VE_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float real_VL_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_VL_dotProductVector[4];
_mm_storeu_ps((float*)real_VE_dotProductVector,real_VE_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)imag_VE_dotProductVector,imag_VE_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)real_VL_dotProductVector,real_VL_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)imag_VL_dotProductVector,imag_VL_code_acc); // Store the results back into the dot product vector
for (int i = 0; i<4; ++i)
{
VE_out_real += real_VE_dotProductVector[i];
VE_out_imag += imag_VE_dotProductVector[i];
E_out_real += real_E_dotProductVector[i];
E_out_imag += imag_E_dotProductVector[i];
P_out_real += real_P_dotProductVector[i];
P_out_imag += imag_P_dotProductVector[i];
L_out_real += real_L_dotProductVector[i];
L_out_imag += imag_L_dotProductVector[i];
VL_out_real += real_VL_dotProductVector[i];
VL_out_imag += imag_VL_dotProductVector[i];
}
*VE_out_ptr = lv_cmake(VE_out_real, VE_out_imag);
*E_out_ptr = lv_cmake(E_out_real, E_out_imag);
*P_out_ptr = lv_cmake(P_out_real, P_out_imag);
*L_out_ptr = lv_cmake(L_out_real, L_out_imag);
*VL_out_ptr = lv_cmake(VL_out_real, VL_out_imag);
} }
*VE_out_ptr = lv_cmake(VE_out_real, VE_out_imag);
*E_out_ptr = lv_cmake(E_out_real, E_out_imag);
*P_out_ptr = lv_cmake(P_out_real, P_out_imag);
*L_out_ptr = lv_cmake(L_out_real, L_out_imag);
*VL_out_ptr = lv_cmake(VL_out_real, VL_out_imag);
}
if(num_points%8!=0) if(num_points%8!=0)
{
lv_16sc_t bb_signal_sample;
lv_16sc_t VE_code_value;
lv_16sc_t E_code_value;
lv_16sc_t P_code_value;
lv_16sc_t L_code_value;
lv_16sc_t VL_code_value;
for(int i=0; i < num_points%8; ++i)
{ {
VE_code_value = *VE_code_ptr++; lv_16sc_t bb_signal_sample;
E_code_value = *E_code_ptr++; lv_16sc_t VE_code_value;
P_code_value = *P_code_ptr++; lv_16sc_t E_code_value;
L_code_value = *L_code_ptr++; lv_16sc_t P_code_value;
VL_code_value = *VL_code_ptr++; lv_16sc_t L_code_value;
lv_16sc_t VL_code_value;
if(lv_creal(VE_code_value) == -128) for(int i=0; i < num_points%8; ++i)
{ {
VE_code_value = lv_cmake(-127, lv_cimag(VE_code_value)); VE_code_value = *VE_code_ptr++;
} E_code_value = *E_code_ptr++;
if(lv_cimag(VE_code_value) == -128) P_code_value = *P_code_ptr++;
{ L_code_value = *L_code_ptr++;
VE_code_value = lv_cmake(lv_creal(VE_code_value), -127); VL_code_value = *VL_code_ptr++;
}
if(lv_creal(E_code_value) == -128) if(lv_creal(VE_code_value) == -128)
{ {
E_code_value = lv_cmake(-127, lv_cimag(E_code_value)); VE_code_value = lv_cmake(-127, lv_cimag(VE_code_value));
} }
if(lv_cimag(E_code_value) == -128) if(lv_cimag(VE_code_value) == -128)
{ {
E_code_value = lv_cmake(lv_creal(E_code_value), -127); VE_code_value = lv_cmake(lv_creal(VE_code_value), -127);
} }
if(lv_creal(P_code_value) == -128) if(lv_creal(E_code_value) == -128)
{ {
P_code_value = lv_cmake(-127, lv_cimag(P_code_value)); E_code_value = lv_cmake(-127, lv_cimag(E_code_value));
} }
if(lv_cimag(P_code_value) == -128) if(lv_cimag(E_code_value) == -128)
{ {
P_code_value = lv_cmake(lv_creal(P_code_value), -127); E_code_value = lv_cmake(lv_creal(E_code_value), -127);
} }
if(lv_creal(L_code_value) == -128) if(lv_creal(P_code_value) == -128)
{ {
L_code_value = lv_cmake(-127, lv_cimag(L_code_value)); P_code_value = lv_cmake(-127, lv_cimag(P_code_value));
} }
if(lv_cimag(L_code_value) == -128) if(lv_cimag(P_code_value) == -128)
{ {
L_code_value = lv_cmake(lv_creal(L_code_value), -127); P_code_value = lv_cmake(lv_creal(P_code_value), -127);
} }
//Perform the carrier wipe-off if(lv_creal(L_code_value) == -128)
bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); {
// Now get very early, early, prompt, late and very late values for each L_code_value = lv_cmake(-127, lv_cimag(L_code_value));
*VE_out_ptr += (lv_32fc_t) (bb_signal_sample * VE_code_value); }
*E_out_ptr += (lv_32fc_t) (bb_signal_sample * E_code_value); if(lv_cimag(L_code_value) == -128)
*P_out_ptr += (lv_32fc_t) (bb_signal_sample * P_code_value); {
*L_out_ptr += (lv_32fc_t) (bb_signal_sample * L_code_value); L_code_value = lv_cmake(lv_creal(L_code_value), -127);
*VL_out_ptr += (lv_32fc_t) (bb_signal_sample * VL_code_value); }
//Perform the carrier wipe-off
bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
// Now get very early, early, prompt, late and very late values for each
*VE_out_ptr += (lv_32fc_t) (bb_signal_sample * VE_code_value);
*E_out_ptr += (lv_32fc_t) (bb_signal_sample * E_code_value);
*P_out_ptr += (lv_32fc_t) (bb_signal_sample * P_code_value);
*L_out_ptr += (lv_32fc_t) (bb_signal_sample * L_code_value);
*VL_out_ptr += (lv_32fc_t) (bb_signal_sample * VL_code_value);
}
} }
}
} }
#endif /* LV_HAVE_SSE4_1 */ #endif /* LV_HAVE_SSE4_1 */
@ -348,67 +348,67 @@ static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_generic(lv_32fc
lv_16sc_t bb_signal_sample; lv_16sc_t bb_signal_sample;
for(unsigned int i=0; i < num_points; ++i) for(unsigned int i=0; i < num_points; ++i)
{ {
VE_code_value = VE_code[i]; VE_code_value = VE_code[i];
E_code_value = E_code[i]; E_code_value = E_code[i];
P_code_value = P_code[i]; P_code_value = P_code[i];
L_code_value = L_code[i]; L_code_value = L_code[i];
VL_code_value = VL_code[i]; VL_code_value = VL_code[i];
if(lv_creal(VE_code_value) == -128) if(lv_creal(VE_code_value) == -128)
{ {
VE_code_value = lv_cmake(-127, lv_cimag(VE_code_value)); VE_code_value = lv_cmake(-127, lv_cimag(VE_code_value));
} }
if(lv_cimag(VE_code_value) == -128) if(lv_cimag(VE_code_value) == -128)
{ {
VE_code_value = lv_cmake(lv_creal(VE_code_value), -127); VE_code_value = lv_cmake(lv_creal(VE_code_value), -127);
} }
if(lv_creal(E_code_value) == -128) if(lv_creal(E_code_value) == -128)
{ {
E_code_value = lv_cmake(-127, lv_cimag(E_code_value)); E_code_value = lv_cmake(-127, lv_cimag(E_code_value));
} }
if(lv_cimag(E_code_value) == -128) if(lv_cimag(E_code_value) == -128)
{ {
E_code_value = lv_cmake(lv_creal(E_code_value), -127); E_code_value = lv_cmake(lv_creal(E_code_value), -127);
} }
if(lv_creal(P_code_value) == -128) if(lv_creal(P_code_value) == -128)
{ {
P_code_value = lv_cmake(-127, lv_cimag(P_code_value)); P_code_value = lv_cmake(-127, lv_cimag(P_code_value));
} }
if(lv_cimag(P_code_value) == -128) if(lv_cimag(P_code_value) == -128)
{ {
P_code_value = lv_cmake(lv_creal(P_code_value), -127); P_code_value = lv_cmake(lv_creal(P_code_value), -127);
} }
if(lv_creal(L_code_value) == -128) if(lv_creal(L_code_value) == -128)
{ {
L_code_value = lv_cmake(-127, lv_cimag(L_code_value)); L_code_value = lv_cmake(-127, lv_cimag(L_code_value));
} }
if(lv_cimag(L_code_value) == -128) if(lv_cimag(L_code_value) == -128)
{ {
L_code_value = lv_cmake(lv_creal(L_code_value), -127); L_code_value = lv_cmake(lv_creal(L_code_value), -127);
} }
if(lv_creal(VL_code_value) == -128) if(lv_creal(VL_code_value) == -128)
{ {
VL_code_value = lv_cmake(-127, lv_cimag(VL_code_value)); VL_code_value = lv_cmake(-127, lv_cimag(VL_code_value));
} }
if(lv_cimag(VL_code_value) == -128) if(lv_cimag(VL_code_value) == -128)
{ {
VL_code_value = lv_cmake(lv_creal(VL_code_value), -127); VL_code_value = lv_cmake(lv_creal(VL_code_value), -127);
} }
//Perform the carrier wipe-off //Perform the carrier wipe-off
bb_signal_sample = input[i] * carrier[i]; bb_signal_sample = input[i] * carrier[i];
// Now get very early, early, prompt, late and very late values for each // Now get very early, early, prompt, late and very late values for each
*VE_out += (lv_32fc_t) (bb_signal_sample * VE_code_value); *VE_out += (lv_32fc_t) (bb_signal_sample * VE_code_value);
*E_out += (lv_32fc_t) (bb_signal_sample * E_code_value); *E_out += (lv_32fc_t) (bb_signal_sample * E_code_value);
*P_out += (lv_32fc_t) (bb_signal_sample * P_code_value); *P_out += (lv_32fc_t) (bb_signal_sample * P_code_value);
*L_out += (lv_32fc_t) (bb_signal_sample * L_code_value); *L_out += (lv_32fc_t) (bb_signal_sample * L_code_value);
*VL_out += (lv_32fc_t) (bb_signal_sample * VL_code_value); *VL_out += (lv_32fc_t) (bb_signal_sample * VL_code_value);
} }
} }
#endif /* LV_HAVE_GENERIC */ #endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_u_H */ #endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_u_H */
@ -496,175 +496,175 @@ static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_a_sse4_1(lv_32f
imag_VL_code_acc = _mm_setzero_ps(); imag_VL_code_acc = _mm_setzero_ps();
if (sse_iters>0) if (sse_iters>0)
{
for(int number = 0;number < sse_iters; number++){
//Perform the carrier wipe-off
x = _mm_load_si128((__m128i*)input_ptr);
y = _mm_load_si128((__m128i*)carrier_ptr);
x_abs = _mm_abs_epi8 (x);
CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3(y, x, check_sign_sequence, rearrange_sequence, y_aux, x_abs, real_output, imag_output)
imag_output = _mm_slli_si128 (imag_output, 1);
bb_signal_sample_aux = _mm_blendv_epi8 (imag_output, real_output, mult1);
bb_signal_sample_aux_abs = _mm_abs_epi8 (bb_signal_sample_aux);
//Get very early values
y = _mm_load_si128((__m128i*)VE_code_ptr);
CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
real_VE_code_acc = _mm_add_ps (real_VE_code_acc, real_output_ps);
imag_VE_code_acc = _mm_add_ps (imag_VE_code_acc, imag_output_ps);
//Get early values
y = _mm_load_si128((__m128i*)E_code_ptr);
CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps);
imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps);
//Get prompt values
y = _mm_load_si128((__m128i*)P_code_ptr);
CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps);
imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps);
//Get late values
y = _mm_load_si128((__m128i*)L_code_ptr);
CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps);
imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps);
//Get very late values
y = _mm_load_si128((__m128i*)VL_code_ptr);
CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
real_VL_code_acc = _mm_add_ps (real_VL_code_acc, real_output_ps);
imag_VL_code_acc = _mm_add_ps (imag_VL_code_acc, imag_output_ps);
input_ptr += 8;
carrier_ptr += 8;
VE_code_ptr += 8;
E_code_ptr += 8;
P_code_ptr += 8;
L_code_ptr += 8;
VL_code_ptr += 8;
}
__VOLK_ATTR_ALIGNED(16) float real_VE_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_VE_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float real_VL_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_VL_dotProductVector[4];
_mm_store_ps((float*)real_VE_dotProductVector,real_VE_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)imag_VE_dotProductVector,imag_VE_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)real_VL_dotProductVector,real_VL_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)imag_VL_dotProductVector,imag_VL_code_acc); // Store the results back into the dot product vector
for (int i = 0; i<4; ++i)
{ {
VE_out_real += real_VE_dotProductVector[i]; for(int number = 0;number < sse_iters; number++)
VE_out_imag += imag_VE_dotProductVector[i]; {
E_out_real += real_E_dotProductVector[i]; //Perform the carrier wipe-off
E_out_imag += imag_E_dotProductVector[i]; x = _mm_load_si128((__m128i*)input_ptr);
P_out_real += real_P_dotProductVector[i]; y = _mm_load_si128((__m128i*)carrier_ptr);
P_out_imag += imag_P_dotProductVector[i];
L_out_real += real_L_dotProductVector[i]; x_abs = _mm_abs_epi8 (x);
L_out_imag += imag_L_dotProductVector[i];
VL_out_real += real_VL_dotProductVector[i]; CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3(y, x, check_sign_sequence, rearrange_sequence, y_aux, x_abs, real_output, imag_output)
VL_out_imag += imag_VL_dotProductVector[i];
imag_output = _mm_slli_si128 (imag_output, 1);
bb_signal_sample_aux = _mm_blendv_epi8 (imag_output, real_output, mult1);
bb_signal_sample_aux_abs = _mm_abs_epi8 (bb_signal_sample_aux);
//Get very early values
y = _mm_load_si128((__m128i*)VE_code_ptr);
CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
real_VE_code_acc = _mm_add_ps (real_VE_code_acc, real_output_ps);
imag_VE_code_acc = _mm_add_ps (imag_VE_code_acc, imag_output_ps);
//Get early values
y = _mm_load_si128((__m128i*)E_code_ptr);
CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps);
imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps);
//Get prompt values
y = _mm_load_si128((__m128i*)P_code_ptr);
CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps);
imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps);
//Get late values
y = _mm_load_si128((__m128i*)L_code_ptr);
CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps);
imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps);
//Get very late values
y = _mm_load_si128((__m128i*)VL_code_ptr);
CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
real_VL_code_acc = _mm_add_ps (real_VL_code_acc, real_output_ps);
imag_VL_code_acc = _mm_add_ps (imag_VL_code_acc, imag_output_ps);
input_ptr += 8;
carrier_ptr += 8;
VE_code_ptr += 8;
E_code_ptr += 8;
P_code_ptr += 8;
L_code_ptr += 8;
VL_code_ptr += 8;
}
__VOLK_ATTR_ALIGNED(16) float real_VE_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_VE_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float real_VL_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_VL_dotProductVector[4];
_mm_store_ps((float*)real_VE_dotProductVector,real_VE_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)imag_VE_dotProductVector,imag_VE_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)real_VL_dotProductVector,real_VL_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)imag_VL_dotProductVector,imag_VL_code_acc); // Store the results back into the dot product vector
for (int i = 0; i<4; ++i)
{
VE_out_real += real_VE_dotProductVector[i];
VE_out_imag += imag_VE_dotProductVector[i];
E_out_real += real_E_dotProductVector[i];
E_out_imag += imag_E_dotProductVector[i];
P_out_real += real_P_dotProductVector[i];
P_out_imag += imag_P_dotProductVector[i];
L_out_real += real_L_dotProductVector[i];
L_out_imag += imag_L_dotProductVector[i];
VL_out_real += real_VL_dotProductVector[i];
VL_out_imag += imag_VL_dotProductVector[i];
}
*VE_out_ptr = lv_cmake(VE_out_real, VE_out_imag);
*E_out_ptr = lv_cmake(E_out_real, E_out_imag);
*P_out_ptr = lv_cmake(P_out_real, P_out_imag);
*L_out_ptr = lv_cmake(L_out_real, L_out_imag);
*VL_out_ptr = lv_cmake(VL_out_real, VL_out_imag);
} }
*VE_out_ptr = lv_cmake(VE_out_real, VE_out_imag);
*E_out_ptr = lv_cmake(E_out_real, E_out_imag);
*P_out_ptr = lv_cmake(P_out_real, P_out_imag);
*L_out_ptr = lv_cmake(L_out_real, L_out_imag);
*VL_out_ptr = lv_cmake(VL_out_real, VL_out_imag);
}
if(num_points%8!=0) if(num_points%8!=0)
{
lv_16sc_t bb_signal_sample;
lv_16sc_t VE_code_value;
lv_16sc_t E_code_value;
lv_16sc_t P_code_value;
lv_16sc_t L_code_value;
lv_16sc_t VL_code_value;
for(int i=0; i < num_points%8; ++i)
{ {
VE_code_value = *VE_code_ptr++; lv_16sc_t bb_signal_sample;
E_code_value = *E_code_ptr++; lv_16sc_t VE_code_value;
P_code_value = *P_code_ptr++; lv_16sc_t E_code_value;
L_code_value = *L_code_ptr++; lv_16sc_t P_code_value;
VL_code_value = *VL_code_ptr++; lv_16sc_t L_code_value;
lv_16sc_t VL_code_value;
if(lv_creal(VE_code_value) == -128) for(int i=0; i < num_points%8; ++i)
{ {
VE_code_value = lv_cmake(-127, lv_cimag(VE_code_value)); VE_code_value = *VE_code_ptr++;
} E_code_value = *E_code_ptr++;
if(lv_cimag(VE_code_value) == -128) P_code_value = *P_code_ptr++;
{ L_code_value = *L_code_ptr++;
VE_code_value = lv_cmake(lv_creal(VE_code_value), -127); VL_code_value = *VL_code_ptr++;
}
if(lv_creal(E_code_value) == -128) if(lv_creal(VE_code_value) == -128)
{ {
E_code_value = lv_cmake(-127, lv_cimag(E_code_value)); VE_code_value = lv_cmake(-127, lv_cimag(VE_code_value));
} }
if(lv_cimag(E_code_value) == -128) if(lv_cimag(VE_code_value) == -128)
{ {
E_code_value = lv_cmake(lv_creal(E_code_value), -127); VE_code_value = lv_cmake(lv_creal(VE_code_value), -127);
} }
if(lv_creal(P_code_value) == -128) if(lv_creal(E_code_value) == -128)
{ {
P_code_value = lv_cmake(-127, lv_cimag(P_code_value)); E_code_value = lv_cmake(-127, lv_cimag(E_code_value));
} }
if(lv_cimag(P_code_value) == -128) if(lv_cimag(E_code_value) == -128)
{ {
P_code_value = lv_cmake(lv_creal(P_code_value), -127); E_code_value = lv_cmake(lv_creal(E_code_value), -127);
} }
if(lv_creal(L_code_value) == -128) if(lv_creal(P_code_value) == -128)
{ {
L_code_value = lv_cmake(-127, lv_cimag(L_code_value)); P_code_value = lv_cmake(-127, lv_cimag(P_code_value));
} }
if(lv_cimag(L_code_value) == -128) if(lv_cimag(P_code_value) == -128)
{ {
L_code_value = lv_cmake(lv_creal(L_code_value), -127); P_code_value = lv_cmake(lv_creal(P_code_value), -127);
} }
//Perform the carrier wipe-off if(lv_creal(L_code_value) == -128)
bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); {
// Now get very early, early, prompt, late and very late values for each L_code_value = lv_cmake(-127, lv_cimag(L_code_value));
*VE_out_ptr += (lv_32fc_t) (bb_signal_sample * VE_code_value); }
*E_out_ptr += (lv_32fc_t) (bb_signal_sample * E_code_value); if(lv_cimag(L_code_value) == -128)
*P_out_ptr += (lv_32fc_t) (bb_signal_sample * P_code_value); {
*L_out_ptr += (lv_32fc_t) (bb_signal_sample * L_code_value); L_code_value = lv_cmake(lv_creal(L_code_value), -127);
*VL_out_ptr += (lv_32fc_t) (bb_signal_sample * VL_code_value); }
//Perform the carrier wipe-off
bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
// Now get very early, early, prompt, late and very late values for each
*VE_out_ptr += (lv_32fc_t) (bb_signal_sample * VE_code_value);
*E_out_ptr += (lv_32fc_t) (bb_signal_sample * E_code_value);
*P_out_ptr += (lv_32fc_t) (bb_signal_sample * P_code_value);
*L_out_ptr += (lv_32fc_t) (bb_signal_sample * L_code_value);
*VL_out_ptr += (lv_32fc_t) (bb_signal_sample * VL_code_value);
}
} }
}
} }
#endif /* LV_HAVE_SSE4_1 */ #endif /* LV_HAVE_SSE4_1 */
@ -702,67 +702,67 @@ static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_a_generic(lv_32
lv_16sc_t bb_signal_sample; lv_16sc_t bb_signal_sample;
for(unsigned int i=0; i < num_points; ++i) for(unsigned int i=0; i < num_points; ++i)
{ {
VE_code_value = VE_code[i]; VE_code_value = VE_code[i];
E_code_value = E_code[i]; E_code_value = E_code[i];
P_code_value = P_code[i]; P_code_value = P_code[i];
L_code_value = L_code[i]; L_code_value = L_code[i];
VL_code_value = VL_code[i]; VL_code_value = VL_code[i];
if(lv_creal(VE_code_value) == -128) if(lv_creal(VE_code_value) == -128)
{ {
VE_code_value = lv_cmake(-127, lv_cimag(VE_code_value)); VE_code_value = lv_cmake(-127, lv_cimag(VE_code_value));
} }
if(lv_cimag(VE_code_value) == -128) if(lv_cimag(VE_code_value) == -128)
{ {
VE_code_value = lv_cmake(lv_creal(VE_code_value), -127); VE_code_value = lv_cmake(lv_creal(VE_code_value), -127);
} }
if(lv_creal(E_code_value) == -128) if(lv_creal(E_code_value) == -128)
{ {
E_code_value = lv_cmake(-127, lv_cimag(E_code_value)); E_code_value = lv_cmake(-127, lv_cimag(E_code_value));
} }
if(lv_cimag(E_code_value) == -128) if(lv_cimag(E_code_value) == -128)
{ {
E_code_value = lv_cmake(lv_creal(E_code_value), -127); E_code_value = lv_cmake(lv_creal(E_code_value), -127);
} }
if(lv_creal(P_code_value) == -128) if(lv_creal(P_code_value) == -128)
{ {
P_code_value = lv_cmake(-127, lv_cimag(P_code_value)); P_code_value = lv_cmake(-127, lv_cimag(P_code_value));
} }
if(lv_cimag(P_code_value) == -128) if(lv_cimag(P_code_value) == -128)
{ {
P_code_value = lv_cmake(lv_creal(P_code_value), -127); P_code_value = lv_cmake(lv_creal(P_code_value), -127);
} }
if(lv_creal(L_code_value) == -128) if(lv_creal(L_code_value) == -128)
{ {
L_code_value = lv_cmake(-127, lv_cimag(L_code_value)); L_code_value = lv_cmake(-127, lv_cimag(L_code_value));
} }
if(lv_cimag(L_code_value) == -128) if(lv_cimag(L_code_value) == -128)
{ {
L_code_value = lv_cmake(lv_creal(L_code_value), -127); L_code_value = lv_cmake(lv_creal(L_code_value), -127);
} }
if(lv_creal(VL_code_value) == -128) if(lv_creal(VL_code_value) == -128)
{ {
VL_code_value = lv_cmake(-127, lv_cimag(VL_code_value)); VL_code_value = lv_cmake(-127, lv_cimag(VL_code_value));
} }
if(lv_cimag(VL_code_value) == -128) if(lv_cimag(VL_code_value) == -128)
{ {
VL_code_value = lv_cmake(lv_creal(VL_code_value), -127); VL_code_value = lv_cmake(lv_creal(VL_code_value), -127);
} }
//Perform the carrier wipe-off //Perform the carrier wipe-off
bb_signal_sample = input[i] * carrier[i]; bb_signal_sample = input[i] * carrier[i];
// Now get very early, early, prompt, late and very late values for each // Now get very early, early, prompt, late and very late values for each
*VE_out += (lv_32fc_t) (bb_signal_sample * VE_code_value); *VE_out += (lv_32fc_t) (bb_signal_sample * VE_code_value);
*E_out += (lv_32fc_t) (bb_signal_sample * E_code_value); *E_out += (lv_32fc_t) (bb_signal_sample * E_code_value);
*P_out += (lv_32fc_t) (bb_signal_sample * P_code_value); *P_out += (lv_32fc_t) (bb_signal_sample * P_code_value);
*L_out += (lv_32fc_t) (bb_signal_sample * L_code_value); *L_out += (lv_32fc_t) (bb_signal_sample * L_code_value);
*VL_out += (lv_32fc_t) (bb_signal_sample * VL_code_value); *VL_out += (lv_32fc_t) (bb_signal_sample * VL_code_value);
} }
} }
#endif /* LV_HAVE_GENERIC */ #endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_a_H */ #endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_a_H */

View File

@ -2,7 +2,7 @@
* \file volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5.h * \file volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5.h
* \brief Volk protokernel: performs the carrier wipe-off mixing and the Very early, Early, Prompt, Late and very late correlation with 16 bits vectors, and accumulates the results into float32. This protokernel is called "unsafe" because it does NOT check when the inputs have a -128 value. If you introduce a -128 value the protokernel will NOT operate properly (generic implementation will have different results than volk implementation). In order to avoid overflow, "input" and "carrier" must be values between 7 and 7 and "XX_code inputs" must be values between 127 and 127. * \brief Volk protokernel: performs the carrier wipe-off mixing and the Very early, Early, Prompt, Late and very late correlation with 16 bits vectors, and accumulates the results into float32. This protokernel is called "unsafe" because it does NOT check when the inputs have a -128 value. If you introduce a -128 value the protokernel will NOT operate properly (generic implementation will have different results than volk implementation). In order to avoid overflow, "input" and "carrier" must be values between 7 and 7 and "XX_code inputs" must be values between 127 and 127.
* \authors <ul> * \authors <ul>
* <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com * <li> Andres Cecilia, 2014. a.cecilia.luque(at)gmail.com
* </ul> * </ul>
* *
* Volk protokernel that performs the carrier wipe-off mixing and the * Volk protokernel that performs the carrier wipe-off mixing and the
@ -47,7 +47,7 @@
* GNSS-SDR is free software: you can redistribute it and/or modify * GNSS-SDR is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by * it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or * the Free Software Foundation, either version 3 of the License, or
* at your option) any later version. * (at your option) any later version.
* *
* GNSS-SDR is distributed in the hope that it will be useful, * GNSS-SDR is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of * but WITHOUT ANY WARRANTY; without even the implied warranty of
@ -140,124 +140,124 @@ static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_u_sse4_1(lv_3
imag_VL_code_acc = _mm_setzero_ps(); imag_VL_code_acc = _mm_setzero_ps();
if (sse_iters>0) if (sse_iters>0)
{
for(int number = 0;number < sse_iters; number++){
//Perform the carrier wipe-off
x = _mm_lddqu_si128((__m128i*)input_ptr);
y = _mm_lddqu_si128((__m128i*)carrier_ptr);
x_abs = _mm_abs_epi8 (x);
CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3(y, x, check_sign_sequence, rearrange_sequence, y_aux, x_abs, real_output, imag_output)
imag_output = _mm_slli_si128 (imag_output, 1);
bb_signal_sample_aux = _mm_blendv_epi8 (imag_output, real_output, mult1);
bb_signal_sample_aux_abs = _mm_abs_epi8 (bb_signal_sample_aux);
//Get very early values
y = _mm_lddqu_si128((__m128i*)VE_code_ptr);
CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
real_VE_code_acc = _mm_add_ps (real_VE_code_acc, real_output_ps);
imag_VE_code_acc = _mm_add_ps (imag_VE_code_acc, imag_output_ps);
//Get early values
y = _mm_lddqu_si128((__m128i*)E_code_ptr);
CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps);
imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps);
//Get prompt values
y = _mm_lddqu_si128((__m128i*)P_code_ptr);
CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps);
imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps);
//Get late values
y = _mm_lddqu_si128((__m128i*)L_code_ptr);
CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps);
imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps);
//Get very late values
y = _mm_lddqu_si128((__m128i*)VL_code_ptr);
CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
real_VL_code_acc = _mm_add_ps (real_VL_code_acc, real_output_ps);
imag_VL_code_acc = _mm_add_ps (imag_VL_code_acc, imag_output_ps);
input_ptr += 8;
carrier_ptr += 8;
VE_code_ptr += 8;
E_code_ptr += 8;
P_code_ptr += 8;
L_code_ptr += 8;
VL_code_ptr += 8;
}
__VOLK_ATTR_ALIGNED(16) float real_VE_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_VE_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float real_VL_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_VL_dotProductVector[4];
_mm_storeu_ps((float*)real_VE_dotProductVector,real_VE_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)imag_VE_dotProductVector,imag_VE_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)real_VL_dotProductVector,real_VL_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)imag_VL_dotProductVector,imag_VL_code_acc); // Store the results back into the dot product vector
for (int i = 0; i<4; ++i)
{ {
VE_out_real += real_VE_dotProductVector[i]; for(int number = 0;number < sse_iters; number++)
VE_out_imag += imag_VE_dotProductVector[i]; {
E_out_real += real_E_dotProductVector[i]; //Perform the carrier wipe-off
E_out_imag += imag_E_dotProductVector[i]; x = _mm_lddqu_si128((__m128i*)input_ptr);
P_out_real += real_P_dotProductVector[i]; y = _mm_lddqu_si128((__m128i*)carrier_ptr);
P_out_imag += imag_P_dotProductVector[i];
L_out_real += real_L_dotProductVector[i]; x_abs = _mm_abs_epi8 (x);
L_out_imag += imag_L_dotProductVector[i];
VL_out_real += real_VL_dotProductVector[i]; CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3(y, x, check_sign_sequence, rearrange_sequence, y_aux, x_abs, real_output, imag_output)
VL_out_imag += imag_VL_dotProductVector[i];
imag_output = _mm_slli_si128 (imag_output, 1);
bb_signal_sample_aux = _mm_blendv_epi8 (imag_output, real_output, mult1);
bb_signal_sample_aux_abs = _mm_abs_epi8 (bb_signal_sample_aux);
//Get very early values
y = _mm_lddqu_si128((__m128i*)VE_code_ptr);
CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
real_VE_code_acc = _mm_add_ps (real_VE_code_acc, real_output_ps);
imag_VE_code_acc = _mm_add_ps (imag_VE_code_acc, imag_output_ps);
//Get early values
y = _mm_lddqu_si128((__m128i*)E_code_ptr);
CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps);
imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps);
//Get prompt values
y = _mm_lddqu_si128((__m128i*)P_code_ptr);
CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps);
imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps);
//Get late values
y = _mm_lddqu_si128((__m128i*)L_code_ptr);
CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps);
imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps);
//Get very late values
y = _mm_lddqu_si128((__m128i*)VL_code_ptr);
CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
real_VL_code_acc = _mm_add_ps (real_VL_code_acc, real_output_ps);
imag_VL_code_acc = _mm_add_ps (imag_VL_code_acc, imag_output_ps);
input_ptr += 8;
carrier_ptr += 8;
VE_code_ptr += 8;
E_code_ptr += 8;
P_code_ptr += 8;
L_code_ptr += 8;
VL_code_ptr += 8;
}
__VOLK_ATTR_ALIGNED(16) float real_VE_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_VE_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float real_VL_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_VL_dotProductVector[4];
_mm_storeu_ps((float*)real_VE_dotProductVector,real_VE_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)imag_VE_dotProductVector,imag_VE_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)real_VL_dotProductVector,real_VL_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)imag_VL_dotProductVector,imag_VL_code_acc); // Store the results back into the dot product vector
for (int i = 0; i<4; ++i)
{
VE_out_real += real_VE_dotProductVector[i];
VE_out_imag += imag_VE_dotProductVector[i];
E_out_real += real_E_dotProductVector[i];
E_out_imag += imag_E_dotProductVector[i];
P_out_real += real_P_dotProductVector[i];
P_out_imag += imag_P_dotProductVector[i];
L_out_real += real_L_dotProductVector[i];
L_out_imag += imag_L_dotProductVector[i];
VL_out_real += real_VL_dotProductVector[i];
VL_out_imag += imag_VL_dotProductVector[i];
}
*VE_out_ptr = lv_cmake(VE_out_real, VE_out_imag);
*E_out_ptr = lv_cmake(E_out_real, E_out_imag);
*P_out_ptr = lv_cmake(P_out_real, P_out_imag);
*L_out_ptr = lv_cmake(L_out_real, L_out_imag);
*VL_out_ptr = lv_cmake(VL_out_real, VL_out_imag);
} }
*VE_out_ptr = lv_cmake(VE_out_real, VE_out_imag);
*E_out_ptr = lv_cmake(E_out_real, E_out_imag);
*P_out_ptr = lv_cmake(P_out_real, P_out_imag);
*L_out_ptr = lv_cmake(L_out_real, L_out_imag);
*VL_out_ptr = lv_cmake(VL_out_real, VL_out_imag);
}
lv_16sc_t bb_signal_sample; lv_16sc_t bb_signal_sample;
for(int i=0; i < num_points%8; ++i) for(int i=0; i < num_points%8; ++i)
{ {
//Perform the carrier wipe-off //Perform the carrier wipe-off
bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
// Now get very early, early, prompt, late and very late values for each // Now get very early, early, prompt, late and very late values for each
*VE_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VE_code_ptr++)); *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VE_code_ptr++));
*E_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*E_code_ptr++)); *E_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*E_code_ptr++));
*P_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*P_code_ptr++)); *P_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*P_code_ptr++));
*L_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*L_code_ptr++)); *L_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*L_code_ptr++));
*VL_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VL_code_ptr++)); *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VL_code_ptr++));
} }
} }
#endif /* LV_HAVE_SSE4_1 */ #endif /* LV_HAVE_SSE4_1 */
@ -290,16 +290,16 @@ static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_generic(lv_32
lv_16sc_t bb_signal_sample; lv_16sc_t bb_signal_sample;
for(unsigned int i=0; i < num_points; ++i) for(unsigned int i=0; i < num_points; ++i)
{ {
//Perform the carrier wipe-off //Perform the carrier wipe-off
bb_signal_sample = input[i] * carrier[i]; bb_signal_sample = input[i] * carrier[i];
// Now get very early, early, prompt, late and very late values for each // Now get very early, early, prompt, late and very late values for each
*VE_out += (lv_32fc_t) (bb_signal_sample * VE_code[i]); *VE_out += (lv_32fc_t) (bb_signal_sample * VE_code[i]);
*E_out += (lv_32fc_t) (bb_signal_sample * E_code[i]); *E_out += (lv_32fc_t) (bb_signal_sample * E_code[i]);
*P_out += (lv_32fc_t) (bb_signal_sample * P_code[i]); *P_out += (lv_32fc_t) (bb_signal_sample * P_code[i]);
*L_out += (lv_32fc_t) (bb_signal_sample * L_code[i]); *L_out += (lv_32fc_t) (bb_signal_sample * L_code[i]);
*VL_out += (lv_32fc_t) (bb_signal_sample * VL_code[i]); *VL_out += (lv_32fc_t) (bb_signal_sample * VL_code[i]);
} }
} }
#endif /* LV_HAVE_GENERIC */ #endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_u_H */ #endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_u_H */
@ -385,124 +385,124 @@ static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_a_sse4_1(lv_3
imag_VL_code_acc = _mm_setzero_ps(); imag_VL_code_acc = _mm_setzero_ps();
if (sse_iters>0) if (sse_iters>0)
{
for(int number = 0;number < sse_iters; number++){
//Perform the carrier wipe-off
x = _mm_load_si128((__m128i*)input_ptr);
y = _mm_load_si128((__m128i*)carrier_ptr);
x_abs = _mm_abs_epi8 (x);
CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3(y, x, check_sign_sequence, rearrange_sequence, y_aux, x_abs, real_output, imag_output)
imag_output = _mm_slli_si128 (imag_output, 1);
bb_signal_sample_aux = _mm_blendv_epi8 (imag_output, real_output, mult1);
bb_signal_sample_aux_abs = _mm_abs_epi8 (bb_signal_sample_aux);
//Get very early values
y = _mm_load_si128((__m128i*)VE_code_ptr);
CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
real_VE_code_acc = _mm_add_ps (real_VE_code_acc, real_output_ps);
imag_VE_code_acc = _mm_add_ps (imag_VE_code_acc, imag_output_ps);
//Get early values
y = _mm_load_si128((__m128i*)E_code_ptr);
CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps);
imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps);
//Get prompt values
y = _mm_load_si128((__m128i*)P_code_ptr);
CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps);
imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps);
//Get late values
y = _mm_load_si128((__m128i*)L_code_ptr);
CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps);
imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps);
//Get very late values
y = _mm_load_si128((__m128i*)VL_code_ptr);
CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
real_VL_code_acc = _mm_add_ps (real_VL_code_acc, real_output_ps);
imag_VL_code_acc = _mm_add_ps (imag_VL_code_acc, imag_output_ps);
input_ptr += 8;
carrier_ptr += 8;
VE_code_ptr += 8;
E_code_ptr += 8;
P_code_ptr += 8;
L_code_ptr += 8;
VL_code_ptr += 8;
}
__VOLK_ATTR_ALIGNED(16) float real_VE_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_VE_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float real_VL_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_VL_dotProductVector[4];
_mm_store_ps((float*)real_VE_dotProductVector,real_VE_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)imag_VE_dotProductVector,imag_VE_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)real_VL_dotProductVector,real_VL_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)imag_VL_dotProductVector,imag_VL_code_acc); // Store the results back into the dot product vector
for (int i = 0; i<4; ++i)
{ {
VE_out_real += real_VE_dotProductVector[i]; for(int number = 0;number < sse_iters; number++)
VE_out_imag += imag_VE_dotProductVector[i]; {
E_out_real += real_E_dotProductVector[i]; //Perform the carrier wipe-off
E_out_imag += imag_E_dotProductVector[i]; x = _mm_load_si128((__m128i*)input_ptr);
P_out_real += real_P_dotProductVector[i]; y = _mm_load_si128((__m128i*)carrier_ptr);
P_out_imag += imag_P_dotProductVector[i];
L_out_real += real_L_dotProductVector[i]; x_abs = _mm_abs_epi8 (x);
L_out_imag += imag_L_dotProductVector[i];
VL_out_real += real_VL_dotProductVector[i]; CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3(y, x, check_sign_sequence, rearrange_sequence, y_aux, x_abs, real_output, imag_output)
VL_out_imag += imag_VL_dotProductVector[i];
imag_output = _mm_slli_si128 (imag_output, 1);
bb_signal_sample_aux = _mm_blendv_epi8 (imag_output, real_output, mult1);
bb_signal_sample_aux_abs = _mm_abs_epi8 (bb_signal_sample_aux);
//Get very early values
y = _mm_load_si128((__m128i*)VE_code_ptr);
CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
real_VE_code_acc = _mm_add_ps (real_VE_code_acc, real_output_ps);
imag_VE_code_acc = _mm_add_ps (imag_VE_code_acc, imag_output_ps);
//Get early values
y = _mm_load_si128((__m128i*)E_code_ptr);
CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps);
imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps);
//Get prompt values
y = _mm_load_si128((__m128i*)P_code_ptr);
CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps);
imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps);
//Get late values
y = _mm_load_si128((__m128i*)L_code_ptr);
CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps);
imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps);
//Get very late values
y = _mm_load_si128((__m128i*)VL_code_ptr);
CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
real_VL_code_acc = _mm_add_ps (real_VL_code_acc, real_output_ps);
imag_VL_code_acc = _mm_add_ps (imag_VL_code_acc, imag_output_ps);
input_ptr += 8;
carrier_ptr += 8;
VE_code_ptr += 8;
E_code_ptr += 8;
P_code_ptr += 8;
L_code_ptr += 8;
VL_code_ptr += 8;
}
__VOLK_ATTR_ALIGNED(16) float real_VE_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_VE_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float real_VL_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_VL_dotProductVector[4];
_mm_store_ps((float*)real_VE_dotProductVector,real_VE_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)imag_VE_dotProductVector,imag_VE_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)real_VL_dotProductVector,real_VL_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)imag_VL_dotProductVector,imag_VL_code_acc); // Store the results back into the dot product vector
for (int i = 0; i<4; ++i)
{
VE_out_real += real_VE_dotProductVector[i];
VE_out_imag += imag_VE_dotProductVector[i];
E_out_real += real_E_dotProductVector[i];
E_out_imag += imag_E_dotProductVector[i];
P_out_real += real_P_dotProductVector[i];
P_out_imag += imag_P_dotProductVector[i];
L_out_real += real_L_dotProductVector[i];
L_out_imag += imag_L_dotProductVector[i];
VL_out_real += real_VL_dotProductVector[i];
VL_out_imag += imag_VL_dotProductVector[i];
}
*VE_out_ptr = lv_cmake(VE_out_real, VE_out_imag);
*E_out_ptr = lv_cmake(E_out_real, E_out_imag);
*P_out_ptr = lv_cmake(P_out_real, P_out_imag);
*L_out_ptr = lv_cmake(L_out_real, L_out_imag);
*VL_out_ptr = lv_cmake(VL_out_real, VL_out_imag);
} }
*VE_out_ptr = lv_cmake(VE_out_real, VE_out_imag);
*E_out_ptr = lv_cmake(E_out_real, E_out_imag);
*P_out_ptr = lv_cmake(P_out_real, P_out_imag);
*L_out_ptr = lv_cmake(L_out_real, L_out_imag);
*VL_out_ptr = lv_cmake(VL_out_real, VL_out_imag);
}
lv_16sc_t bb_signal_sample; lv_16sc_t bb_signal_sample;
for(int i=0; i < num_points%8; ++i) for(int i=0; i < num_points%8; ++i)
{ {
//Perform the carrier wipe-off //Perform the carrier wipe-off
bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
// Now get very early, early, prompt, late and very late values for each // Now get very early, early, prompt, late and very late values for each
*VE_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VE_code_ptr++)); *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VE_code_ptr++));
*E_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*E_code_ptr++)); *E_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*E_code_ptr++));
*P_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*P_code_ptr++)); *P_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*P_code_ptr++));
*L_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*L_code_ptr++)); *L_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*L_code_ptr++));
*VL_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VL_code_ptr++)); *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VL_code_ptr++));
} }
} }
#endif /* LV_HAVE_SSE4_1 */ #endif /* LV_HAVE_SSE4_1 */
@ -535,16 +535,16 @@ static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_a_generic(lv_
lv_16sc_t bb_signal_sample; lv_16sc_t bb_signal_sample;
for(unsigned int i=0; i < num_points; ++i) for(unsigned int i=0; i < num_points; ++i)
{ {
//Perform the carrier wipe-off //Perform the carrier wipe-off
bb_signal_sample = input[i] * carrier[i]; bb_signal_sample = input[i] * carrier[i];
// Now get very early, early, prompt, late and very late values for each // Now get very early, early, prompt, late and very late values for each
*VE_out += (lv_32fc_t) (bb_signal_sample * VE_code[i]); *VE_out += (lv_32fc_t) (bb_signal_sample * VE_code[i]);
*E_out += (lv_32fc_t) (bb_signal_sample * E_code[i]); *E_out += (lv_32fc_t) (bb_signal_sample * E_code[i]);
*P_out += (lv_32fc_t) (bb_signal_sample * P_code[i]); *P_out += (lv_32fc_t) (bb_signal_sample * P_code[i]);
*L_out += (lv_32fc_t) (bb_signal_sample * L_code[i]); *L_out += (lv_32fc_t) (bb_signal_sample * L_code[i]);
*VL_out += (lv_32fc_t) (bb_signal_sample * VL_code[i]); *VL_out += (lv_32fc_t) (bb_signal_sample * VL_code[i]);
} }
} }
#endif /* LV_HAVE_GENERIC */ #endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_a_H */ #endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_a_H */

View File

@ -2,7 +2,7 @@
* \file volk_gnsssdr_8u_x2_multiply_8u.h * \file volk_gnsssdr_8u_x2_multiply_8u.h
* \brief Volk protokernel: multiplies unsigned char values * \brief Volk protokernel: multiplies unsigned char values
* \authors <ul> * \authors <ul>
* <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com * <li> Andres Cecilia, 2014. a.cecilia.luque(at)gmail.com
* </ul> * </ul>
* *
* Volk protokernel that multiplies unsigned char values (8 bits data) * Volk protokernel that multiplies unsigned char values (8 bits data)
@ -19,7 +19,7 @@
* GNSS-SDR is free software: you can redistribute it and/or modify * GNSS-SDR is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by * it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or * the Free Software Foundation, either version 3 of the License, or
* at your option) any later version. * (at your option) any later version.
* *
* GNSS-SDR is distributed in the hope that it will be useful, * GNSS-SDR is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of * but WITHOUT ANY WARRANTY; without even the implied warranty of
@ -41,14 +41,14 @@
#ifdef LV_HAVE_SSE3 #ifdef LV_HAVE_SSE3
#include <pmmintrin.h> #include <pmmintrin.h>
/*! /*!
\brief Multiplies the two input unsigned char values and stores their results in the third unisgned char \brief Multiplies the two input unsigned char values and stores their results in the third unsigned char
\param cChar The unsigned char where the results will be stored \param cChar The unsigned char where the results will be stored
\param aChar One of the unsigned char to be multiplied \param aChar One of the unsigned char to be multiplied
\param bChar One of the unsigned char to be multiplied \param bChar One of the unsigned char to be multiplied
\param num_points The number of unsigned char values in aChar and bChar to be multiplied together and stored into cChar \param num_points The number of unsigned char values in aChar and bChar to be multiplied together and stored into cChar
*/ */
static inline void volk_gnsssdr_8u_x2_multiply_8u_u_sse3(unsigned char* cChar, const unsigned char* aChar, const unsigned char* bChar, unsigned int num_points){ static inline void volk_gnsssdr_8u_x2_multiply_8u_u_sse3(unsigned char* cChar, const unsigned char* aChar, const unsigned char* bChar, unsigned int num_points)
{
const unsigned int sse_iters = num_points / 16; const unsigned int sse_iters = num_points / 16;
__m128i x, y, x1, x2, y1, y2, mult1, x1_mult_y1, x2_mult_y2, tmp, tmp1, tmp2, totalc; __m128i x, y, x1, x2, y1, y2, mult1, x1_mult_y1, x2_mult_y2, tmp, tmp1, tmp2, totalc;
@ -56,38 +56,39 @@ static inline void volk_gnsssdr_8u_x2_multiply_8u_u_sse3(unsigned char* cChar, c
const unsigned char* a = aChar; const unsigned char* a = aChar;
const unsigned char* b = bChar; const unsigned char* b = bChar;
for(unsigned int number = 0;number < sse_iters; number++){ for(unsigned int number = 0;number < sse_iters; number++)
x = _mm_lddqu_si128((__m128i*)a); {
y = _mm_lddqu_si128((__m128i*)b); x = _mm_lddqu_si128((__m128i*)a);
y = _mm_lddqu_si128((__m128i*)b);
mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
x1 = _mm_srli_si128 (x, 1); x1 = _mm_srli_si128 (x, 1);
x1 = _mm_and_si128 (x1, mult1); x1 = _mm_and_si128 (x1, mult1);
x2 = _mm_and_si128 (x, mult1); x2 = _mm_and_si128 (x, mult1);
y1 = _mm_srli_si128 (y, 1); y1 = _mm_srli_si128 (y, 1);
y1 = _mm_and_si128 (y1, mult1); y1 = _mm_and_si128 (y1, mult1);
y2 = _mm_and_si128 (y, mult1); y2 = _mm_and_si128 (y, mult1);
x1_mult_y1 = _mm_mullo_epi16 (x1, y1); x1_mult_y1 = _mm_mullo_epi16 (x1, y1);
x2_mult_y2 = _mm_mullo_epi16 (x2, y2); x2_mult_y2 = _mm_mullo_epi16 (x2, y2);
tmp = _mm_and_si128 (x1_mult_y1, mult1); tmp = _mm_and_si128 (x1_mult_y1, mult1);
tmp1 = _mm_slli_si128 (tmp, 1); tmp1 = _mm_slli_si128 (tmp, 1);
tmp2 = _mm_and_si128 (x2_mult_y2, mult1); tmp2 = _mm_and_si128 (x2_mult_y2, mult1);
totalc = _mm_or_si128 (tmp1, tmp2); totalc = _mm_or_si128 (tmp1, tmp2);
_mm_storeu_si128((__m128i*)c, totalc); _mm_storeu_si128((__m128i*)c, totalc);
a += 16; a += 16;
b += 16; b += 16;
c += 16; c += 16;
} }
for (unsigned int i = 0; i<(num_points % 16); ++i) for (unsigned int i = 0; i<(num_points % 16); ++i)
{ {
*c++ = (*a++) * (*b++); *c++ = (*a++) * (*b++);
} }
} }
#endif /* LV_HAVE_SSE3 */ #endif /* LV_HAVE_SSE3 */
@ -99,14 +100,16 @@ static inline void volk_gnsssdr_8u_x2_multiply_8u_u_sse3(unsigned char* cChar, c
\param bChar One of the unsigned char to be multiplied \param bChar One of the unsigned char to be multiplied
\param num_points The number of unsigned char values in aChar and bChar to be multiplied together and stored into cChar \param num_points The number of unsigned char values in aChar and bChar to be multiplied together and stored into cChar
*/ */
static inline void volk_gnsssdr_8u_x2_multiply_8u_generic(unsigned char* cChar, const unsigned char* aChar, const unsigned char* bChar, unsigned int num_points){ static inline void volk_gnsssdr_8u_x2_multiply_8u_generic(unsigned char* cChar, const unsigned char* aChar, const unsigned char* bChar, unsigned int num_points)
{
unsigned char* cPtr = cChar; unsigned char* cPtr = cChar;
const unsigned char* aPtr = aChar; const unsigned char* aPtr = aChar;
const unsigned char* bPtr = bChar; const unsigned char* bPtr = bChar;
for(unsigned int number = 0; number < num_points; number++){ for(unsigned int number = 0; number < num_points; number++)
*cPtr++ = (*aPtr++) * (*bPtr++); {
} *cPtr++ = (*aPtr++) * (*bPtr++);
}
} }
#endif /* LV_HAVE_GENERIC */ #endif /* LV_HAVE_GENERIC */
@ -129,8 +132,8 @@ static inline void volk_gnsssdr_8u_x2_multiply_8u_generic(unsigned char* cChar,
\param bChar One of the unsigned char to be multiplied \param bChar One of the unsigned char to be multiplied
\param num_points The number of unsigned char values in aChar and bChar to be multiplied together and stored into cChar \param num_points The number of unsigned char values in aChar and bChar to be multiplied together and stored into cChar
*/ */
static inline void volk_gnsssdr_8u_x2_multiply_8u_a_sse3(unsigned char* cChar, const unsigned char* aChar, const unsigned char* bChar, unsigned int num_points){ static inline void volk_gnsssdr_8u_x2_multiply_8u_a_sse3(unsigned char* cChar, const unsigned char* aChar, const unsigned char* bChar, unsigned int num_points)
{
const unsigned int sse_iters = num_points / 16; const unsigned int sse_iters = num_points / 16;
__m128i x, y, x1, x2, y1, y2, mult1, x1_mult_y1, x2_mult_y2, tmp, tmp1, tmp2, totalc; __m128i x, y, x1, x2, y1, y2, mult1, x1_mult_y1, x2_mult_y2, tmp, tmp1, tmp2, totalc;
@ -138,38 +141,39 @@ static inline void volk_gnsssdr_8u_x2_multiply_8u_a_sse3(unsigned char* cChar, c
const unsigned char* a = aChar; const unsigned char* a = aChar;
const unsigned char* b = bChar; const unsigned char* b = bChar;
for(unsigned int number = 0;number < sse_iters; number++){ for(unsigned int number = 0;number < sse_iters; number++)
x = _mm_load_si128((__m128i*)a); {
y = _mm_load_si128((__m128i*)b); x = _mm_load_si128((__m128i*)a);
y = _mm_load_si128((__m128i*)b);
mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
x1 = _mm_srli_si128 (x, 1); x1 = _mm_srli_si128 (x, 1);
x1 = _mm_and_si128 (x1, mult1); x1 = _mm_and_si128 (x1, mult1);
x2 = _mm_and_si128 (x, mult1); x2 = _mm_and_si128 (x, mult1);
y1 = _mm_srli_si128 (y, 1); y1 = _mm_srli_si128 (y, 1);
y1 = _mm_and_si128 (y1, mult1); y1 = _mm_and_si128 (y1, mult1);
y2 = _mm_and_si128 (y, mult1); y2 = _mm_and_si128 (y, mult1);
x1_mult_y1 = _mm_mullo_epi16 (x1, y1); x1_mult_y1 = _mm_mullo_epi16 (x1, y1);
x2_mult_y2 = _mm_mullo_epi16 (x2, y2); x2_mult_y2 = _mm_mullo_epi16 (x2, y2);
tmp = _mm_and_si128 (x1_mult_y1, mult1); tmp = _mm_and_si128 (x1_mult_y1, mult1);
tmp1 = _mm_slli_si128 (tmp, 1); tmp1 = _mm_slli_si128 (tmp, 1);
tmp2 = _mm_and_si128 (x2_mult_y2, mult1); tmp2 = _mm_and_si128 (x2_mult_y2, mult1);
totalc = _mm_or_si128 (tmp1, tmp2); totalc = _mm_or_si128 (tmp1, tmp2);
_mm_store_si128((__m128i*)c, totalc); _mm_store_si128((__m128i*)c, totalc);
a += 16; a += 16;
b += 16; b += 16;
c += 16; c += 16;
} }
for (unsigned int i = 0; i<(num_points % 16); ++i) for (unsigned int i = 0; i<(num_points % 16); ++i)
{ {
*c++ = (*a++) * (*b++); *c++ = (*a++) * (*b++);
} }
} }
#endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_SSE */
@ -181,14 +185,16 @@ static inline void volk_gnsssdr_8u_x2_multiply_8u_a_sse3(unsigned char* cChar, c
\param bChar One of the unsigned char to be multiplied \param bChar One of the unsigned char to be multiplied
\param num_points The number of unsigned char values in aChar and bChar to be multiplied together and stored into cChar \param num_points The number of unsigned char values in aChar and bChar to be multiplied together and stored into cChar
*/ */
static inline void volk_gnsssdr_8u_x2_multiply_8u_a_generic(unsigned char* cChar, const unsigned char* aChar, const unsigned char* bChar, unsigned int num_points){ static inline void volk_gnsssdr_8u_x2_multiply_8u_a_generic(unsigned char* cChar, const unsigned char* aChar, const unsigned char* bChar, unsigned int num_points)
{
unsigned char* cPtr = cChar; unsigned char* cPtr = cChar;
const unsigned char* aPtr = aChar; const unsigned char* aPtr = aChar;
const unsigned char* bPtr = bChar; const unsigned char* bPtr = bChar;
for(unsigned int number = 0; number < num_points; number++){ for(unsigned int number = 0; number < num_points; number++)
*cPtr++ = (*aPtr++) * (*bPtr++); {
} *cPtr++ = (*aPtr++) * (*bPtr++);
}
} }
#endif /* LV_HAVE_GENERIC */ #endif /* LV_HAVE_GENERIC */
@ -201,7 +207,8 @@ static inline void volk_gnsssdr_8u_x2_multiply_8u_a_generic(unsigned char* cChar
\param num_points The number of unsigned char values in aChar and bChar to be multiplied together and stored into cChar \param num_points The number of unsigned char values in aChar and bChar to be multiplied together and stored into cChar
*/ */
extern void volk_gnsssdr_8u_x2_multiply_8u_a_orc_impl(unsigned char* cVector, const unsigned char* aVector, const unsigned char* bVector, unsigned int num_points); extern void volk_gnsssdr_8u_x2_multiply_8u_a_orc_impl(unsigned char* cVector, const unsigned char* aVector, const unsigned char* bVector, unsigned int num_points);
static inline void volk_gnsssdr_8u_x2_multiply_8u_u_orc(unsigned char* cVector, const unsigned char* aVector, const unsigned char* bVector, unsigned int num_points){ static inline void volk_gnsssdr_8u_x2_multiply_8u_u_orc(unsigned char* cVector, const unsigned char* aVector, const unsigned char* bVector, unsigned int num_points)
{
volk_gnsssdr_8u_x2_multiply_8u_a_orc_impl(cVector, aVector, bVector, num_points); volk_gnsssdr_8u_x2_multiply_8u_a_orc_impl(cVector, aVector, bVector, num_points);
} }
#endif /* LV_HAVE_ORC */ #endif /* LV_HAVE_ORC */