From 51427f046a7f6f7300d4acc8afc1480bceddbfc4 Mon Sep 17 00:00:00 2001
From: Carles Fernandez
Date: Sun, 9 Nov 2014 13:17:18 +0100
Subject: [PATCH] Code cleaning

---
 ...volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3.h | 466 ++--
 ...gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3.h | 1469 +++++++------
 ...olk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5.h | 4 +-
 .../volk_gnsssdr_32fc_convert_16ic.h | 266 +--
 .../volk_gnsssdr_32fc_convert_8ic.h | 198 +-
 .../volk_gnsssdr_32fc_s32f_convert_8ic.h | 222 +-
 ...ssdr_32fc_s32f_x4_update_local_code_32fc.h | 280 +--
 ...volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3.h | 372 ++--
 ...olk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5.h | 854 +++----
 .../volk_gnsssdr_64f_accumulator_64f.h | 4 +-
 .../volk_gnsssdr_8i_accumulator_s8i.h | 112 +-
 .../volk_gnsssdr_8i_index_max_16u.h | 4 +-
 .../volk_gnsssdr/volk_gnsssdr_8i_max_s8i.h | 375 ++--
 .../volk_gnsssdr/volk_gnsssdr_8i_x2_add_8i.h | 119 +-
 .../volk_gnsssdr_8ic_conjugate_8ic.h | 257 +--
 .../volk_gnsssdr_8ic_magnitude_squared_8i.h | 191 +-
 .../volk_gnsssdr_8ic_s8ic_multiply_8ic.h | 224 +-
 .../volk_gnsssdr_8ic_x2_dot_prod_8ic.h | 554 ++---
 .../volk_gnsssdr_8ic_x2_multiply_8ic.h | 353 +--
 .../volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3.h | 617 +++---
 .../volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3.h | 1048 ++++-----
 ...volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5.h | 822 +++----
 ...gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5.h | 1958 ++++++++---------
 ...gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5.h | 932 ++++----
 ...sssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5.h | 532 ++---
 .../volk_gnsssdr_8u_x2_multiply_8u.h | 175 +-
 ...nsssdr_s32f_x2_update_local_carrier_32fc.h | 1185 +++++-----
 27 files changed, 6822 insertions(+), 6771 deletions(-)

diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3.h
index df0baf1ce..b608787b2 100644
--- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3.h
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3.h
@@ -2,7 +2,7 @@
 * \file volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3.h
 * \brief Volk protokernel: performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation with 32 bits vectors
 * \authors
 *
 * Volk protokernel that performs the carrier wipe-off mixing and the
@@ -30,7 +30,7 @@
 * GNSS-SDR is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
- * at your option) any later version.
+ * (at your option) any later version.
 *
 * GNSS-SDR is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -56,8 +56,8 @@
 #include <smmintrin.h>
 #include "CommonMacros/CommonMacros_16ic_cw_epl_corr_32fc.h"
 #include "CommonMacros/CommonMacros.h"
- /*!
- \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation
+
+/*!
 \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation
 \param input The input signal input
 \param carrier The carrier signal input
 \param E_code Early PRN code replica input
 \param P_code Prompt PRN code replica input
 \param L_code Late PRN code replica input
 \param E_out Early correlation output
 \param P_out Prompt correlation output
 \param L_out Late correlation output
 \param num_points The number of complex values in vectors
- */
+ */
 static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_u_sse4_1(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points)
 {
 const unsigned int sse_iters = num_points / 8;
-
+
 __m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample;
 __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output;
-
+
 __m128 real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc;
 __m128i input_i_1, input_i_2, output_i32;
 __m128 real_output_ps, imag_output_ps;
-
+
 float E_out_real = 0;
 float E_out_imag = 0;
 float P_out_real = 0;
 float P_out_imag = 0;
 float L_out_real = 0;
 float L_out_imag = 0;
-
+
 const lv_16sc_t* input_ptr = input;
 const lv_16sc_t* carrier_ptr = carrier;
-
+
 const lv_16sc_t* E_code_ptr = E_code;
 lv_32fc_t* E_out_ptr = E_out;
 const lv_16sc_t* L_code_ptr = L_code;
 lv_32fc_t* L_out_ptr = L_out;
 const lv_16sc_t* P_code_ptr = P_code;
 lv_32fc_t* P_out_ptr = P_out;
-
+
 *E_out_ptr = 0;
 *P_out_ptr = 0;
 *L_out_ptr = 0;
-
+
 mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
-
+
 real_E_code_acc = _mm_setzero_ps();
 imag_E_code_acc = _mm_setzero_ps();
 real_P_code_acc = _mm_setzero_ps();
 imag_P_code_acc = _mm_setzero_ps();
 real_L_code_acc = _mm_setzero_ps();
 imag_L_code_acc = _mm_setzero_ps();
-
+
 if (sse_iters>0)
- {
- for(unsigned int number = 0;number < sse_iters; number++){
-
- //Perform the carrier wipe-off
- x1 = _mm_lddqu_si128((__m128i*)input_ptr);
- input_ptr += 4;
- x2 = _mm_lddqu_si128((__m128i*)input_ptr);
-
- y1 = _mm_lddqu_si128((__m128i*)carrier_ptr);
- carrier_ptr += 4;
- y2 = _mm_lddqu_si128((__m128i*)carrier_ptr);
-
- CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(x1, x2, realx, imagx)
- CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy)
- CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
-
- //Get early values
- y1 = _mm_lddqu_si128((__m128i*)E_code_ptr);
- E_code_ptr += 4;
- y2 = _mm_lddqu_si128((__m128i*)E_code_ptr);
-
- CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
-
- //Adds the float 32 results
- real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps);
- imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps);
-
- //Get prompt values
- y1 = _mm_lddqu_si128((__m128i*)P_code_ptr);
- P_code_ptr += 4;
- y2 = _mm_lddqu_si128((__m128i*)P_code_ptr);
-
- CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, 
imag_output_ps) - - real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps); - imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps); - - //Get late values - y1 = _mm_lddqu_si128((__m128i*)L_code_ptr); - L_code_ptr += 4; - y2 = _mm_lddqu_si128((__m128i*)L_code_ptr); - - CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) - - real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps); - imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps); - - input_ptr += 4; - carrier_ptr += 4; - E_code_ptr += 4; - P_code_ptr += 4; - L_code_ptr += 4; - } - - __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4]; - - _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector - _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector - _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector - _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector - _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector - _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector - - for (int i = 0; i<4; ++i) { - E_out_real += real_E_dotProductVector[i]; - E_out_imag += imag_E_dotProductVector[i]; - P_out_real += real_P_dotProductVector[i]; - P_out_imag += imag_P_dotProductVector[i]; - L_out_real += real_L_dotProductVector[i]; - L_out_imag += imag_L_dotProductVector[i]; + for(unsigned int number = 0;number < sse_iters; number++) + { + //Perform the carrier wipe-off + x1 = _mm_lddqu_si128((__m128i*)input_ptr); + input_ptr += 4; + x2 = _mm_lddqu_si128((__m128i*)input_ptr); + + y1 = _mm_lddqu_si128((__m128i*)carrier_ptr); + carrier_ptr += 4; + y2 = _mm_lddqu_si128((__m128i*)carrier_ptr); + + CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(x1, x2, realx, imagx) + CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy) + CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample) + + //Get early values + y1 = _mm_lddqu_si128((__m128i*)E_code_ptr); + E_code_ptr += 4; + y2 = _mm_lddqu_si128((__m128i*)E_code_ptr); + + CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) + + //Adds the float 32 results + real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps); + imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps); + + //Get prompt values + y1 = _mm_lddqu_si128((__m128i*)P_code_ptr); + P_code_ptr += 4; + y2 = _mm_lddqu_si128((__m128i*)P_code_ptr); + 
+ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) + + real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps); + imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps); + + //Get late values + y1 = _mm_lddqu_si128((__m128i*)L_code_ptr); + L_code_ptr += 4; + y2 = _mm_lddqu_si128((__m128i*)L_code_ptr); + + CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) + + real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps); + imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps); + + input_ptr += 4; + carrier_ptr += 4; + E_code_ptr += 4; + P_code_ptr += 4; + L_code_ptr += 4; + } + + __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4]; + + _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector + + for (int i = 0; i<4; ++i) + { + E_out_real += real_E_dotProductVector[i]; + E_out_imag += imag_E_dotProductVector[i]; + P_out_real += real_P_dotProductVector[i]; + P_out_imag += imag_P_dotProductVector[i]; + L_out_real += real_L_dotProductVector[i]; + L_out_imag += imag_L_dotProductVector[i]; + } + *E_out_ptr = lv_cmake(E_out_real, E_out_imag); + *P_out_ptr = lv_cmake(P_out_real, P_out_imag); + *L_out_ptr = lv_cmake(L_out_real, L_out_imag); } - *E_out_ptr = lv_cmake(E_out_real, E_out_imag); - *P_out_ptr = lv_cmake(P_out_real, P_out_imag); - *L_out_ptr = lv_cmake(L_out_real, L_out_imag); - } - + lv_16sc_t bb_signal_sample; for(unsigned int i=0; i < num_points%8; ++i) - { - //Perform the carrier wipe-off - bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); - // Now get early, late, and prompt values for each - *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); - *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); - *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); - } - + { + //Perform the carrier wipe-off + bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); + // Now get early, late, and prompt values for each + *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); + *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); + *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); + } } #endif /* LV_HAVE_SSE4_1 */ @@ -225,30 +224,31 @@ 
static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_generic(lv_32fc_t* E lv_16sc_t tmp1; lv_16sc_t tmp2; lv_16sc_t tmp3; - + bb_signal_sample = lv_cmake(0, 0); - + *E_out = 0; *P_out = 0; *L_out = 0; // perform Early, Prompt and Late correlation - + for(unsigned int i=0; i < num_points; ++i) - { - //Perform the carrier wipe-off - bb_signal_sample = input[i] * carrier[i]; - - tmp1 = bb_signal_sample * E_code[i]; - tmp2 = bb_signal_sample * P_code[i]; - tmp3 = bb_signal_sample * L_code[i]; - - // Now get early, late, and prompt values for each - *E_out += (lv_32fc_t)tmp1; - *P_out += (lv_32fc_t)tmp2; - *L_out += (lv_32fc_t)tmp3; - } + { + //Perform the carrier wipe-off + bb_signal_sample = input[i] * carrier[i]; + + tmp1 = bb_signal_sample * E_code[i]; + tmp2 = bb_signal_sample * P_code[i]; + tmp3 = bb_signal_sample * L_code[i]; + + // Now get early, late, and prompt values for each + *E_out += (lv_32fc_t)tmp1; + *P_out += (lv_32fc_t)tmp2; + *L_out += (lv_32fc_t)tmp3; + } } #endif /* LV_HAVE_GENERIC */ + #endif /* INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_u_H */ @@ -280,138 +280,137 @@ static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_generic(lv_32fc_t* E static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_a_sse4_1(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points) { const unsigned int sse_iters = num_points / 8; - + __m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample; __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output; - + __m128 real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc; __m128i input_i_1, input_i_2, output_i32; __m128 real_output_ps, imag_output_ps; - + float E_out_real = 0; float E_out_imag = 0; float P_out_real = 0; float P_out_imag = 0; float L_out_real = 0; float L_out_imag = 0; - + const lv_16sc_t* input_ptr = input; const lv_16sc_t* carrier_ptr = carrier; - + const lv_16sc_t* E_code_ptr = E_code; lv_32fc_t* E_out_ptr = E_out; const lv_16sc_t* L_code_ptr = L_code; lv_32fc_t* L_out_ptr = L_out; const lv_16sc_t* P_code_ptr = P_code; lv_32fc_t* P_out_ptr = P_out; - + *E_out_ptr = 0; *P_out_ptr = 0; *L_out_ptr = 0; - + mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); - + real_E_code_acc = _mm_setzero_ps(); imag_E_code_acc = _mm_setzero_ps(); real_P_code_acc = _mm_setzero_ps(); imag_P_code_acc = _mm_setzero_ps(); real_L_code_acc = _mm_setzero_ps(); imag_L_code_acc = _mm_setzero_ps(); - + if (sse_iters>0) - { - for(unsigned int number = 0;number < sse_iters; number++){ - - //Perform the carrier wipe-off - x1 = _mm_load_si128((__m128i*)input_ptr); - input_ptr += 4; - x2 = _mm_load_si128((__m128i*)input_ptr); - - y1 = _mm_load_si128((__m128i*)carrier_ptr); - carrier_ptr += 4; - y2 = _mm_load_si128((__m128i*)carrier_ptr); - - CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(x1, x2, realx, imagx) - CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy) - CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample) - - //Get early values - y1 = _mm_load_si128((__m128i*)E_code_ptr); - E_code_ptr += 4; - y2 = _mm_load_si128((__m128i*)E_code_ptr); 
- - CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) - - //Adds the float 32 results - real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps); - imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps); - - //Get prompt values - y1 = _mm_load_si128((__m128i*)P_code_ptr); - P_code_ptr += 4; - y2 = _mm_load_si128((__m128i*)P_code_ptr); - - CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) - - real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps); - imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps); - - //Get late values - y1 = _mm_load_si128((__m128i*)L_code_ptr); - L_code_ptr += 4; - y2 = _mm_load_si128((__m128i*)L_code_ptr); - - CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) - - real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps); - imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps); - - input_ptr += 4; - carrier_ptr += 4; - E_code_ptr += 4; - P_code_ptr += 4; - L_code_ptr += 4; - } - - __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4]; - - _mm_store_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector - _mm_store_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector - _mm_store_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector - _mm_store_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector - _mm_store_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector - _mm_store_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector - - for (int i = 0; i<4; ++i) { - E_out_real += real_E_dotProductVector[i]; - E_out_imag += imag_E_dotProductVector[i]; - P_out_real += real_P_dotProductVector[i]; - P_out_imag += imag_P_dotProductVector[i]; - L_out_real += real_L_dotProductVector[i]; - L_out_imag += imag_L_dotProductVector[i]; + for(unsigned int number = 0;number < sse_iters; number++) + { + //Perform the carrier wipe-off + x1 = _mm_load_si128((__m128i*)input_ptr); + input_ptr += 4; + x2 = _mm_load_si128((__m128i*)input_ptr); + + y1 = _mm_load_si128((__m128i*)carrier_ptr); + carrier_ptr += 4; + y2 = _mm_load_si128((__m128i*)carrier_ptr); + + CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(x1, x2, realx, imagx) + CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy) + CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, 
realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample) + + //Get early values + y1 = _mm_load_si128((__m128i*)E_code_ptr); + E_code_ptr += 4; + y2 = _mm_load_si128((__m128i*)E_code_ptr); + + CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) + + //Adds the float 32 results + real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps); + imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps); + + //Get prompt values + y1 = _mm_load_si128((__m128i*)P_code_ptr); + P_code_ptr += 4; + y2 = _mm_load_si128((__m128i*)P_code_ptr); + + CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) + + real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps); + imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps); + + //Get late values + y1 = _mm_load_si128((__m128i*)L_code_ptr); + L_code_ptr += 4; + y2 = _mm_load_si128((__m128i*)L_code_ptr); + + CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) + + real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps); + imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps); + + input_ptr += 4; + carrier_ptr += 4; + E_code_ptr += 4; + P_code_ptr += 4; + L_code_ptr += 4; + } + + __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4]; + + _mm_store_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector + _mm_store_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector + _mm_store_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector + _mm_store_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector + _mm_store_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector + _mm_store_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector + + for (int i = 0; i<4; ++i) + { + E_out_real += real_E_dotProductVector[i]; + E_out_imag += imag_E_dotProductVector[i]; + P_out_real += real_P_dotProductVector[i]; + P_out_imag += imag_P_dotProductVector[i]; + L_out_real += real_L_dotProductVector[i]; + L_out_imag += imag_L_dotProductVector[i]; + } + *E_out_ptr = lv_cmake(E_out_real, E_out_imag); + *P_out_ptr = lv_cmake(P_out_real, P_out_imag); + *L_out_ptr = lv_cmake(L_out_real, L_out_imag); } - *E_out_ptr = lv_cmake(E_out_real, E_out_imag); - *P_out_ptr = lv_cmake(P_out_real, P_out_imag); - *L_out_ptr = lv_cmake(L_out_real, L_out_imag); - } - + lv_16sc_t 
bb_signal_sample; for(unsigned int i=0; i < num_points%8; ++i) - { - //Perform the carrier wipe-off - bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); - // Now get early, late, and prompt values for each - *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); - *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); - *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); - } - + { + //Perform the carrier wipe-off + bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); + // Now get early, late, and prompt values for each + *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); + *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); + *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); + } } #endif /* LV_HAVE_SSE4_1 */ @@ -434,28 +433,29 @@ static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_a_generic(lv_32fc_t* lv_16sc_t tmp1; lv_16sc_t tmp2; lv_16sc_t tmp3; - + bb_signal_sample = lv_cmake(0, 0); - + *E_out = 0; *P_out = 0; *L_out = 0; // perform Early, Prompt and Late correlation - + for(unsigned int i=0; i < num_points; ++i) - { - //Perform the carrier wipe-off - bb_signal_sample = input[i] * carrier[i]; - - tmp1 = bb_signal_sample * E_code[i]; - tmp2 = bb_signal_sample * P_code[i]; - tmp3 = bb_signal_sample * L_code[i]; - - // Now get early, late, and prompt values for each - *E_out += (lv_32fc_t)tmp1; - *P_out += (lv_32fc_t)tmp2; - *L_out += (lv_32fc_t)tmp3; - } + { + //Perform the carrier wipe-off + bb_signal_sample = input[i] * carrier[i]; + + tmp1 = bb_signal_sample * E_code[i]; + tmp2 = bb_signal_sample * P_code[i]; + tmp3 = bb_signal_sample * L_code[i]; + + // Now get early, late, and prompt values for each + *E_out += (lv_32fc_t)tmp1; + *P_out += (lv_32fc_t)tmp2; + *L_out += (lv_32fc_t)tmp3; + } } #endif /* LV_HAVE_GENERIC */ + #endif /* INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_a_H */ diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3.h index 4fc9209ac..2e2eb2c8d 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3.h @@ -2,7 +2,7 @@ * \file volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3.h * \brief Volk protokernel: performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation with 32 bits vectors using different methods: inside u_sse4_1_first there is one method, inside u_sse4_1_second there is another... This protokernel has been created to test the performance of different methods. * \authors * * Volk protokernel that performs the carrier wipe-off mixing and the @@ -30,7 +30,7 @@ * GNSS-SDR is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or - * at your option) any later version. + * (at your option) any later version. 
* * GNSS-SDR is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of @@ -280,185 +280,185 @@ static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_u_sse4_1_second if (sse_iters>0) { - for(unsigned int number = 0;number < sse_iters; number++){ - - //Perform the carrier wipe-off - x1 = _mm_lddqu_si128((__m128i*)input_ptr); - input_ptr += 4; - x2 = _mm_lddqu_si128((__m128i*)input_ptr); - - y1 = _mm_lddqu_si128((__m128i*)carrier_ptr); - carrier_ptr += 4; - y2 = _mm_lddqu_si128((__m128i*)carrier_ptr); - - imagx = _mm_srli_si128 (x1, 2); - imagx = _mm_blend_epi16 (x2, imagx, 85); - realx = _mm_slli_si128 (x2, 2); - realx = _mm_blend_epi16 (realx, x1, 85); - - imagy = _mm_srli_si128 (y1, 2); - imagy = _mm_blend_epi16 (y2, imagy, 85); - realy = _mm_slli_si128 (y2, 2); - realy = _mm_blend_epi16 (realy, y1, 85); - - realx_mult_realy = _mm_mullo_epi16 (realx, realy); - imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); - realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); - imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); - - real_bb_signal_sample = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); - imag_bb_signal_sample = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); - - //Get early values - y1 = _mm_lddqu_si128((__m128i*)E_code_ptr); - E_code_ptr += 4; - y2 = _mm_lddqu_si128((__m128i*)E_code_ptr); - - imagy = _mm_srli_si128 (y1, 2); - imagy = _mm_blend_epi16 (y2, imagy, 85); - realy = _mm_slli_si128 (y2, 2); - realy = _mm_blend_epi16 (realy, y1, 85); - - realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); - imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); - realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); - imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); - - real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); - imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); - - real_output_i_1 = _mm_cvtepi16_epi32(real_output); - real_output_ps_1 = _mm_cvtepi32_ps(real_output_i_1); - real_output = _mm_srli_si128 (real_output, 8); - real_output_i_2 = _mm_cvtepi16_epi32(real_output); - real_output_ps_2 = _mm_cvtepi32_ps(real_output_i_2); - - imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); - imag_output_ps_1 = _mm_cvtepi32_ps(imag_output_i_1); - imag_output = _mm_srli_si128 (imag_output, 8); - imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); - imag_output_ps_2 = _mm_cvtepi32_ps(imag_output_i_2); - - real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps_1); - real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps_2); - imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps_1); - imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps_2); - - //Get prompt values - y1 = _mm_lddqu_si128((__m128i*)P_code_ptr); - P_code_ptr += 4; - y2 = _mm_lddqu_si128((__m128i*)P_code_ptr); - - imagy = _mm_srli_si128 (y1, 2); - imagy = _mm_blend_epi16 (y2, imagy, 85); - realy = _mm_slli_si128 (y2, 2); - realy = _mm_blend_epi16 (realy, y1, 85); - - realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); - imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); - realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); - imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); - - real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); - imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); - - real_output_i_1 = _mm_cvtepi16_epi32(real_output); - real_output_ps_1 = 
_mm_cvtepi32_ps(real_output_i_1); - real_output = _mm_srli_si128 (real_output, 8); - real_output_i_2 = _mm_cvtepi16_epi32(real_output); - real_output_ps_2 = _mm_cvtepi32_ps(real_output_i_2); - - imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); - imag_output_ps_1 = _mm_cvtepi32_ps(imag_output_i_1); - imag_output = _mm_srli_si128 (imag_output, 8); - imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); - imag_output_ps_2 = _mm_cvtepi32_ps(imag_output_i_2); - - real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps_1); - real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps_2); - imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps_1); - imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps_2); - - //Get late values - y1 = _mm_lddqu_si128((__m128i*)L_code_ptr); - L_code_ptr += 4; - y2 = _mm_lddqu_si128((__m128i*)L_code_ptr); - - imagy = _mm_srli_si128 (y1, 2); - imagy = _mm_blend_epi16 (y2, imagy, 85); - realy = _mm_slli_si128 (y2, 2); - realy = _mm_blend_epi16 (realy, y1, 85); - - realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); - imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); - realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); - imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); - - real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); - imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); - - real_output_i_1 = _mm_cvtepi16_epi32(real_output); - real_output_ps_1 = _mm_cvtepi32_ps(real_output_i_1); - real_output = _mm_srli_si128 (real_output, 8); - real_output_i_2 = _mm_cvtepi16_epi32(real_output); - real_output_ps_2 = _mm_cvtepi32_ps(real_output_i_2); - - imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); - imag_output_ps_1 = _mm_cvtepi32_ps(imag_output_i_1); - imag_output = _mm_srli_si128 (imag_output, 8); - imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); - imag_output_ps_2 = _mm_cvtepi32_ps(imag_output_i_2); - - real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps_1); - real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps_2); - imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps_1); - imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps_2); - - input_ptr += 4; - carrier_ptr += 4; - E_code_ptr += 4; - L_code_ptr += 4; - P_code_ptr += 4; - } - - __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4]; - - _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector - _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector - _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector - _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector - _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector - _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector - - for (int i = 0; i<4; ++i) - { - E_out_real += real_E_dotProductVector[i]; - E_out_imag += imag_E_dotProductVector[i]; - 
P_out_real += real_P_dotProductVector[i]; - P_out_imag += imag_P_dotProductVector[i]; - L_out_real += real_L_dotProductVector[i]; - L_out_imag += imag_L_dotProductVector[i]; - } - *E_out_ptr = lv_cmake(E_out_real, E_out_imag); - *P_out_ptr = lv_cmake(P_out_real, P_out_imag); - *L_out_ptr = lv_cmake(L_out_real, L_out_imag); + for(unsigned int number = 0;number < sse_iters; number++) + { + //Perform the carrier wipe-off + x1 = _mm_lddqu_si128((__m128i*)input_ptr); + input_ptr += 4; + x2 = _mm_lddqu_si128((__m128i*)input_ptr); + + y1 = _mm_lddqu_si128((__m128i*)carrier_ptr); + carrier_ptr += 4; + y2 = _mm_lddqu_si128((__m128i*)carrier_ptr); + + imagx = _mm_srli_si128 (x1, 2); + imagx = _mm_blend_epi16 (x2, imagx, 85); + realx = _mm_slli_si128 (x2, 2); + realx = _mm_blend_epi16 (realx, x1, 85); + + imagy = _mm_srli_si128 (y1, 2); + imagy = _mm_blend_epi16 (y2, imagy, 85); + realy = _mm_slli_si128 (y2, 2); + realy = _mm_blend_epi16 (realy, y1, 85); + + realx_mult_realy = _mm_mullo_epi16 (realx, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); + realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); + + real_bb_signal_sample = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_bb_signal_sample = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + //Get early values + y1 = _mm_lddqu_si128((__m128i*)E_code_ptr); + E_code_ptr += 4; + y2 = _mm_lddqu_si128((__m128i*)E_code_ptr); + + imagy = _mm_srli_si128 (y1, 2); + imagy = _mm_blend_epi16 (y2, imagy, 85); + realy = _mm_slli_si128 (y2, 2); + realy = _mm_blend_epi16 (realy, y1, 85); + + realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); + realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); + + real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + real_output_i_1 = _mm_cvtepi16_epi32(real_output); + real_output_ps_1 = _mm_cvtepi32_ps(real_output_i_1); + real_output = _mm_srli_si128 (real_output, 8); + real_output_i_2 = _mm_cvtepi16_epi32(real_output); + real_output_ps_2 = _mm_cvtepi32_ps(real_output_i_2); + + imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); + imag_output_ps_1 = _mm_cvtepi32_ps(imag_output_i_1); + imag_output = _mm_srli_si128 (imag_output, 8); + imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); + imag_output_ps_2 = _mm_cvtepi32_ps(imag_output_i_2); + + real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps_1); + real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps_2); + imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps_1); + imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps_2); + + //Get prompt values + y1 = _mm_lddqu_si128((__m128i*)P_code_ptr); + P_code_ptr += 4; + y2 = _mm_lddqu_si128((__m128i*)P_code_ptr); + + imagy = _mm_srli_si128 (y1, 2); + imagy = _mm_blend_epi16 (y2, imagy, 85); + realy = _mm_slli_si128 (y2, 2); + realy = _mm_blend_epi16 (realy, y1, 85); + + realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); + realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); + + real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + 
real_output_i_1 = _mm_cvtepi16_epi32(real_output); + real_output_ps_1 = _mm_cvtepi32_ps(real_output_i_1); + real_output = _mm_srli_si128 (real_output, 8); + real_output_i_2 = _mm_cvtepi16_epi32(real_output); + real_output_ps_2 = _mm_cvtepi32_ps(real_output_i_2); + + imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); + imag_output_ps_1 = _mm_cvtepi32_ps(imag_output_i_1); + imag_output = _mm_srli_si128 (imag_output, 8); + imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); + imag_output_ps_2 = _mm_cvtepi32_ps(imag_output_i_2); + + real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps_1); + real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps_2); + imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps_1); + imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps_2); + + //Get late values + y1 = _mm_lddqu_si128((__m128i*)L_code_ptr); + L_code_ptr += 4; + y2 = _mm_lddqu_si128((__m128i*)L_code_ptr); + + imagy = _mm_srli_si128 (y1, 2); + imagy = _mm_blend_epi16 (y2, imagy, 85); + realy = _mm_slli_si128 (y2, 2); + realy = _mm_blend_epi16 (realy, y1, 85); + + realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); + realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); + + real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + real_output_i_1 = _mm_cvtepi16_epi32(real_output); + real_output_ps_1 = _mm_cvtepi32_ps(real_output_i_1); + real_output = _mm_srli_si128 (real_output, 8); + real_output_i_2 = _mm_cvtepi16_epi32(real_output); + real_output_ps_2 = _mm_cvtepi32_ps(real_output_i_2); + + imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); + imag_output_ps_1 = _mm_cvtepi32_ps(imag_output_i_1); + imag_output = _mm_srli_si128 (imag_output, 8); + imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); + imag_output_ps_2 = _mm_cvtepi32_ps(imag_output_i_2); + + real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps_1); + real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps_2); + imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps_1); + imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps_2); + + input_ptr += 4; + carrier_ptr += 4; + E_code_ptr += 4; + L_code_ptr += 4; + P_code_ptr += 4; + } + + __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4]; + + _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector + + for (int i = 0; i<4; ++i) + { + E_out_real += 
real_E_dotProductVector[i]; + E_out_imag += imag_E_dotProductVector[i]; + P_out_real += real_P_dotProductVector[i]; + P_out_imag += imag_P_dotProductVector[i]; + L_out_real += real_L_dotProductVector[i]; + L_out_imag += imag_L_dotProductVector[i]; + } + *E_out_ptr = lv_cmake(E_out_real, E_out_imag); + *P_out_ptr = lv_cmake(P_out_real, P_out_imag); + *L_out_ptr = lv_cmake(L_out_real, L_out_imag); } - + lv_16sc_t bb_signal_sample; for(unsigned int i=0; i < num_points%8; ++i) - { - //Perform the carrier wipe-off - bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); - // Now get early, late, and prompt values for each - *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); - *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); - *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); - } + { + //Perform the carrier wipe-off + bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); + // Now get early, late, and prompt values for each + *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); + *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); + *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); + } } #endif /* LV_HAVE_SSE4_1 */ @@ -481,206 +481,207 @@ static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_u_sse4_1_third( const unsigned int sse_iters = num_points / 8; unsigned int index = 0; unsigned int indexPlus4 = 0; - + __m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample; __m128i realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, real_output_i32, imag_output_i32; - + __m128 real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc; __m128i real_output_i_1, real_output_i_2, imag_output_i_1, imag_output_i_2; __m128 real_output_ps, imag_output_ps; - + float E_out_real = 0; float E_out_imag = 0; float P_out_real = 0; float P_out_imag = 0; float L_out_real = 0; float L_out_imag = 0; - + const lv_16sc_t* input_ptr = input; const lv_16sc_t* carrier_ptr = carrier; - + const lv_16sc_t* E_code_ptr = E_code; lv_32fc_t* E_out_ptr = E_out; const lv_16sc_t* L_code_ptr = L_code; lv_32fc_t* L_out_ptr = L_out; const lv_16sc_t* P_code_ptr = P_code; lv_32fc_t* P_out_ptr = P_out; - + *E_out_ptr = 0; *P_out_ptr = 0; *L_out_ptr = 0; - + real_E_code_acc = _mm_setzero_ps(); imag_E_code_acc = _mm_setzero_ps(); real_P_code_acc = _mm_setzero_ps(); imag_P_code_acc = _mm_setzero_ps(); real_L_code_acc = _mm_setzero_ps(); imag_L_code_acc = _mm_setzero_ps(); - - if (sse_iters>0) - { - for(index = 0;index < 8*sse_iters; index+=8){ - indexPlus4 = index + 4; - //Perform the carrier wipe-off - x1 = _mm_lddqu_si128((__m128i*)&input_ptr[index]); - x2 = _mm_lddqu_si128((__m128i*)&input_ptr[indexPlus4]); - - y1 = _mm_lddqu_si128((__m128i*)&carrier_ptr[index]); - y2 = _mm_lddqu_si128((__m128i*)&carrier_ptr[indexPlus4]); - - imagx = _mm_srli_si128 (x1, 2); - imagx = _mm_blend_epi16 (x2, imagx, 85); - realx = _mm_slli_si128 (x2, 2); - realx = _mm_blend_epi16 (realx, x1, 85); - - imagy = _mm_srli_si128 (y1, 2); - imagy = _mm_blend_epi16 (y2, imagy, 85); - realy = _mm_slli_si128 (y2, 2); - realy = _mm_blend_epi16 (realy, y1, 85); - - realx_mult_realy = _mm_mullo_epi16 (realx, realy); - imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); - realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); - imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); - - real_bb_signal_sample = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); - 
imag_bb_signal_sample = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); - - //Get early values - y1 = _mm_lddqu_si128((__m128i*)&E_code_ptr[index]); - y2 = _mm_lddqu_si128((__m128i*)&E_code_ptr[indexPlus4]); - - imagy = _mm_srli_si128 (y1, 2); - imagy = _mm_blend_epi16 (y2, imagy, 85); - realy = _mm_slli_si128 (y2, 2); - realy = _mm_blend_epi16 (realy, y1, 85); - - realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); - imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); - realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); - imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); - - real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); - imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); - - real_output_i_1 = _mm_cvtepi16_epi32(real_output); - real_output = _mm_srli_si128 (real_output, 8); - real_output_i_2 = _mm_cvtepi16_epi32(real_output); - real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); - real_output_ps = _mm_cvtepi32_ps(real_output_i32); - - imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); - imag_output = _mm_srli_si128 (imag_output, 8); - imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); - imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2); - imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); - - real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps); - imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps); - - //Get prompt values - y1 = _mm_lddqu_si128((__m128i*)&P_code_ptr[index]); - y2 = _mm_lddqu_si128((__m128i*)&P_code_ptr[indexPlus4]); - - imagy = _mm_srli_si128 (y1, 2); - imagy = _mm_blend_epi16 (y2, imagy, 85); - realy = _mm_slli_si128 (y2, 2); - realy = _mm_blend_epi16 (realy, y1, 85); - - realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); - imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); - realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); - imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); - - real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); - imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); - - real_output_i_1 = _mm_cvtepi16_epi32(real_output); - real_output = _mm_srli_si128 (real_output, 8); - real_output_i_2 = _mm_cvtepi16_epi32(real_output); - real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); - real_output_ps = _mm_cvtepi32_ps(real_output_i32); - - imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); - imag_output = _mm_srli_si128 (imag_output, 8); - imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); - imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2); - imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); - - real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps); - imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps); - //Get late values - y1 = _mm_lddqu_si128((__m128i*)&L_code_ptr[index]); - y2 = _mm_lddqu_si128((__m128i*)&L_code_ptr[indexPlus4]); - - imagy = _mm_srli_si128 (y1, 2); - imagy = _mm_blend_epi16 (y2, imagy, 85); - realy = _mm_slli_si128 (y2, 2); - realy = _mm_blend_epi16 (realy, y1, 85); - - realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); - imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); - realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); - imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); - - real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); - imag_output = _mm_add_epi16 
(realx_mult_imagy, imagx_mult_realy); - - real_output_i_1 = _mm_cvtepi16_epi32(real_output); - real_output = _mm_srli_si128 (real_output, 8); - real_output_i_2 = _mm_cvtepi16_epi32(real_output); - real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); - real_output_ps = _mm_cvtepi32_ps(real_output_i32); - - imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); - imag_output = _mm_srli_si128 (imag_output, 8); - imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); - imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2); - imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); - - real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps); - imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps); - } - - __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4]; - - _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector - _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector - _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector - _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector - _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector - _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector - - for (int i = 0; i<4; ++i) + if (sse_iters>0) { - E_out_real += real_E_dotProductVector[i]; - E_out_imag += imag_E_dotProductVector[i]; - P_out_real += real_P_dotProductVector[i]; - P_out_imag += imag_P_dotProductVector[i]; - L_out_real += real_L_dotProductVector[i]; - L_out_imag += imag_L_dotProductVector[i]; + for(index = 0;index < 8*sse_iters; index+=8) + { + indexPlus4 = index + 4; + //Perform the carrier wipe-off + x1 = _mm_lddqu_si128((__m128i*)&input_ptr[index]); + x2 = _mm_lddqu_si128((__m128i*)&input_ptr[indexPlus4]); + + y1 = _mm_lddqu_si128((__m128i*)&carrier_ptr[index]); + y2 = _mm_lddqu_si128((__m128i*)&carrier_ptr[indexPlus4]); + + imagx = _mm_srli_si128 (x1, 2); + imagx = _mm_blend_epi16 (x2, imagx, 85); + realx = _mm_slli_si128 (x2, 2); + realx = _mm_blend_epi16 (realx, x1, 85); + + imagy = _mm_srli_si128 (y1, 2); + imagy = _mm_blend_epi16 (y2, imagy, 85); + realy = _mm_slli_si128 (y2, 2); + realy = _mm_blend_epi16 (realy, y1, 85); + + realx_mult_realy = _mm_mullo_epi16 (realx, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); + realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); + + real_bb_signal_sample = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_bb_signal_sample = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + //Get early values + y1 = _mm_lddqu_si128((__m128i*)&E_code_ptr[index]); + y2 = _mm_lddqu_si128((__m128i*)&E_code_ptr[indexPlus4]); + + imagy = _mm_srli_si128 (y1, 2); + imagy = _mm_blend_epi16 (y2, imagy, 85); + realy = _mm_slli_si128 (y2, 2); + realy = _mm_blend_epi16 (realy, y1, 85); + + realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); + imagx_mult_imagy = 
_mm_mullo_epi16 (imag_bb_signal_sample, imagy); + realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); + + real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + real_output_i_1 = _mm_cvtepi16_epi32(real_output); + real_output = _mm_srli_si128 (real_output, 8); + real_output_i_2 = _mm_cvtepi16_epi32(real_output); + real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); + real_output_ps = _mm_cvtepi32_ps(real_output_i32); + + imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); + imag_output = _mm_srli_si128 (imag_output, 8); + imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); + imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2); + imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); + + real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps); + imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps); + + //Get prompt values + y1 = _mm_lddqu_si128((__m128i*)&P_code_ptr[index]); + y2 = _mm_lddqu_si128((__m128i*)&P_code_ptr[indexPlus4]); + + imagy = _mm_srli_si128 (y1, 2); + imagy = _mm_blend_epi16 (y2, imagy, 85); + realy = _mm_slli_si128 (y2, 2); + realy = _mm_blend_epi16 (realy, y1, 85); + + realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); + realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); + + real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + real_output_i_1 = _mm_cvtepi16_epi32(real_output); + real_output = _mm_srli_si128 (real_output, 8); + real_output_i_2 = _mm_cvtepi16_epi32(real_output); + real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); + real_output_ps = _mm_cvtepi32_ps(real_output_i32); + + imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); + imag_output = _mm_srli_si128 (imag_output, 8); + imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); + imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2); + imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); + + real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps); + imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps); + + //Get late values + y1 = _mm_lddqu_si128((__m128i*)&L_code_ptr[index]); + y2 = _mm_lddqu_si128((__m128i*)&L_code_ptr[indexPlus4]); + + imagy = _mm_srli_si128 (y1, 2); + imagy = _mm_blend_epi16 (y2, imagy, 85); + realy = _mm_slli_si128 (y2, 2); + realy = _mm_blend_epi16 (realy, y1, 85); + + realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); + realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); + + real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + real_output_i_1 = _mm_cvtepi16_epi32(real_output); + real_output = _mm_srli_si128 (real_output, 8); + real_output_i_2 = _mm_cvtepi16_epi32(real_output); + real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); + real_output_ps = _mm_cvtepi32_ps(real_output_i32); + + imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); + imag_output = _mm_srli_si128 (imag_output, 8); + imag_output_i_2 = 
_mm_cvtepi16_epi32(imag_output); + imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2); + imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); + + real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps); + imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps); + } + + __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4]; + + _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector + + for (int i = 0; i<4; ++i) + { + E_out_real += real_E_dotProductVector[i]; + E_out_imag += imag_E_dotProductVector[i]; + P_out_real += real_P_dotProductVector[i]; + P_out_imag += imag_P_dotProductVector[i]; + L_out_real += real_L_dotProductVector[i]; + L_out_imag += imag_L_dotProductVector[i]; + } + *E_out_ptr = lv_cmake(E_out_real, E_out_imag); + *P_out_ptr = lv_cmake(P_out_real, P_out_imag); + *L_out_ptr = lv_cmake(L_out_real, L_out_imag); } - *E_out_ptr = lv_cmake(E_out_real, E_out_imag); - *P_out_ptr = lv_cmake(P_out_real, P_out_imag); - *L_out_ptr = lv_cmake(L_out_real, L_out_imag); - } - + lv_16sc_t bb_signal_sample; for(; index < num_points; index++) - { - //Perform the carrier wipe-off - bb_signal_sample = input_ptr[index] * carrier_ptr[index]; - // Now get early, late, and prompt values for each - *E_out_ptr += (lv_32fc_t) (bb_signal_sample * E_code_ptr[index]); - *P_out_ptr += (lv_32fc_t) (bb_signal_sample * P_code_ptr[index]); - *L_out_ptr += (lv_32fc_t) (bb_signal_sample * L_code_ptr[index]); - } + { + //Perform the carrier wipe-off + bb_signal_sample = input_ptr[index] * carrier_ptr[index]; + // Now get early, late, and prompt values for each + *E_out_ptr += (lv_32fc_t) (bb_signal_sample * E_code_ptr[index]); + *P_out_ptr += (lv_32fc_t) (bb_signal_sample * P_code_ptr[index]); + *L_out_ptr += (lv_32fc_t) (bb_signal_sample * L_code_ptr[index]); + } } #endif /* LV_HAVE_SSE4_1 */ @@ -701,217 +702,217 @@ static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_u_sse4_1_third( static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_u_sse4_1_fourth(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points) { const unsigned int sse_iters = num_points / 8; - + __m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample; __m128i realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, real_output_i32, imag_output_i32; - + __m128 real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, 
real_L_code_acc, imag_L_code_acc; __m128i real_output_i_1, real_output_i_2, imag_output_i_1, imag_output_i_2; __m128 real_output_ps, imag_output_ps; - + float E_out_real = 0; float E_out_imag = 0; float P_out_real = 0; float P_out_imag = 0; float L_out_real = 0; float L_out_imag = 0; - + const lv_16sc_t* input_ptr = input; const lv_16sc_t* carrier_ptr = carrier; - + const lv_16sc_t* E_code_ptr = E_code; lv_32fc_t* E_out_ptr = E_out; const lv_16sc_t* L_code_ptr = L_code; lv_32fc_t* L_out_ptr = L_out; const lv_16sc_t* P_code_ptr = P_code; lv_32fc_t* P_out_ptr = P_out; - + *E_out_ptr = 0; *P_out_ptr = 0; *L_out_ptr = 0; - + real_E_code_acc = _mm_setzero_ps(); imag_E_code_acc = _mm_setzero_ps(); real_P_code_acc = _mm_setzero_ps(); imag_P_code_acc = _mm_setzero_ps(); real_L_code_acc = _mm_setzero_ps(); imag_L_code_acc = _mm_setzero_ps(); - + if (sse_iters>0) - { - for(unsigned int number = 0;number < sse_iters; number++){ - - //Perform the carrier wipe-off - x1 = _mm_lddqu_si128((__m128i*)input_ptr); - input_ptr += 4; - x2 = _mm_lddqu_si128((__m128i*)input_ptr); - - y1 = _mm_lddqu_si128((__m128i*)carrier_ptr); - carrier_ptr += 4; - y2 = _mm_lddqu_si128((__m128i*)carrier_ptr); - - imagx = _mm_srli_si128 (x1, 2); - imagx = _mm_blend_epi16 (x2, imagx, 85); - realx = _mm_slli_si128 (x2, 2); - realx = _mm_blend_epi16 (realx, x1, 85); - - imagy = _mm_srli_si128 (y1, 2); - imagy = _mm_blend_epi16 (y2, imagy, 85); - realy = _mm_slli_si128 (y2, 2); - realy = _mm_blend_epi16 (realy, y1, 85); - - realx_mult_realy = _mm_mullo_epi16 (realx, realy); - imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); - realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); - imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); - - real_bb_signal_sample = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); - imag_bb_signal_sample = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); - - //Get early values - y1 = _mm_lddqu_si128((__m128i*)E_code_ptr); - E_code_ptr += 4; - y2 = _mm_lddqu_si128((__m128i*)E_code_ptr); - - imagy = _mm_srli_si128 (y1, 2); - imagy = _mm_blend_epi16 (y2, imagy, 85); - realy = _mm_slli_si128 (y2, 2); - realy = _mm_blend_epi16 (realy, y1, 85); - - realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); - imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); - realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); - imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); - - real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); - imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); - - real_output_i_1 = _mm_cvtepi16_epi32(real_output); - real_output = _mm_srli_si128 (real_output, 8); - real_output_i_2 = _mm_cvtepi16_epi32(real_output); - real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); - real_output_ps = _mm_cvtepi32_ps(real_output_i32); - - imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); - imag_output = _mm_srli_si128 (imag_output, 8); - imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); - imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2); - imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); - - real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps); - imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps); - - //Get prompt values - y1 = _mm_lddqu_si128((__m128i*)P_code_ptr); - P_code_ptr += 4; - y2 = _mm_lddqu_si128((__m128i*)P_code_ptr); - - imagy = _mm_srli_si128 (y1, 2); - imagy = _mm_blend_epi16 (y2, imagy, 85); - realy = _mm_slli_si128 (y2, 2); - realy = _mm_blend_epi16 
(realy, y1, 85); - - realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); - imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); - realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); - imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); - - real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); - imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); - - real_output_i_1 = _mm_cvtepi16_epi32(real_output); - real_output = _mm_srli_si128 (real_output, 8); - real_output_i_2 = _mm_cvtepi16_epi32(real_output); - real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); - real_output_ps = _mm_cvtepi32_ps(real_output_i32); - - imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); - imag_output = _mm_srli_si128 (imag_output, 8); - imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); - imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2); - imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); - - real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps); - imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps); - - //Get late values - y1 = _mm_lddqu_si128((__m128i*)L_code_ptr); - L_code_ptr += 4; - y2 = _mm_lddqu_si128((__m128i*)L_code_ptr); - - imagy = _mm_srli_si128 (y1, 2); - imagy = _mm_blend_epi16 (y2, imagy, 85); - realy = _mm_slli_si128 (y2, 2); - realy = _mm_blend_epi16 (realy, y1, 85); - - realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); - imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); - realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); - imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); - - real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); - imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); - - real_output_i_1 = _mm_cvtepi16_epi32(real_output); - real_output = _mm_srli_si128 (real_output, 8); - real_output_i_2 = _mm_cvtepi16_epi32(real_output); - real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); - real_output_ps = _mm_cvtepi32_ps(real_output_i32); - - imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); - imag_output = _mm_srli_si128 (imag_output, 8); - imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); - imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2); - imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); - - real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps); - imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps); - - input_ptr += 4; - carrier_ptr += 4; - E_code_ptr += 4; - L_code_ptr += 4; - P_code_ptr += 4; - } - - __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4]; - - _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector - _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector - _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector - _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector - _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the 
results back into the dot product vector - _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector - - for (int i = 0; i<4; ++i) { - E_out_real += real_E_dotProductVector[i]; - E_out_imag += imag_E_dotProductVector[i]; - P_out_real += real_P_dotProductVector[i]; - P_out_imag += imag_P_dotProductVector[i]; - L_out_real += real_L_dotProductVector[i]; - L_out_imag += imag_L_dotProductVector[i]; + for(unsigned int number = 0;number < sse_iters; number++) + { + //Perform the carrier wipe-off + x1 = _mm_lddqu_si128((__m128i*)input_ptr); + input_ptr += 4; + x2 = _mm_lddqu_si128((__m128i*)input_ptr); + + y1 = _mm_lddqu_si128((__m128i*)carrier_ptr); + carrier_ptr += 4; + y2 = _mm_lddqu_si128((__m128i*)carrier_ptr); + + imagx = _mm_srli_si128 (x1, 2); + imagx = _mm_blend_epi16 (x2, imagx, 85); + realx = _mm_slli_si128 (x2, 2); + realx = _mm_blend_epi16 (realx, x1, 85); + + imagy = _mm_srli_si128 (y1, 2); + imagy = _mm_blend_epi16 (y2, imagy, 85); + realy = _mm_slli_si128 (y2, 2); + realy = _mm_blend_epi16 (realy, y1, 85); + + realx_mult_realy = _mm_mullo_epi16 (realx, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); + realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); + + real_bb_signal_sample = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_bb_signal_sample = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + //Get early values + y1 = _mm_lddqu_si128((__m128i*)E_code_ptr); + E_code_ptr += 4; + y2 = _mm_lddqu_si128((__m128i*)E_code_ptr); + + imagy = _mm_srli_si128 (y1, 2); + imagy = _mm_blend_epi16 (y2, imagy, 85); + realy = _mm_slli_si128 (y2, 2); + realy = _mm_blend_epi16 (realy, y1, 85); + + realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); + realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); + + real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + real_output_i_1 = _mm_cvtepi16_epi32(real_output); + real_output = _mm_srli_si128 (real_output, 8); + real_output_i_2 = _mm_cvtepi16_epi32(real_output); + real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); + real_output_ps = _mm_cvtepi32_ps(real_output_i32); + + imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); + imag_output = _mm_srli_si128 (imag_output, 8); + imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); + imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2); + imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); + + real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps); + imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps); + + //Get prompt values + y1 = _mm_lddqu_si128((__m128i*)P_code_ptr); + P_code_ptr += 4; + y2 = _mm_lddqu_si128((__m128i*)P_code_ptr); + + imagy = _mm_srli_si128 (y1, 2); + imagy = _mm_blend_epi16 (y2, imagy, 85); + realy = _mm_slli_si128 (y2, 2); + realy = _mm_blend_epi16 (realy, y1, 85); + + realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); + realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); + + real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_output = _mm_add_epi16 (realx_mult_imagy, 
imagx_mult_realy); + + real_output_i_1 = _mm_cvtepi16_epi32(real_output); + real_output = _mm_srli_si128 (real_output, 8); + real_output_i_2 = _mm_cvtepi16_epi32(real_output); + real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); + real_output_ps = _mm_cvtepi32_ps(real_output_i32); + + imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); + imag_output = _mm_srli_si128 (imag_output, 8); + imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); + imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2); + imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); + + real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps); + imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps); + + //Get late values + y1 = _mm_lddqu_si128((__m128i*)L_code_ptr); + L_code_ptr += 4; + y2 = _mm_lddqu_si128((__m128i*)L_code_ptr); + + imagy = _mm_srli_si128 (y1, 2); + imagy = _mm_blend_epi16 (y2, imagy, 85); + realy = _mm_slli_si128 (y2, 2); + realy = _mm_blend_epi16 (realy, y1, 85); + + realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); + realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); + + real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + real_output_i_1 = _mm_cvtepi16_epi32(real_output); + real_output = _mm_srli_si128 (real_output, 8); + real_output_i_2 = _mm_cvtepi16_epi32(real_output); + real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); + real_output_ps = _mm_cvtepi32_ps(real_output_i32); + + imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); + imag_output = _mm_srli_si128 (imag_output, 8); + imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); + imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2); + imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); + + real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps); + imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps); + + input_ptr += 4; + carrier_ptr += 4; + E_code_ptr += 4; + L_code_ptr += 4; + P_code_ptr += 4; + } + + __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4]; + + _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector + + for (int i = 0; i<4; ++i) + { + E_out_real += real_E_dotProductVector[i]; + E_out_imag += imag_E_dotProductVector[i]; + P_out_real += real_P_dotProductVector[i]; + P_out_imag += imag_P_dotProductVector[i]; + L_out_real += real_L_dotProductVector[i]; + 
L_out_imag += imag_L_dotProductVector[i]; + } + *E_out_ptr = lv_cmake(E_out_real, E_out_imag); + *P_out_ptr = lv_cmake(P_out_real, P_out_imag); + *L_out_ptr = lv_cmake(L_out_real, L_out_imag); } - *E_out_ptr = lv_cmake(E_out_real, E_out_imag); - *P_out_ptr = lv_cmake(P_out_real, P_out_imag); - *L_out_ptr = lv_cmake(L_out_real, L_out_imag); - } - + lv_16sc_t bb_signal_sample; for(unsigned int i=0; i < num_points%8; ++i) - { - //Perform the carrier wipe-off - bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); - // Now get early, late, and prompt values for each - *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); - *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); - *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); - } + { + //Perform the carrier wipe-off + bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); + // Now get early, late, and prompt values for each + *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); + *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); + *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); + } } #endif /* LV_HAVE_SSE4_1 */ @@ -934,148 +935,148 @@ static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_u_sse4_1_fourth static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_u_sse4_1_fifth(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points) { const unsigned int sse_iters = num_points / 8; - + __m128i realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy; __m128i input_i_1, input_i_2, output_i32; - + __m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample; __m128i realx, imagx, realy, imagy, real_output, imag_output; - + __m128 real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc; __m128 real_output_ps, imag_output_ps; - + float E_out_real = 0; float E_out_imag = 0; float P_out_real = 0; float P_out_imag = 0; float L_out_real = 0; float L_out_imag = 0; - + const lv_16sc_t* input_ptr = input; const lv_16sc_t* carrier_ptr = carrier; - + const lv_16sc_t* E_code_ptr = E_code; lv_32fc_t* E_out_ptr = E_out; const lv_16sc_t* L_code_ptr = L_code; lv_32fc_t* L_out_ptr = L_out; const lv_16sc_t* P_code_ptr = P_code; lv_32fc_t* P_out_ptr = P_out; - + *E_out_ptr = 0; *P_out_ptr = 0; *L_out_ptr = 0; - + real_E_code_acc = _mm_setzero_ps(); imag_E_code_acc = _mm_setzero_ps(); real_P_code_acc = _mm_setzero_ps(); imag_P_code_acc = _mm_setzero_ps(); real_L_code_acc = _mm_setzero_ps(); imag_L_code_acc = _mm_setzero_ps(); - + if (sse_iters>0) - { - for(unsigned int number = 0;number < sse_iters; number++){ - - //Perform the carrier wipe-off - x1 = _mm_lddqu_si128((__m128i*)input_ptr); - input_ptr += 4; - x2 = _mm_lddqu_si128((__m128i*)input_ptr); - - y1 = _mm_lddqu_si128((__m128i*)carrier_ptr); - carrier_ptr += 4; - y2 = _mm_lddqu_si128((__m128i*)carrier_ptr); - - CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(x1, x2, realx, imagx) - CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy) - CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample) - - //Get early values - y1 = _mm_lddqu_si128((__m128i*)E_code_ptr); - E_code_ptr += 4; - y2 = _mm_lddqu_si128((__m128i*)E_code_ptr); - - 
CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy) - CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(real_bb_signal_sample, imag_bb_signal_sample, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output) - - CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(real_output, input_i_1, input_i_2, output_i32, real_output_ps) - CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(imag_output, input_i_1, input_i_2, output_i32, imag_output_ps) - - real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps); - imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps); - - //Get prompt values - y1 = _mm_lddqu_si128((__m128i*)P_code_ptr); - P_code_ptr += 4; - y2 = _mm_lddqu_si128((__m128i*)P_code_ptr); - - CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy) - CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(real_bb_signal_sample, imag_bb_signal_sample, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output) - - CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(real_output, input_i_1, input_i_2, output_i32, real_output_ps) - CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(imag_output, input_i_1, input_i_2, output_i32, imag_output_ps) - - real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps); - imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps); - - //Get late values - y1 = _mm_lddqu_si128((__m128i*)L_code_ptr); - L_code_ptr += 4; - y2 = _mm_lddqu_si128((__m128i*)L_code_ptr); - - CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy) - CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(real_bb_signal_sample, imag_bb_signal_sample, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output) - - CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(real_output, input_i_1, input_i_2, output_i32, real_output_ps) - CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(imag_output, input_i_1, input_i_2, output_i32, imag_output_ps) - - real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps); - imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps); - - input_ptr += 4; - carrier_ptr += 4; - E_code_ptr += 4; - L_code_ptr += 4; - P_code_ptr += 4; - } - - __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4]; - - _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector - _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector - _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector - _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector - _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector - _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector - - for (int i = 0; i<4; ++i) { - E_out_real += real_E_dotProductVector[i]; - E_out_imag += imag_E_dotProductVector[i]; - P_out_real += real_P_dotProductVector[i]; - P_out_imag += imag_P_dotProductVector[i]; - 
L_out_real += real_L_dotProductVector[i]; - L_out_imag += imag_L_dotProductVector[i]; + for(unsigned int number = 0;number < sse_iters; number++) + { + //Perform the carrier wipe-off + x1 = _mm_lddqu_si128((__m128i*)input_ptr); + input_ptr += 4; + x2 = _mm_lddqu_si128((__m128i*)input_ptr); + + y1 = _mm_lddqu_si128((__m128i*)carrier_ptr); + carrier_ptr += 4; + y2 = _mm_lddqu_si128((__m128i*)carrier_ptr); + + CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(x1, x2, realx, imagx) + CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy) + CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample) + + //Get early values + y1 = _mm_lddqu_si128((__m128i*)E_code_ptr); + E_code_ptr += 4; + y2 = _mm_lddqu_si128((__m128i*)E_code_ptr); + + CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy) + CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(real_bb_signal_sample, imag_bb_signal_sample, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output) + + CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(real_output, input_i_1, input_i_2, output_i32, real_output_ps) + CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(imag_output, input_i_1, input_i_2, output_i32, imag_output_ps) + + real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps); + imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps); + + //Get prompt values + y1 = _mm_lddqu_si128((__m128i*)P_code_ptr); + P_code_ptr += 4; + y2 = _mm_lddqu_si128((__m128i*)P_code_ptr); + + CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy) + CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(real_bb_signal_sample, imag_bb_signal_sample, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output) + + CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(real_output, input_i_1, input_i_2, output_i32, real_output_ps) + CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(imag_output, input_i_1, input_i_2, output_i32, imag_output_ps) + + real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps); + imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps); + + //Get late values + y1 = _mm_lddqu_si128((__m128i*)L_code_ptr); + L_code_ptr += 4; + y2 = _mm_lddqu_si128((__m128i*)L_code_ptr); + + CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy) + CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(real_bb_signal_sample, imag_bb_signal_sample, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output) + + CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(real_output, input_i_1, input_i_2, output_i32, real_output_ps) + CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(imag_output, input_i_1, input_i_2, output_i32, imag_output_ps) + + real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps); + imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps); + + input_ptr += 4; + carrier_ptr += 4; + E_code_ptr += 4; + L_code_ptr += 4; + P_code_ptr += 4; + } + + __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4]; + + 
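// Spill the six SSE accumulators (real/imag x E/P/L) to 16-byte-aligned
+ // scratch arrays and sum the four lanes of each in scalar code below:
+ // SSE4.1 offers no single-instruction horizontal sum of a full __m128.
+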
_mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector + + for (int i = 0; i<4; ++i) + { + E_out_real += real_E_dotProductVector[i]; + E_out_imag += imag_E_dotProductVector[i]; + P_out_real += real_P_dotProductVector[i]; + P_out_imag += imag_P_dotProductVector[i]; + L_out_real += real_L_dotProductVector[i]; + L_out_imag += imag_L_dotProductVector[i]; + } + *E_out_ptr = lv_cmake(E_out_real, E_out_imag); + *P_out_ptr = lv_cmake(P_out_real, P_out_imag); + *L_out_ptr = lv_cmake(L_out_real, L_out_imag); } - *E_out_ptr = lv_cmake(E_out_real, E_out_imag); - *P_out_ptr = lv_cmake(P_out_real, P_out_imag); - *L_out_ptr = lv_cmake(L_out_real, L_out_imag); - } - + lv_16sc_t bb_signal_sample; for(unsigned int i=0; i < num_points%8; ++i) - { - //Perform the carrier wipe-off - bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); - // Now get early, late, and prompt values for each - *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); - *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); - *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); - } + { + //Perform the carrier wipe-off + bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); + // Now get early, late, and prompt values for each + *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); + *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); + *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); + } } #endif /* LV_HAVE_SSE4_1 */ @@ -1138,97 +1139,97 @@ static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_u_sse4_1_sixth( imag_L_code_acc = _mm_setzero_ps(); if (sse_iters>0) - { - for(unsigned int number = 0;number < sse_iters; number++){ - - //Perform the carrier wipe-off - x1 = _mm_lddqu_si128((__m128i*)input_ptr); - input_ptr += 4; - x2 = _mm_lddqu_si128((__m128i*)input_ptr); - - y1 = _mm_lddqu_si128((__m128i*)carrier_ptr); - carrier_ptr += 4; - y2 = _mm_lddqu_si128((__m128i*)carrier_ptr); - - CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(x1, x2, realx, imagx) - CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy) - CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample) - - //Get early values - y1 = _mm_lddqu_si128((__m128i*)E_code_ptr); - E_code_ptr += 4; - y2 = _mm_lddqu_si128((__m128i*)E_code_ptr); - - CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) - - real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps); - imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps); - - //Get prompt values - y1 = 
_mm_lddqu_si128((__m128i*)P_code_ptr); - P_code_ptr += 4; - y2 = _mm_lddqu_si128((__m128i*)P_code_ptr); - - CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) - - real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps); - imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps); - - //Get late values - y1 = _mm_lddqu_si128((__m128i*)L_code_ptr); - L_code_ptr += 4; - y2 = _mm_lddqu_si128((__m128i*)L_code_ptr); - - CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) - - real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps); - imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps); - - input_ptr += 4; - carrier_ptr += 4; - E_code_ptr += 4; - L_code_ptr += 4; - P_code_ptr += 4; - } - - __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4]; - - _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector - _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector - _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector - _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector - _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector - _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector - - for (int i = 0; i<4; ++i) { - E_out_real += real_E_dotProductVector[i]; - E_out_imag += imag_E_dotProductVector[i]; - P_out_real += real_P_dotProductVector[i]; - P_out_imag += imag_P_dotProductVector[i]; - L_out_real += real_L_dotProductVector[i]; - L_out_imag += imag_L_dotProductVector[i]; + for(unsigned int number = 0;number < sse_iters; number++) + { + //Perform the carrier wipe-off + x1 = _mm_lddqu_si128((__m128i*)input_ptr); + input_ptr += 4; + x2 = _mm_lddqu_si128((__m128i*)input_ptr); + + y1 = _mm_lddqu_si128((__m128i*)carrier_ptr); + carrier_ptr += 4; + y2 = _mm_lddqu_si128((__m128i*)carrier_ptr); + + CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(x1, x2, realx, imagx) + CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy) + CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample) + + //Get early values + y1 = _mm_lddqu_si128((__m128i*)E_code_ptr); + E_code_ptr += 4; + y2 = _mm_lddqu_si128((__m128i*)E_code_ptr); + + CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, 
input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) + + real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps); + imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps); + + //Get prompt values + y1 = _mm_lddqu_si128((__m128i*)P_code_ptr); + P_code_ptr += 4; + y2 = _mm_lddqu_si128((__m128i*)P_code_ptr); + + CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) + + real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps); + imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps); + + //Get late values + y1 = _mm_lddqu_si128((__m128i*)L_code_ptr); + L_code_ptr += 4; + y2 = _mm_lddqu_si128((__m128i*)L_code_ptr); + + CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) + + real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps); + imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps); + + input_ptr += 4; + carrier_ptr += 4; + E_code_ptr += 4; + L_code_ptr += 4; + P_code_ptr += 4; + } + + __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4]; + + _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector + + for (int i = 0; i<4; ++i) + { + E_out_real += real_E_dotProductVector[i]; + E_out_imag += imag_E_dotProductVector[i]; + P_out_real += real_P_dotProductVector[i]; + P_out_imag += imag_P_dotProductVector[i]; + L_out_real += real_L_dotProductVector[i]; + L_out_imag += imag_L_dotProductVector[i]; + } + *E_out_ptr = lv_cmake(E_out_real, E_out_imag); + *P_out_ptr = lv_cmake(P_out_real, P_out_imag); + *L_out_ptr = lv_cmake(L_out_real, L_out_imag); } - *E_out_ptr = lv_cmake(E_out_real, E_out_imag); - *P_out_ptr = lv_cmake(P_out_real, P_out_imag); - *L_out_ptr = lv_cmake(L_out_real, L_out_imag); - } - + lv_16sc_t bb_signal_sample; for(unsigned int i=0; i < num_points%8; ++i) - { - //Perform the carrier wipe-off - bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); - // Now get early, late, and prompt values for each - *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); - *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); - *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); - } + { + //Perform the carrier wipe-off + bb_signal_sample = 
(*input_ptr++) * (*carrier_ptr++); + // Now get early, late, and prompt values for each + *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); + *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); + *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); + } } #endif /* LV_HAVE_SSE4_1 */ @@ -1251,28 +1252,28 @@ static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_generic(lv_32fc lv_16sc_t tmp1; lv_16sc_t tmp2; lv_16sc_t tmp3; - + bb_signal_sample = lv_cmake(0, 0); - + *E_out = 0; *P_out = 0; *L_out = 0; // perform Early, Prompt and Late correlation - + for(unsigned int i=0; i < num_points; ++i) - { - //Perform the carrier wipe-off - bb_signal_sample = input[i] * carrier[i]; - - tmp1 = bb_signal_sample * E_code[i]; - tmp2 = bb_signal_sample * P_code[i]; - tmp3 = bb_signal_sample * L_code[i]; - - // Now get early, late, and prompt values for each - *E_out += (lv_32fc_t)tmp1; - *P_out += (lv_32fc_t)tmp2; - *L_out += (lv_32fc_t)tmp3; - } + { + //Perform the carrier wipe-off + bb_signal_sample = input[i] * carrier[i]; + + tmp1 = bb_signal_sample * E_code[i]; + tmp2 = bb_signal_sample * P_code[i]; + tmp3 = bb_signal_sample * L_code[i]; + + // Now get early, late, and prompt values for each + *E_out += (lv_32fc_t)tmp1; + *P_out += (lv_32fc_t)tmp2; + *L_out += (lv_32fc_t)tmp3; + } } #endif /* LV_HAVE_GENERIC */ #endif /* INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_u_H */ diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5.h index 07c2cfe51..6e8c68008 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5.h @@ -2,7 +2,7 @@ * \file volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5.h * \brief Volk protokernel: performs the carrier wipe-off mixing and the Very early, Early, Prompt, Late and very late correlation with 32 bits vectors and returns float32 values. * \authors * * Volk protokernel that performs the carrier wipe-off mixing and the @@ -34,7 +34,7 @@ * GNSS-SDR is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or - * at your option) any later version. + * (at your option) any later version. 
* * GNSS-SDR is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_16ic.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_16ic.h index ade8b6763..181a178c0 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_16ic.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_16ic.h @@ -2,7 +2,7 @@ * \file volk_gnsssdr_32fc_convert_16ic.h * \brief Volk protokernel: converts float32 complex values to 16 integer complex values taking care of overflow * \authors * * ------------------------------------------------------------------------- @@ -17,7 +17,7 @@ * GNSS-SDR is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or - * at your option) any later version. + * (at your option) any later version. * * GNSS-SDR is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of @@ -47,43 +47,44 @@ */ static inline void volk_gnsssdr_32fc_convert_16ic_u_sse2(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){ const unsigned int sse_iters = num_points/4; - + float* inputVectorPtr = (float*)inputVector; int16_t* outputVectorPtr = (int16_t*)outputVector; - + float min_val = -32768; float max_val = 32767; - + __m128 inputVal1, inputVal2; __m128i intInputVal1, intInputVal2; __m128 ret1, ret2; __m128 vmin_val = _mm_set_ps1(min_val); __m128 vmax_val = _mm_set_ps1(max_val); - + for(unsigned int i = 0;i < sse_iters; i++){ - inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; - inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; - - // Clip - ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val); - ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val); - - intInputVal1 = _mm_cvtps_epi32(ret1); - intInputVal2 = _mm_cvtps_epi32(ret2); - - intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); - - _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); - outputVectorPtr += 8; - } - - for(unsigned int i = 0; i < (num_points%4)*2; i++){ - if(inputVectorPtr[i] > max_val) - inputVectorPtr[i] = max_val; - else if(inputVectorPtr[i] < min_val) - inputVectorPtr[i] = min_val; - outputVectorPtr[i] = (int16_t)rintf(inputVectorPtr[i]); + inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; + inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; + + // Clip + ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val); + ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val); + + intInputVal1 = _mm_cvtps_epi32(ret1); + intInputVal2 = _mm_cvtps_epi32(ret2); + + intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); + + _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 8; } + + for(unsigned int i = 0; i < (num_points%4)*2; i++) + { + if(inputVectorPtr[i] > max_val) + inputVectorPtr[i] = max_val; + else if(inputVectorPtr[i] < min_val) + inputVectorPtr[i] = min_val; + outputVectorPtr[i] = (int16_t)rintf(inputVectorPtr[i]); + } } #endif /* LV_HAVE_SSE2 */ @@ -97,43 +98,44 @@ static inline void volk_gnsssdr_32fc_convert_16ic_u_sse2(lv_16sc_t* 
outputVector */ static inline void volk_gnsssdr_32fc_convert_16ic_u_sse(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){ const unsigned int sse_iters = num_points/4; - + float* inputVectorPtr = (float*)inputVector; int16_t* outputVectorPtr = (int16_t*)outputVector; - + float min_val = -32768; float max_val = 32767; - + __m128 inputVal1, inputVal2; __m128i intInputVal1, intInputVal2; __m128 ret1, ret2; __m128 vmin_val = _mm_set_ps1(min_val); __m128 vmax_val = _mm_set_ps1(max_val); - + for(unsigned int i = 0;i < sse_iters; i++){ - inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; - inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; - - // Clip - ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val); - ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val); - - intInputVal1 = _mm_cvtps_epi32(ret1); - intInputVal2 = _mm_cvtps_epi32(ret2); - - intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); - - _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); - outputVectorPtr += 8; - } - - for(unsigned int i = 0; i < (num_points%4)*2; i++){ - if(inputVectorPtr[i] > max_val) - inputVectorPtr[i] = max_val; - else if(inputVectorPtr[i] < min_val) - inputVectorPtr[i] = min_val; - outputVectorPtr[i] = (int16_t)rintf(inputVectorPtr[i]); + inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; + inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; + + // Clip + ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val); + ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val); + + intInputVal1 = _mm_cvtps_epi32(ret1); + intInputVal2 = _mm_cvtps_epi32(ret2); + + intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); + + _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 8; } + + for(unsigned int i = 0; i < (num_points%4)*2; i++) + { + if(inputVectorPtr[i] > max_val) + inputVectorPtr[i] = max_val; + else if(inputVectorPtr[i] < min_val) + inputVectorPtr[i] = min_val; + outputVectorPtr[i] = (int16_t)rintf(inputVectorPtr[i]); + } } #endif /* LV_HAVE_SSE */ @@ -149,14 +151,15 @@ static inline void volk_gnsssdr_32fc_convert_16ic_generic(lv_16sc_t* outputVecto int16_t* outputVectorPtr = (int16_t*)outputVector; float min_val = -32768; float max_val = 32767; - - for(unsigned int i = 0; i < num_points*2; i++){ - if(inputVectorPtr[i] > max_val) - inputVectorPtr[i] = max_val; - else if(inputVectorPtr[i] < min_val) - inputVectorPtr[i] = min_val; - outputVectorPtr[i] = (int16_t)rintf(inputVectorPtr[i]); - } + + for(unsigned int i = 0; i < num_points*2; i++) + { + if(inputVectorPtr[i] > max_val) + inputVectorPtr[i] = max_val; + else if(inputVectorPtr[i] < min_val) + inputVectorPtr[i] = min_val; + outputVectorPtr[i] = (int16_t)rintf(inputVectorPtr[i]); + } } #endif /* LV_HAVE_GENERIC */ #endif /* INCLUDED_volk_gnsssdr_32fc_convert_16ic_u_H */ @@ -180,43 +183,45 @@ static inline void volk_gnsssdr_32fc_convert_16ic_generic(lv_16sc_t* outputVecto */ static inline void volk_gnsssdr_32fc_convert_16ic_a_sse2(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){ const unsigned int sse_iters = num_points/4; - + float* inputVectorPtr = (float*)inputVector; int16_t* outputVectorPtr = (int16_t*)outputVector; - + float min_val = -32768; float max_val = 32767; - + __m128 inputVal1, inputVal2; __m128i intInputVal1, intInputVal2; __m128 ret1, ret2; __m128 vmin_val = _mm_set_ps1(min_val); __m128 vmax_val = _mm_set_ps1(max_val); - - 
for(unsigned int i = 0;i < sse_iters; i++){ - inputVal1 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; - inputVal2 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; - - // Clip - ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val); - ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val); - - intInputVal1 = _mm_cvtps_epi32(ret1); - intInputVal2 = _mm_cvtps_epi32(ret2); - - intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); - - _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); - outputVectorPtr += 8; - } - - for(unsigned int i = 0; i < (num_points%4)*2; i++){ - if(inputVectorPtr[i] > max_val) - inputVectorPtr[i] = max_val; - else if(inputVectorPtr[i] < min_val) - inputVectorPtr[i] = min_val; - outputVectorPtr[i] = (int16_t)rintf(inputVectorPtr[i]); - } + + for(unsigned int i = 0;i < sse_iters; i++) + { + inputVal1 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; + inputVal2 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; + + // Clip + ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val); + ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val); + + intInputVal1 = _mm_cvtps_epi32(ret1); + intInputVal2 = _mm_cvtps_epi32(ret2); + + intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); + + _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 8; + } + + for(unsigned int i = 0; i < (num_points%4)*2; i++) + { + if(inputVectorPtr[i] > max_val) + inputVectorPtr[i] = max_val; + else if(inputVectorPtr[i] < min_val) + inputVectorPtr[i] = min_val; + outputVectorPtr[i] = (int16_t)rintf(inputVectorPtr[i]); + } } #endif /* LV_HAVE_SSE2 */ @@ -228,45 +233,48 @@ static inline void volk_gnsssdr_32fc_convert_16ic_a_sse2(lv_16sc_t* outputVector \param outputVector The 16 bit output data buffer \param num_points The number of data values to be converted */ -static inline void volk_gnsssdr_32fc_convert_16ic_a_sse(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){ +static inline void volk_gnsssdr_32fc_convert_16ic_a_sse(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points) +{ const unsigned int sse_iters = num_points/4; - + float* inputVectorPtr = (float*)inputVector; int16_t* outputVectorPtr = (int16_t*)outputVector; - + float min_val = -32768; float max_val = 32767; - + __m128 inputVal1, inputVal2; __m128i intInputVal1, intInputVal2; __m128 ret1, ret2; __m128 vmin_val = _mm_set_ps1(min_val); __m128 vmax_val = _mm_set_ps1(max_val); - - for(unsigned int i = 0;i < sse_iters; i++){ - inputVal1 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; - inputVal2 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; - - // Clip - ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val); - ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val); - - intInputVal1 = _mm_cvtps_epi32(ret1); - intInputVal2 = _mm_cvtps_epi32(ret2); - - intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); - - _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); - outputVectorPtr += 8; - } - - for(unsigned int i = 0; i < (num_points%4)*2; i++){ - if(inputVectorPtr[i] > max_val) - inputVectorPtr[i] = max_val; - else if(inputVectorPtr[i] < min_val) - inputVectorPtr[i] = min_val; - outputVectorPtr[i] = (int16_t)rintf(inputVectorPtr[i]); - } + + for(unsigned int i = 0;i < sse_iters; i++) + { + inputVal1 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; + inputVal2 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; + + // 
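Bound each sample to the int16 range [-32768, 32767] before the
+ // float-to-integer conversion and the 32-to-16-bit pack:
+ //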
Clip + ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val); + ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val); + + intInputVal1 = _mm_cvtps_epi32(ret1); + intInputVal2 = _mm_cvtps_epi32(ret2); + + intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); + + _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 8; + } + + for(unsigned int i = 0; i < (num_points%4)*2; i++) + { + if(inputVectorPtr[i] > max_val) + inputVectorPtr[i] = max_val; + else if(inputVectorPtr[i] < min_val) + inputVectorPtr[i] = min_val; + outputVectorPtr[i] = (int16_t)rintf(inputVectorPtr[i]); + } } #endif /* LV_HAVE_SSE */ @@ -277,19 +285,21 @@ static inline void volk_gnsssdr_32fc_convert_16ic_a_sse(lv_16sc_t* outputVector, \param outputVector The 16 bit output data buffer \param num_points The number of data values to be converted */ -static inline void volk_gnsssdr_32fc_convert_16ic_a_generic(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){ +static inline void volk_gnsssdr_32fc_convert_16ic_a_generic(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points) +{ float* inputVectorPtr = (float*)inputVector; int16_t* outputVectorPtr = (int16_t*)outputVector; float min_val = -32768; float max_val = 32767; - - for(unsigned int i = 0; i < num_points*2; i++){ - if(inputVectorPtr[i] > max_val) - inputVectorPtr[i] = max_val; - else if(inputVectorPtr[i] < min_val) - inputVectorPtr[i] = min_val; - outputVectorPtr[i] = (int16_t)rintf(inputVectorPtr[i]); - } + + for(unsigned int i = 0; i < num_points*2; i++) + { + if(inputVectorPtr[i] > max_val) + inputVectorPtr[i] = max_val; + else if(inputVectorPtr[i] < min_val) + inputVectorPtr[i] = min_val; + outputVectorPtr[i] = (int16_t)rintf(inputVectorPtr[i]); + } } #endif /* LV_HAVE_GENERIC */ #endif /* INCLUDED_volk_gnsssdr_32fc_convert_16ic_a_H */ diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_8ic.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_8ic.h index 5a97b4827..637f3aee4 100755 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_8ic.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_8ic.h @@ -2,7 +2,7 @@ * \file volk_gnsssdr_32fc_convert_8ic.h * \brief Volk protokernel: converts float32 complex values to 8 integer complex values taking care of overflow * \authors * * ------------------------------------------------------------------------- @@ -17,7 +17,7 @@ * GNSS-SDR is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or - * at your option) any later version. + * (at your option) any later version. 
* * GNSS-SDR is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of @@ -45,54 +45,57 @@ \param outputVector The 16 bit output data buffer \param num_points The number of data values to be converted */ -static inline void volk_gnsssdr_32fc_convert_8ic_u_sse2(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){ +static inline void volk_gnsssdr_32fc_convert_8ic_u_sse2(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points) +{ const unsigned int sse_iters = num_points/8; - + float* inputVectorPtr = (float*)inputVector; int8_t* outputVectorPtr = (int8_t*)outputVector; - + float min_val = -128; float max_val = 127; - + __m128 inputVal1, inputVal2, inputVal3, inputVal4; __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4; __m128i int8InputVal; __m128 ret1, ret2, ret3, ret4; __m128 vmin_val = _mm_set_ps1(min_val); __m128 vmax_val = _mm_set_ps1(max_val); - - for(unsigned int i = 0;i < sse_iters; i++){ - inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; - inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; - inputVal3 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; - inputVal4 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; - - // Clip - ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val); - ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val); - ret3 = _mm_max_ps(_mm_min_ps(inputVal3, vmax_val), vmin_val); - ret4 = _mm_max_ps(_mm_min_ps(inputVal4, vmax_val), vmin_val); - - intInputVal1 = _mm_cvtps_epi32(ret1); - intInputVal2 = _mm_cvtps_epi32(ret2); - intInputVal3 = _mm_cvtps_epi32(ret3); - intInputVal4 = _mm_cvtps_epi32(ret4); - - intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); - intInputVal2 = _mm_packs_epi32(intInputVal3, intInputVal4); - int8InputVal = _mm_packs_epi16(intInputVal1, intInputVal2); - - _mm_storeu_si128((__m128i*)outputVectorPtr, int8InputVal); - outputVectorPtr += 16; - } - - for(unsigned int i = 0; i < (num_points%4)*4; i++){ - if(inputVectorPtr[i] > max_val) - inputVectorPtr[i] = max_val; - else if(inputVectorPtr[i] < min_val) - inputVectorPtr[i] = min_val; - outputVectorPtr[i] = (int8_t)rintf(inputVectorPtr[i]); - } + + for(unsigned int i = 0;i < sse_iters; i++) + { + inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; + inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; + inputVal3 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; + inputVal4 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; + + // Clip + ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val); + ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val); + ret3 = _mm_max_ps(_mm_min_ps(inputVal3, vmax_val), vmin_val); + ret4 = _mm_max_ps(_mm_min_ps(inputVal4, vmax_val), vmin_val); + + intInputVal1 = _mm_cvtps_epi32(ret1); + intInputVal2 = _mm_cvtps_epi32(ret2); + intInputVal3 = _mm_cvtps_epi32(ret3); + intInputVal4 = _mm_cvtps_epi32(ret4); + + intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); + intInputVal2 = _mm_packs_epi32(intInputVal3, intInputVal4); + int8InputVal = _mm_packs_epi16(intInputVal1, intInputVal2); + + _mm_storeu_si128((__m128i*)outputVectorPtr, int8InputVal); + outputVectorPtr += 16; + } + + for(unsigned int i = 0; i < (num_points%4)*4; i++) + { + if(inputVectorPtr[i] > max_val) + inputVectorPtr[i] = max_val; + else if(inputVectorPtr[i] < min_val) + inputVectorPtr[i] = min_val; + outputVectorPtr[i] = 
(int8_t)rintf(inputVectorPtr[i]); + } } #endif /* LV_HAVE_SSE2 */ @@ -103,19 +106,21 @@ static inline void volk_gnsssdr_32fc_convert_8ic_u_sse2(lv_8sc_t* outputVector, \param outputVector The 16 bit output data buffer \param num_points The number of data values to be converted */ -static inline void volk_gnsssdr_32fc_convert_8ic_generic(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){ +static inline void volk_gnsssdr_32fc_convert_8ic_generic(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points) +{ float* inputVectorPtr = (float*)inputVector; int8_t* outputVectorPtr = (int8_t*)outputVector; float min_val = -128; float max_val = 127; - - for(unsigned int i = 0; i < num_points*2; i++){ - if(inputVectorPtr[i] > max_val) - inputVectorPtr[i] = max_val; - else if(inputVectorPtr[i] < min_val) - inputVectorPtr[i] = min_val; - outputVectorPtr[i] = (int8_t)rintf(inputVectorPtr[i]); - } + + for(unsigned int i = 0; i < num_points*2; i++) + { + if(inputVectorPtr[i] > max_val) + inputVectorPtr[i] = max_val; + else if(inputVectorPtr[i] < min_val) + inputVectorPtr[i] = min_val; + outputVectorPtr[i] = (int8_t)rintf(inputVectorPtr[i]); + } } #endif /* LV_HAVE_GENERIC */ #endif /* INCLUDED_volk_gnsssdr_32fc_convert_8ic_u_H */ @@ -137,54 +142,57 @@ static inline void volk_gnsssdr_32fc_convert_8ic_generic(lv_8sc_t* outputVector, \param outputVector The 16 bit output data buffer \param num_points The number of data values to be converted */ -static inline void volk_gnsssdr_32fc_convert_8ic_a_sse2(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){ +static inline void volk_gnsssdr_32fc_convert_8ic_a_sse2(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points) +{ const unsigned int sse_iters = num_points/8; - + float* inputVectorPtr = (float*)inputVector; int8_t* outputVectorPtr = (int8_t*)outputVector; - + float min_val = -128; float max_val = 127; - + __m128 inputVal1, inputVal2, inputVal3, inputVal4; __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4; __m128i int8InputVal; __m128 ret1, ret2, ret3, ret4; __m128 vmin_val = _mm_set_ps1(min_val); __m128 vmax_val = _mm_set_ps1(max_val); - - for(unsigned int i = 0;i < sse_iters; i++){ - inputVal1 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; - inputVal2 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; - inputVal3 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; - inputVal4 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; - - // Clip - ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val); - ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val); - ret3 = _mm_max_ps(_mm_min_ps(inputVal3, vmax_val), vmin_val); - ret4 = _mm_max_ps(_mm_min_ps(inputVal4, vmax_val), vmin_val); - - intInputVal1 = _mm_cvtps_epi32(ret1); - intInputVal2 = _mm_cvtps_epi32(ret2); - intInputVal3 = _mm_cvtps_epi32(ret3); - intInputVal4 = _mm_cvtps_epi32(ret4); - - intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); - intInputVal2 = _mm_packs_epi32(intInputVal3, intInputVal4); - int8InputVal = _mm_packs_epi16(intInputVal1, intInputVal2); - - _mm_store_si128((__m128i*)outputVectorPtr, int8InputVal); - outputVectorPtr += 16; - } - - for(unsigned int i = 0; i < (num_points%4)*4; i++){ - if(inputVectorPtr[i] > max_val) - inputVectorPtr[i] = max_val; - else if(inputVectorPtr[i] < min_val) - inputVectorPtr[i] = min_val; - outputVectorPtr[i] = (int8_t)rintf(inputVectorPtr[i]); - } + + for(unsigned int i = 0;i < sse_iters; 
i++) + { + inputVal1 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; + inputVal2 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; + inputVal3 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; + inputVal4 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; + + // Clip + ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val); + ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val); + ret3 = _mm_max_ps(_mm_min_ps(inputVal3, vmax_val), vmin_val); + ret4 = _mm_max_ps(_mm_min_ps(inputVal4, vmax_val), vmin_val); + + intInputVal1 = _mm_cvtps_epi32(ret1); + intInputVal2 = _mm_cvtps_epi32(ret2); + intInputVal3 = _mm_cvtps_epi32(ret3); + intInputVal4 = _mm_cvtps_epi32(ret4); + + intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); + intInputVal2 = _mm_packs_epi32(intInputVal3, intInputVal4); + int8InputVal = _mm_packs_epi16(intInputVal1, intInputVal2); + + _mm_store_si128((__m128i*)outputVectorPtr, int8InputVal); + outputVectorPtr += 16; + } + + for(unsigned int i = 0; i < (num_points%4)*4; i++) + { + if(inputVectorPtr[i] > max_val) + inputVectorPtr[i] = max_val; + else if(inputVectorPtr[i] < min_val) + inputVectorPtr[i] = min_val; + outputVectorPtr[i] = (int8_t)rintf(inputVectorPtr[i]); + } } #endif /* LV_HAVE_SSE2 */ @@ -195,19 +203,21 @@ static inline void volk_gnsssdr_32fc_convert_8ic_a_sse2(lv_8sc_t* outputVector, \param outputVector The 16 bit output data buffer \param num_points The number of data values to be converted */ -static inline void volk_gnsssdr_32fc_convert_8ic_a_generic(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){ +static inline void volk_gnsssdr_32fc_convert_8ic_a_generic(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points) +{ float* inputVectorPtr = (float*)inputVector; int8_t* outputVectorPtr = (int8_t*)outputVector; float min_val = -128; float max_val = 127; - - for(unsigned int i = 0; i < num_points*2; i++){ - if(inputVectorPtr[i] > max_val) - inputVectorPtr[i] = max_val; - else if(inputVectorPtr[i] < min_val) - inputVectorPtr[i] = min_val; - outputVectorPtr[i] = (int8_t)rintf(inputVectorPtr[i]); - } + + for(unsigned int i = 0; i < num_points*2; i++) + { + if(inputVectorPtr[i] > max_val) + inputVectorPtr[i] = max_val; + else if(inputVectorPtr[i] < min_val) + inputVectorPtr[i] = min_val; + outputVectorPtr[i] = (int8_t)rintf(inputVectorPtr[i]); + } } #endif /* LV_HAVE_GENERIC */ #endif /* INCLUDED_volk_gnsssdr_32fc_convert_8ic_a_H */ diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32f_convert_8ic.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32f_convert_8ic.h index 9c33c9870..e24e2bd74 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32f_convert_8ic.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32f_convert_8ic.h @@ -2,7 +2,7 @@ * \file volk_gnsssdr_32fc_s32f_convert_8ic.h * \brief Volk protokernel: converts float32 complex values to 8 integer complex values taking care of overflow * \authors * * ------------------------------------------------------------------------- @@ -17,7 +17,7 @@ * GNSS-SDR is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or - * at your option) any later version. 
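
All of the float-to-int8 conversion kernels in this patch funnel through the same clip-round-narrow step; a minimal scalar sketch of it (the helper name float_to_int8_saturated is illustrative), using the same [-128, 127] bounds as the generic protokernels, is:

#include <math.h>
#include <stdint.h>

/* Scalar reference for the clip-round-narrow step shared by the
   conversion kernels: clamp to the int8_t range, round to nearest,
   then cast. The scaled (s32f) variants divide by the scalar first. */
static inline int8_t float_to_int8_saturated(float sample)
{
    if (sample > 127.0f) sample = 127.0f;        /* upper clip */
    else if (sample < -128.0f) sample = -128.0f; /* lower clip */
    return (int8_t)rintf(sample);                /* round, then narrow */
}

The SSE2 paths keep the explicit _mm_min_ps/_mm_max_ps clip mainly to guard _mm_cvtps_epi32 against out-of-range floats; the following _mm_packs_epi32/_mm_packs_epi16 steps already saturate while narrowing.
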
+ * (at your option) any later version. * * GNSS-SDR is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of @@ -45,61 +45,64 @@ \param outputVector The 16 bit output data buffer \param num_points The number of data values to be converted */ -static inline void volk_gnsssdr_32fc_s32f_convert_8ic_u_sse2(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, const float scalar, unsigned int num_points){ +static inline void volk_gnsssdr_32fc_s32f_convert_8ic_u_sse2(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, const float scalar, unsigned int num_points) +{ const unsigned int sse_iters = num_points/8; - + float* inputVectorPtr = (float*)inputVector; int8_t* outputVectorPtr = (int8_t*)outputVector; __m128 invScalar = _mm_set_ps1(1.0/scalar); - + float min_val = -128; float max_val = 127; - + __m128 inputVal1, inputVal2, inputVal3, inputVal4; __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4; __m128i int8InputVal; __m128 ret1, ret2, ret3, ret4; __m128 vmin_val = _mm_set_ps1(min_val); __m128 vmax_val = _mm_set_ps1(max_val); - - for(unsigned int i = 0;i < sse_iters; i++){ - inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; - inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; - inputVal3 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; - inputVal4 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; - - inputVal1 = _mm_mul_ps(inputVal1, invScalar); - inputVal2 = _mm_mul_ps(inputVal2, invScalar); - inputVal3 = _mm_mul_ps(inputVal3, invScalar); - inputVal4 = _mm_mul_ps(inputVal4, invScalar); - // Clip - ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val); - ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val); - ret3 = _mm_max_ps(_mm_min_ps(inputVal3, vmax_val), vmin_val); - ret4 = _mm_max_ps(_mm_min_ps(inputVal4, vmax_val), vmin_val); - - intInputVal1 = _mm_cvtps_epi32(ret1); - intInputVal2 = _mm_cvtps_epi32(ret2); - intInputVal3 = _mm_cvtps_epi32(ret3); - intInputVal4 = _mm_cvtps_epi32(ret4); - - intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); - intInputVal2 = _mm_packs_epi32(intInputVal3, intInputVal4); - int8InputVal = _mm_packs_epi16(intInputVal1, intInputVal2); - - _mm_storeu_si128((__m128i*)outputVectorPtr, int8InputVal); - outputVectorPtr += 16; - } - + + for(unsigned int i = 0;i < sse_iters; i++) + { + inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; + inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; + inputVal3 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; + inputVal4 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; + + inputVal1 = _mm_mul_ps(inputVal1, invScalar); + inputVal2 = _mm_mul_ps(inputVal2, invScalar); + inputVal3 = _mm_mul_ps(inputVal3, invScalar); + inputVal4 = _mm_mul_ps(inputVal4, invScalar); + // Clip + ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val); + ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val); + ret3 = _mm_max_ps(_mm_min_ps(inputVal3, vmax_val), vmin_val); + ret4 = _mm_max_ps(_mm_min_ps(inputVal4, vmax_val), vmin_val); + + intInputVal1 = _mm_cvtps_epi32(ret1); + intInputVal2 = _mm_cvtps_epi32(ret2); + intInputVal3 = _mm_cvtps_epi32(ret3); + intInputVal4 = _mm_cvtps_epi32(ret4); + + intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); + intInputVal2 = _mm_packs_epi32(intInputVal3, intInputVal4); + int8InputVal = _mm_packs_epi16(intInputVal1, intInputVal2); + + 
_mm_storeu_si128((__m128i*)outputVectorPtr, int8InputVal); + outputVectorPtr += 16; + } + float scaled = 0; - for(unsigned int i = 0; i < (num_points%4)*4; i++){ - scaled = inputVectorPtr[i]/scalar; - if(scaled > max_val) - scaled = max_val; - else if(scaled < min_val) - scaled = min_val; - outputVectorPtr[i] = (int8_t)rintf(scaled); - } + for(unsigned int i = 0; i < (num_points%4)*4; i++) + { + scaled = inputVectorPtr[i]/scalar; + if(scaled > max_val) + scaled = max_val; + else if(scaled < min_val) + scaled = min_val; + outputVectorPtr[i] = (int8_t)rintf(scaled); + } } #endif /* LV_HAVE_SSE2 */ @@ -110,21 +113,23 @@ static inline void volk_gnsssdr_32fc_s32f_convert_8ic_u_sse2(lv_8sc_t* outputVec \param outputVector The 16 bit output data buffer \param num_points The number of data values to be converted */ -static inline void volk_gnsssdr_32fc_s32f_convert_8ic_generic(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, const float scalar, unsigned int num_points){ +static inline void volk_gnsssdr_32fc_s32f_convert_8ic_generic(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, const float scalar, unsigned int num_points) +{ float* inputVectorPtr = (float*)inputVector; int8_t* outputVectorPtr = (int8_t*)outputVector; float scaled = 0; float min_val = -128; float max_val = 127; - - for(unsigned int i = 0; i < num_points*2; i++){ - scaled = (inputVectorPtr[i])/scalar; - if(scaled > max_val) - scaled = max_val; - else if(scaled < min_val) - scaled = min_val; - outputVectorPtr[i] = (int8_t)rintf(scaled); - } + + for(unsigned int i = 0; i < num_points*2; i++) + { + scaled = (inputVectorPtr[i])/scalar; + if(scaled > max_val) + scaled = max_val; + else if(scaled < min_val) + scaled = min_val; + outputVectorPtr[i] = (int8_t)rintf(scaled); + } } #endif /* LV_HAVE_GENERIC */ #endif /* INCLUDED_volk_gnsssdr_32fc_s32f_convert_8ic_u_H */ @@ -146,61 +151,64 @@ static inline void volk_gnsssdr_32fc_s32f_convert_8ic_generic(lv_8sc_t* outputVe \param outputVector The 16 bit output data buffer \param num_points The number of data values to be converted */ -static inline void volk_gnsssdr_32fc_s32f_convert_8ic_a_sse2(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, const float scalar, unsigned int num_points){ +static inline void volk_gnsssdr_32fc_s32f_convert_8ic_a_sse2(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, const float scalar, unsigned int num_points) +{ const unsigned int sse_iters = num_points/8; - + float* inputVectorPtr = (float*)inputVector; int8_t* outputVectorPtr = (int8_t*)outputVector; __m128 invScalar = _mm_set_ps1(1.0/scalar); - + float min_val = -128; float max_val = 127; - + __m128 inputVal1, inputVal2, inputVal3, inputVal4; __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4; __m128i int8InputVal; __m128 ret1, ret2, ret3, ret4; __m128 vmin_val = _mm_set_ps1(min_val); __m128 vmax_val = _mm_set_ps1(max_val); - - for(unsigned int i = 0;i < sse_iters; i++){ - inputVal1 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; - inputVal2 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; - inputVal3 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; - inputVal4 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; - - inputVal1 = _mm_mul_ps(inputVal1, invScalar); - inputVal2 = _mm_mul_ps(inputVal2, invScalar); - inputVal3 = _mm_mul_ps(inputVal3, invScalar); - inputVal4 = _mm_mul_ps(inputVal4, invScalar); - // Clip - ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val); - ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), 
vmin_val); - ret3 = _mm_max_ps(_mm_min_ps(inputVal3, vmax_val), vmin_val); - ret4 = _mm_max_ps(_mm_min_ps(inputVal4, vmax_val), vmin_val); - - intInputVal1 = _mm_cvtps_epi32(ret1); - intInputVal2 = _mm_cvtps_epi32(ret2); - intInputVal3 = _mm_cvtps_epi32(ret3); - intInputVal4 = _mm_cvtps_epi32(ret4); - - intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); - intInputVal2 = _mm_packs_epi32(intInputVal3, intInputVal4); - int8InputVal = _mm_packs_epi16(intInputVal1, intInputVal2); - - _mm_store_si128((__m128i*)outputVectorPtr, int8InputVal); - outputVectorPtr += 16; - } - + + for(unsigned int i = 0;i < sse_iters; i++) + { + inputVal1 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; + inputVal2 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; + inputVal3 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; + inputVal4 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; + + inputVal1 = _mm_mul_ps(inputVal1, invScalar); + inputVal2 = _mm_mul_ps(inputVal2, invScalar); + inputVal3 = _mm_mul_ps(inputVal3, invScalar); + inputVal4 = _mm_mul_ps(inputVal4, invScalar); + // Clip + ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val); + ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val); + ret3 = _mm_max_ps(_mm_min_ps(inputVal3, vmax_val), vmin_val); + ret4 = _mm_max_ps(_mm_min_ps(inputVal4, vmax_val), vmin_val); + + intInputVal1 = _mm_cvtps_epi32(ret1); + intInputVal2 = _mm_cvtps_epi32(ret2); + intInputVal3 = _mm_cvtps_epi32(ret3); + intInputVal4 = _mm_cvtps_epi32(ret4); + + intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); + intInputVal2 = _mm_packs_epi32(intInputVal3, intInputVal4); + int8InputVal = _mm_packs_epi16(intInputVal1, intInputVal2); + + _mm_store_si128((__m128i*)outputVectorPtr, int8InputVal); + outputVectorPtr += 16; + } + float scaled = 0; - for(unsigned int i = 0; i < (num_points%4)*4; i++){ - scaled = inputVectorPtr[i]/scalar; - if(scaled > max_val) - scaled = max_val; - else if(scaled < min_val) - scaled = min_val; - outputVectorPtr[i] = (int8_t)rintf(scaled); - } + for(unsigned int i = 0; i < (num_points%4)*4; i++) + { + scaled = inputVectorPtr[i]/scalar; + if(scaled > max_val) + scaled = max_val; + else if(scaled < min_val) + scaled = min_val; + outputVectorPtr[i] = (int8_t)rintf(scaled); + } } #endif /* LV_HAVE_SSE2 */ @@ -211,21 +219,23 @@ static inline void volk_gnsssdr_32fc_s32f_convert_8ic_a_sse2(lv_8sc_t* outputVec \param outputVector The 16 bit output data buffer \param num_points The number of data values to be converted */ -static inline void volk_gnsssdr_32fc_s32f_convert_8ic_a_generic(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, const float scalar, unsigned int num_points){ +static inline void volk_gnsssdr_32fc_s32f_convert_8ic_a_generic(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, const float scalar, unsigned int num_points) +{ float* inputVectorPtr = (float*)inputVector; int8_t* outputVectorPtr = (int8_t*)outputVector; float scaled = 0; float min_val = -128; float max_val = 127; - - for(unsigned int i = 0; i < num_points*2; i++){ - scaled = inputVectorPtr[i]/scalar; - if(scaled > max_val) - scaled = max_val; - else if(scaled < min_val) - scaled = min_val; - outputVectorPtr[i] = (int8_t)rintf(scaled); - } + + for(unsigned int i = 0; i < num_points*2; i++) + { + scaled = inputVectorPtr[i]/scalar; + if(scaled > max_val) + scaled = max_val; + else if(scaled < min_val) + scaled = min_val; + outputVectorPtr[i] = (int8_t)rintf(scaled); + } } #endif /* LV_HAVE_GENERIC */ #endif /* 
INCLUDED_volk_gnsssdr_32fc_s32f_convert_8ic_a_H */ diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc.h index 7c06a23a5..4c296a5dc 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc.h @@ -2,7 +2,7 @@ * \file volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc * \brief Volk protokernel: replaces the tracking function for update_local_code * \authors * * Volk protokernel that replaces the tracking function for update_local_code @@ -19,7 +19,7 @@ * GNSS-SDR is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or - * at your option) any later version. + * (at your option) any later version. * * GNSS-SDR is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of @@ -43,107 +43,89 @@ #ifdef LV_HAVE_SSE4_1 #include - /*! +/*! \brief Takes the conjugate of a complex vector. \param cVector The vector where the results will be stored \param aVector Vector to be conjugated \param num_points The number of complex values in aVector to be conjugated and stored into cVector - */ -static inline void volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_u_sse4_1(lv_32fc_t* d_very_early_code, const float d_very_early_late_spc_chips, const float code_length_half_chips, const float code_phase_step_half_chips, const float tcode_half_chips_input, const lv_32fc_t* d_ca_code, unsigned int num_points){ - -// float* pointer1 = (float*)&d_very_early_late_spc_chips; -// *pointer1 = 1; -// float* pointer2 = (float*)&code_length_half_chips; -// *pointer2 = 6; -// float* pointer3 = (float*)&code_phase_step_half_chips; -// *pointer3 = 7; -// float* pointer4 = (float*)&tcode_half_chips_input; -// *pointer4 = 8; - + */ +static inline void volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_u_sse4_1(lv_32fc_t* d_very_early_code, const float d_very_early_late_spc_chips, const float code_length_half_chips, const float code_phase_step_half_chips, const float tcode_half_chips_input, const lv_32fc_t* d_ca_code, unsigned int num_points) +{ const unsigned int sse_iters = num_points / 4; - + __m128 tquot, fmod_num, fmod_result, associated_chip_index_array; - + __m128 tcode_half_chips_array = _mm_set_ps (tcode_half_chips_input+3*code_phase_step_half_chips, tcode_half_chips_input+2*code_phase_step_half_chips, tcode_half_chips_input+code_phase_step_half_chips, tcode_half_chips_input); __m128 code_phase_step_half_chips_array = _mm_set1_ps (code_phase_step_half_chips*4); __m128 d_very_early_late_spc_chips_Multiplied_by_2 = _mm_set1_ps (2*d_very_early_late_spc_chips); __m128 code_length_half_chips_array = _mm_set1_ps (code_length_half_chips); __m128 twos = _mm_set1_ps (2); __m128i associated_chip_index_array_int; - - __VOLK_ATTR_ALIGNED(16) int32_t output[4]; - - for (unsigned int i = 0; i < sse_iters; i++) - { - //fmod = numer - tquot * denom; tquot = numer/denom truncated - //associated_chip_index = 2 + round(fmod(tcode_half_chips - 2*d_very_early_late_spc_chips, code_length_half_chips)); - fmod_num = _mm_sub_ps 
(tcode_half_chips_array, d_very_early_late_spc_chips_Multiplied_by_2); - tquot = _mm_div_ps (fmod_num, code_length_half_chips_array); - tquot = _mm_round_ps (tquot, (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) ); - fmod_result = _mm_sub_ps (fmod_num, _mm_mul_ps (tquot, code_length_half_chips_array)); - - associated_chip_index_array = _mm_round_ps (fmod_result, (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC)); - associated_chip_index_array = _mm_add_ps(twos, associated_chip_index_array); - associated_chip_index_array_int = _mm_cvtps_epi32 (associated_chip_index_array); - _mm_storeu_si128 ((__m128i*)output, associated_chip_index_array_int); - - //d_very_early_code[i] = d_ca_code[associated_chip_index]; - *d_very_early_code++ = d_ca_code[output[0]]; - *d_very_early_code++ = d_ca_code[output[1]]; - *d_very_early_code++ = d_ca_code[output[2]]; - *d_very_early_code++ = d_ca_code[output[3]]; - //tcode_half_chips = tcode_half_chips + code_phase_step_half_chips; - tcode_half_chips_array = _mm_add_ps (tcode_half_chips_array, code_phase_step_half_chips_array); - } - - if (num_points%4!=0) - { - __VOLK_ATTR_ALIGNED(16) float tcode_half_chips_stored[4]; - _mm_storeu_ps ((float*)tcode_half_chips_stored, tcode_half_chips_array); - - int associated_chip_index; - float tcode_half_chips = tcode_half_chips_stored[0]; - float d_very_early_late_spc_chips_multiplied_by_2 = 2*d_very_early_late_spc_chips; - - for (unsigned int i = 0; i < num_points%4; i++) + __VOLK_ATTR_ALIGNED(16) int32_t output[4]; + + for (unsigned int i = 0; i < sse_iters; i++) + { + //fmod = numer - tquot * denom; tquot = numer/denom truncated + //associated_chip_index = 2 + round(fmod(tcode_half_chips - 2*d_very_early_late_spc_chips, code_length_half_chips)); + fmod_num = _mm_sub_ps (tcode_half_chips_array, d_very_early_late_spc_chips_Multiplied_by_2); + tquot = _mm_div_ps (fmod_num, code_length_half_chips_array); + tquot = _mm_round_ps (tquot, (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) ); + fmod_result = _mm_sub_ps (fmod_num, _mm_mul_ps (tquot, code_length_half_chips_array)); + + associated_chip_index_array = _mm_round_ps (fmod_result, (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC)); + associated_chip_index_array = _mm_add_ps(twos, associated_chip_index_array); + associated_chip_index_array_int = _mm_cvtps_epi32 (associated_chip_index_array); + _mm_storeu_si128 ((__m128i*)output, associated_chip_index_array_int); + + //d_very_early_code[i] = d_ca_code[associated_chip_index]; + *d_very_early_code++ = d_ca_code[output[0]]; + *d_very_early_code++ = d_ca_code[output[1]]; + *d_very_early_code++ = d_ca_code[output[2]]; + *d_very_early_code++ = d_ca_code[output[3]]; + + //tcode_half_chips = tcode_half_chips + code_phase_step_half_chips; + tcode_half_chips_array = _mm_add_ps (tcode_half_chips_array, code_phase_step_half_chips_array); + } + + if (num_points % 4 != 0) + { + __VOLK_ATTR_ALIGNED(16) float tcode_half_chips_stored[4]; + _mm_storeu_ps ((float*)tcode_half_chips_stored, tcode_half_chips_array); + + int associated_chip_index; + float tcode_half_chips = tcode_half_chips_stored[0]; + float d_very_early_late_spc_chips_multiplied_by_2 = 2*d_very_early_late_spc_chips; + + for (unsigned int i = 0; i < num_points%4; i++) + { + associated_chip_index = 2 + round(fmod(tcode_half_chips - d_very_early_late_spc_chips_multiplied_by_2, code_length_half_chips)); + d_very_early_code[i] = d_ca_code[associated_chip_index]; + tcode_half_chips = tcode_half_chips + code_phase_step_half_chips; + } + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#ifdef LV_HAVE_GENERIC +/*! 
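
The round-towards-zero/multiply/subtract sequence above is a vectorised fmod. As a reference for the generic kernel that follows, the identity it exploits, written out per sample in scalar C (chip_index is an illustrative helper name), is:

#include <math.h>

/* Scalar form of the vectorised index computation: the identity
   fmodf(a, b) == a - truncf(a / b) * b (for b > 0) is what the
   _mm_round_ps(..., _MM_FROUND_TO_ZERO) / mul / sub sequence
   evaluates four samples at a time. */
static inline int chip_index(float tcode_half_chips,
                             float early_late_spc_chips_x2,
                             float code_length_half_chips)
{
    float num = tcode_half_chips - early_late_spc_chips_x2;
    float tquot = truncf(num / code_length_half_chips); /* truncated quotient */
    return 2 + (int)roundf(num - tquot * code_length_half_chips);
}
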
+ \brief Takes the conjugate of a complex vector. + \param cVector The vector where the results will be stored + \param aVector Vector to be conjugated + \param num_points The number of complex values in aVector to be conjugated and stored into cVector + */ +static inline void volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_generic(lv_32fc_t* d_very_early_code, const float d_very_early_late_spc_chips, const float code_length_half_chips, const float code_phase_step_half_chips, const float tcode_half_chips_input, const lv_32fc_t* d_ca_code, unsigned int num_points) +{ + int associated_chip_index; + float tcode_half_chips = tcode_half_chips_input; + float d_very_early_late_spc_chips_multiplied_by_2 = 2*d_very_early_late_spc_chips; + + for (unsigned int i = 0; i < num_points; i++) { associated_chip_index = 2 + round(fmod(tcode_half_chips - d_very_early_late_spc_chips_multiplied_by_2, code_length_half_chips)); d_very_early_code[i] = d_ca_code[associated_chip_index]; tcode_half_chips = tcode_half_chips + code_phase_step_half_chips; } - } -} -#endif /* LV_HAVE_SSE4_1 */ - -#ifdef LV_HAVE_GENERIC - /*! - \brief Takes the conjugate of a complex vector. - \param cVector The vector where the results will be stored - \param aVector Vector to be conjugated - \param num_points The number of complex values in aVector to be conjugated and stored into cVector - */ -static inline void volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_generic(lv_32fc_t* d_very_early_code, const float d_very_early_late_spc_chips, const float code_length_half_chips, const float code_phase_step_half_chips, const float tcode_half_chips_input, const lv_32fc_t* d_ca_code, unsigned int num_points){ - -// float* pointer1 = (float*)&d_very_early_late_spc_chips; -// *pointer1 = 1; -// float* pointer2 = (float*)&code_length_half_chips; -// *pointer2 = 6; -// float* pointer3 = (float*)&code_phase_step_half_chips; -// *pointer3 = 7; -// float* pointer4 = (float*)&tcode_half_chips_input; -// *pointer4 = 8; - - int associated_chip_index; - float tcode_half_chips = tcode_half_chips_input; - float d_very_early_late_spc_chips_multiplied_by_2 = 2*d_very_early_late_spc_chips; - - for (unsigned int i = 0; i < num_points; i++) - { - associated_chip_index = 2 + round(fmod(tcode_half_chips - d_very_early_late_spc_chips_multiplied_by_2, code_length_half_chips)); - d_very_early_code[i] = d_ca_code[associated_chip_index]; - tcode_half_chips = tcode_half_chips + code_phase_step_half_chips; - } } #endif /* LV_HAVE_GENERIC */ @@ -159,108 +141,90 @@ static inline void volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_generic(lv_3 #ifdef LV_HAVE_SSE4_1 #include - /*! +/*! \brief Takes the conjugate of a complex vector. 
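
SSE4.1 has no gather instruction, so these kernels spill the four SIMD-computed indices to memory and fetch the replica samples one at a time. A minimal sketch of that spill-and-gather pattern (gather4 is an illustrative name, shown for a plain float table rather than lv_32fc_t) is:

#include <emmintrin.h>
#include <stdint.h>

/* Spill four SIMD-computed indices, then gather from a lookup table
   scalar-by-scalar, as the SSE4.1 kernels do with d_ca_code. */
static void gather4(float* dst, const float* table, __m128i indices)
{
    int32_t idx[4];
    _mm_storeu_si128((__m128i*)idx, indices); /* spill lane indices */
    dst[0] = table[idx[0]];
    dst[1] = table[idx[1]];
    dst[2] = table[idx[2]];
    dst[3] = table[idx[3]];
}
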
\param cVector The vector where the results will be stored \param aVector Vector to be conjugated \param num_points The number of complex values in aVector to be conjugated and stored into cVector - */ -static inline void volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_a_sse4_1(lv_32fc_t* d_very_early_code, const float d_very_early_late_spc_chips, const float code_length_half_chips, const float code_phase_step_half_chips, const float tcode_half_chips_input, const lv_32fc_t* d_ca_code, unsigned int num_points){ - - // float* pointer1 = (float*)&d_very_early_late_spc_chips; - // *pointer1 = 1; - // float* pointer2 = (float*)&code_length_half_chips; - // *pointer2 = 6; - // float* pointer3 = (float*)&code_phase_step_half_chips; - // *pointer3 = 7; - // float* pointer4 = (float*)&tcode_half_chips_input; - // *pointer4 = 8; - + */ +static inline void volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_a_sse4_1(lv_32fc_t* d_very_early_code, const float d_very_early_late_spc_chips, const float code_length_half_chips, const float code_phase_step_half_chips, const float tcode_half_chips_input, const lv_32fc_t* d_ca_code, unsigned int num_points) +{ const unsigned int sse_iters = num_points / 4; - + __m128 tquot, fmod_num, fmod_result, associated_chip_index_array; - + __m128 tcode_half_chips_array = _mm_set_ps (tcode_half_chips_input+3*code_phase_step_half_chips, tcode_half_chips_input+2*code_phase_step_half_chips, tcode_half_chips_input+code_phase_step_half_chips, tcode_half_chips_input); __m128 code_phase_step_half_chips_array = _mm_set1_ps (code_phase_step_half_chips*4); __m128 d_very_early_late_spc_chips_Multiplied_by_2 = _mm_set1_ps (2*d_very_early_late_spc_chips); __m128 code_length_half_chips_array = _mm_set1_ps (code_length_half_chips); __m128 twos = _mm_set1_ps (2); __m128i associated_chip_index_array_int; - + __VOLK_ATTR_ALIGNED(16) int32_t output[4]; - + for (unsigned int i = 0; i < sse_iters; i++) - { - //fmod = numer - tquot * denom; tquot = numer/denom truncated - //associated_chip_index = 2 + round(fmod(tcode_half_chips - 2*d_very_early_late_spc_chips, code_length_half_chips)); - fmod_num = _mm_sub_ps (tcode_half_chips_array, d_very_early_late_spc_chips_Multiplied_by_2); - tquot = _mm_div_ps (fmod_num, code_length_half_chips_array); - tquot = _mm_round_ps (tquot, (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) ); - fmod_result = _mm_sub_ps (fmod_num, _mm_mul_ps (tquot, code_length_half_chips_array)); - - associated_chip_index_array = _mm_round_ps (fmod_result, (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC)); - associated_chip_index_array = _mm_add_ps(twos, associated_chip_index_array); - associated_chip_index_array_int = _mm_cvtps_epi32 (associated_chip_index_array); - _mm_store_si128 ((__m128i*)output, associated_chip_index_array_int); - - //d_very_early_code[i] = d_ca_code[associated_chip_index]; - *d_very_early_code++ = d_ca_code[output[0]]; - *d_very_early_code++ = d_ca_code[output[1]]; - *d_very_early_code++ = d_ca_code[output[2]]; - *d_very_early_code++ = d_ca_code[output[3]]; - - //tcode_half_chips = tcode_half_chips + code_phase_step_half_chips; - tcode_half_chips_array = _mm_add_ps (tcode_half_chips_array, code_phase_step_half_chips_array); - } - - if (num_points%4!=0) - { - __VOLK_ATTR_ALIGNED(16) float tcode_half_chips_stored[4]; - _mm_storeu_ps ((float*)tcode_half_chips_stored, tcode_half_chips_array); - - int associated_chip_index; - float tcode_half_chips = tcode_half_chips_stored[0]; - float d_very_early_late_spc_chips_multiplied_by_2 = 2*d_very_early_late_spc_chips; - - for 
(unsigned int i = 0; i < num_points%4; i++) { - associated_chip_index = 2 + round(fmod(tcode_half_chips - d_very_early_late_spc_chips_multiplied_by_2, code_length_half_chips)); - d_very_early_code[i] = d_ca_code[associated_chip_index]; - tcode_half_chips = tcode_half_chips + code_phase_step_half_chips; + //fmod = numer - tquot * denom; tquot = numer/denom truncated + //associated_chip_index = 2 + round(fmod(tcode_half_chips - 2*d_very_early_late_spc_chips, code_length_half_chips)); + fmod_num = _mm_sub_ps (tcode_half_chips_array, d_very_early_late_spc_chips_Multiplied_by_2); + tquot = _mm_div_ps (fmod_num, code_length_half_chips_array); + tquot = _mm_round_ps (tquot, (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) ); + fmod_result = _mm_sub_ps (fmod_num, _mm_mul_ps (tquot, code_length_half_chips_array)); + + associated_chip_index_array = _mm_round_ps (fmod_result, (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC)); + associated_chip_index_array = _mm_add_ps(twos, associated_chip_index_array); + associated_chip_index_array_int = _mm_cvtps_epi32 (associated_chip_index_array); + _mm_store_si128 ((__m128i*)output, associated_chip_index_array_int); + + //d_very_early_code[i] = d_ca_code[associated_chip_index]; + *d_very_early_code++ = d_ca_code[output[0]]; + *d_very_early_code++ = d_ca_code[output[1]]; + *d_very_early_code++ = d_ca_code[output[2]]; + *d_very_early_code++ = d_ca_code[output[3]]; + + //tcode_half_chips = tcode_half_chips + code_phase_step_half_chips; + tcode_half_chips_array = _mm_add_ps (tcode_half_chips_array, code_phase_step_half_chips_array); + } + + if (num_points % 4 != 0) + { + __VOLK_ATTR_ALIGNED(16) float tcode_half_chips_stored[4]; + _mm_storeu_ps ((float*)tcode_half_chips_stored, tcode_half_chips_array); + + int associated_chip_index; + float tcode_half_chips = tcode_half_chips_stored[0]; + float d_very_early_late_spc_chips_multiplied_by_2 = 2*d_very_early_late_spc_chips; + + for (unsigned int i = 0; i < num_points%4; i++) + { + associated_chip_index = 2 + round(fmod(tcode_half_chips - d_very_early_late_spc_chips_multiplied_by_2, code_length_half_chips)); + d_very_early_code[i] = d_ca_code[associated_chip_index]; + tcode_half_chips = tcode_half_chips + code_phase_step_half_chips; + } } - } } #endif /* LV_HAVE_SSE4_1 */ #ifdef LV_HAVE_GENERIC - /*! +/*! \brief Takes the conjugate of a complex vector. 
\param cVector The vector where the results will be stored \param aVector Vector to be conjugated \param num_points The number of complex values in aVector to be conjugated and stored into cVector - */ -static inline void volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_a_generic(lv_32fc_t* d_very_early_code, const float d_very_early_late_spc_chips, const float code_length_half_chips, const float code_phase_step_half_chips, const float tcode_half_chips_input, const lv_32fc_t* d_ca_code, unsigned int num_points){ - - // float* pointer1 = (float*)&d_very_early_late_spc_chips; - // *pointer1 = 1; - // float* pointer2 = (float*)&code_length_half_chips; - // *pointer2 = 6; - // float* pointer3 = (float*)&code_phase_step_half_chips; - // *pointer3 = 7; - // float* pointer4 = (float*)&tcode_half_chips_input; - // *pointer4 = 8; - + */ +static inline void volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_a_generic(lv_32fc_t* d_very_early_code, const float d_very_early_late_spc_chips, const float code_length_half_chips, const float code_phase_step_half_chips, const float tcode_half_chips_input, const lv_32fc_t* d_ca_code, unsigned int num_points) +{ int associated_chip_index; float tcode_half_chips = tcode_half_chips_input; float d_very_early_late_spc_chips_multiplied_by_2 = 2*d_very_early_late_spc_chips; - + for (unsigned int i = 0; i < num_points; i++) - { - associated_chip_index = 2 + round(fmod(tcode_half_chips - d_very_early_late_spc_chips_multiplied_by_2, code_length_half_chips)); - d_very_early_code[i] = d_ca_code[associated_chip_index]; - tcode_half_chips = tcode_half_chips + code_phase_step_half_chips; - } + { + associated_chip_index = 2 + round(fmod(tcode_half_chips - d_very_early_late_spc_chips_multiplied_by_2, code_length_half_chips)); + d_very_early_code[i] = d_ca_code[associated_chip_index]; + tcode_half_chips = tcode_half_chips + code_phase_step_half_chips; + } } #endif /* LV_HAVE_GENERIC */ diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3.h index bc1c1333a..f5f5290e5 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3.h @@ -2,8 +2,8 @@ * \file volk_gnsssdr_32fc_x5_cw_vepl_corr_32fc_x5 * \brief Volk protokernel: performs the carrier wipe-off mixing and the Early, Prompt and Late correlation with 64 bits vectors * \authors
- *          <li> Javier Arribas, 2011. jarribas(at)cttc.es
- *          <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+ *          <li> Javier Arribas, 2011. jarribas(at)cttc.es
+ *          <li> Andres Cecilia, 2014. a.cecilia.luque(at)gmail.com
 *
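
Every SSE3 loop in this kernel is built from one complex-multiply idiom: duplicate the real and imaginary parts with _mm_moveldup_ps/_mm_movehdup_ps, swap lanes with a 0xB1 shuffle, then combine with _mm_addsub_ps. Per complex pair it computes the ordinary product, as in this scalar sketch (cplx and cplx_mul are illustrative names):

/* Scalar equivalent of the moveldup/movehdup/shuffle/addsub sequence:
   one complex product (a + jb) * (c + jd). */
typedef struct { float re; float im; } cplx;

static inline cplx cplx_mul(cplx x, cplx y)
{
    cplx z;
    z.re = x.re * y.re - x.im * y.im; /* addsub subtracts in even lanes */
    z.im = x.im * y.re + x.re * y.im; /* and adds in odd lanes          */
    return z;
}
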
* * Volk protokernel that performs the carrier wipe-off mixing and the @@ -31,7 +31,7 @@ * GNSS-SDR is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or - * at your option) any later version. + * (at your option) any later version. * * GNSS-SDR is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of @@ -60,7 +60,7 @@ #include - /*! +/*! \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation \param input The input signal input \param carrier The carrier signal input @@ -71,10 +71,10 @@ \param P_out Early correlation output \param L_out Early correlation output \param num_points The number of complex values in vectors - */ + */ static inline void volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_u_sse3(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, unsigned int num_points) { - unsigned int number = 0; + unsigned int number = 0; const unsigned int halfPoints = num_points / 2; lv_32fc_t dotProduct_E; @@ -100,90 +100,90 @@ static inline void volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_u_sse3(lv_32fc_t* E_ const lv_32fc_t* _L_code = L_code; for(;number < halfPoints; number++) - { - // carrier wipe-off (vector point-to-point product) - x = _mm_loadu_ps((float*)_input); // Load the ar + ai, br + bi as ar,ai,br,bi - y = _mm_loadu_ps((float*)_carrier); // Load the cr + ci, dr + di as cr,ci,dr,di + { + // carrier wipe-off (vector point-to-point product) + x = _mm_loadu_ps((float*)_input); // Load the ar + ai, br + bi as ar,ai,br,bi + y = _mm_loadu_ps((float*)_carrier); // Load the cr + ci, dr + di as cr,ci,dr,di - yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - //_mm_storeu_ps((float*)_input_BB,z); // Store the results back into the _input_BB container + //_mm_storeu_ps((float*)_input_BB,z); // Store the results back into the _input_BB container - // correlation E,P,L (3x vector scalar product) - // Early - //x = _mm_load_ps((float*)_input_BB); // Load the ar + ai, br + bi as ar,ai,br,bi - x = z; + // correlation E,P,L (3x vector scalar product) + // Early + //x = _mm_load_ps((float*)_input_BB); // Load the ar + ai, br + bi as ar,ai,br,bi + x = z; - y = _mm_load_ps((float*)_E_code); // Load the cr + ci, dr + di as cr,ci,dr,di + y = _mm_load_ps((float*)_E_code); // Load the cr + ci, dr + di as cr,ci,dr,di - yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(x,yl); 
// tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - z_E = _mm_add_ps(z_E, z); // Add the complex multiplication results together + z_E = _mm_add_ps(z_E, z); // Add the complex multiplication results together - // Prompt - //x = _mm_load_ps((float*)_input_BB); // Load the ar + ai, br + bi as ar,ai,br,bi - y = _mm_load_ps((float*)_P_code); // Load the cr + ci, dr + di as cr,ci,dr,di + // Prompt + //x = _mm_load_ps((float*)_input_BB); // Load the ar + ai, br + bi as ar,ai,br,bi + y = _mm_load_ps((float*)_P_code); // Load the cr + ci, dr + di as cr,ci,dr,di - yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di - x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br - tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - z_P = _mm_add_ps(z_P, z); // Add the complex multiplication results together + z_P = _mm_add_ps(z_P, z); // Add the complex multiplication results together - // Late - //x = _mm_load_ps((float*)_input_BB); // Load the ar + ai, br + bi as ar,ai,br,bi - y = _mm_load_ps((float*)_L_code); // Load the cr + ci, dr + di as cr,ci,dr,di + // Late + //x = _mm_load_ps((float*)_input_BB); // Load the ar + ai, br + bi as ar,ai,br,bi + y = _mm_load_ps((float*)_L_code); // Load the cr + ci, dr + di as cr,ci,dr,di - yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di - x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br - tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - z_L = _mm_add_ps(z_L, z); // Add the complex multiplication results together + z_L = _mm_add_ps(z_L, z); // Add the complex multiplication results together - /*pointer increment*/ - _carrier += 2; - _input += 2; - 
//_input_BB += 2; - _E_code += 2; - _P_code += 2; - _L_code +=2; - } + /*pointer increment*/ + _carrier += 2; + _input += 2; + //_input_BB += 2; + _E_code += 2; + _P_code += 2; + _L_code +=2; + } __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_E[2]; __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_P[2]; @@ -199,12 +199,12 @@ static inline void volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_u_sse3(lv_32fc_t* E_ dotProduct_L += ( dotProductVector_L[0] + dotProductVector_L[1] ); if((num_points % 2) != 0) - { - //_input_BB = (*_input) * (*_carrier); - dotProduct_E += (*_input) * (*_E_code)*(*_carrier); - dotProduct_P += (*_input) * (*_P_code)*(*_carrier); - dotProduct_L += (*_input) * (*_L_code)*(*_carrier); - } + { + //_input_BB = (*_input) * (*_carrier); + dotProduct_E += (*_input) * (*_E_code)*(*_carrier); + dotProduct_P += (*_input) * (*_P_code)*(*_carrier); + dotProduct_L += (*_input) * (*_L_code)*(*_carrier); + } *E_out = dotProduct_E; *P_out = dotProduct_P; @@ -229,22 +229,22 @@ static inline void volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_u_sse3(lv_32fc_t* E_ static inline void volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_generic(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, unsigned int num_points) { lv_32fc_t bb_signal_sample; - + bb_signal_sample = lv_cmake(0, 0); - + *E_out = 0; *P_out = 0; *L_out = 0; // perform Early, Prompt and Late correlation for(unsigned int i=0; i < num_points; ++i) - { - //Perform the carrier wipe-off - bb_signal_sample = input[i] * carrier[i]; - // Now get early, late, and prompt values for each - *E_out += bb_signal_sample * E_code[i]; - *P_out += bb_signal_sample * P_code[i]; - *L_out += bb_signal_sample * L_code[i]; - } + { + //Perform the carrier wipe-off + bb_signal_sample = input[i] * carrier[i]; + // Now get early, late, and prompt values for each + *E_out += bb_signal_sample * E_code[i]; + *P_out += bb_signal_sample * P_code[i]; + *L_out += bb_signal_sample * L_code[i]; + } } #endif /* LV_HAVE_GENERIC */ @@ -277,23 +277,23 @@ static inline void volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_generic(lv_32fc_t* E */ static inline void volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_a_sse3(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, unsigned int num_points) { - unsigned int number = 0; + unsigned int number = 0; const unsigned int halfPoints = num_points / 2; - + lv_32fc_t dotProduct_E; memset(&dotProduct_E, 0x0, 2*sizeof(float)); lv_32fc_t dotProduct_P; memset(&dotProduct_P, 0x0, 2*sizeof(float)); lv_32fc_t dotProduct_L; memset(&dotProduct_L, 0x0, 2*sizeof(float)); - + // Aux vars __m128 x, y, yl, yh, z, tmp1, tmp2, z_E, z_P, z_L; - + z_E = _mm_setzero_ps(); z_P = _mm_setzero_ps(); z_L = _mm_setzero_ps(); - + //input and output vectors //lv_32fc_t* _input_BB = input_BB; const lv_32fc_t* _input = input; @@ -301,114 +301,114 @@ static inline void volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_a_sse3(lv_32fc_t* E_ const lv_32fc_t* _E_code = E_code; const lv_32fc_t* _P_code = P_code; const lv_32fc_t* _L_code = L_code; - + for(;number < halfPoints; number++) - { - // carrier wipe-off (vector point-to-point product) - x = _mm_load_ps((float*)_input); // Load the ar + ai, br + bi as ar,ai,br,bi - y = _mm_load_ps((float*)_carrier); // Load the cr + ci, dr + di as cr,ci,dr,di - - yl = _mm_moveldup_ps(y); // Load yl with 
cr,cr,dr,dr - yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di - - tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - - x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br - - tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - - z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - - //_mm_storeu_ps((float*)_input_BB,z); // Store the results back into the _input_BB container - - // correlation E,P,L (3x vector scalar product) - // Early - //x = _mm_load_ps((float*)_input_BB); // Load the ar + ai, br + bi as ar,ai,br,bi - x = z; - - y = _mm_load_ps((float*)_E_code); // Load the cr + ci, dr + di as cr,ci,dr,di - - yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di - - tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - - x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br - - tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - - z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - - z_E = _mm_add_ps(z_E, z); // Add the complex multiplication results together - - // Prompt - //x = _mm_load_ps((float*)_input_BB); // Load the ar + ai, br + bi as ar,ai,br,bi - y = _mm_load_ps((float*)_P_code); // Load the cr + ci, dr + di as cr,ci,dr,di - - yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di - - x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br - - tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - - x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br - - tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - - z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - - z_P = _mm_add_ps(z_P, z); // Add the complex multiplication results together - - // Late - //x = _mm_load_ps((float*)_input_BB); // Load the ar + ai, br + bi as ar,ai,br,bi - y = _mm_load_ps((float*)_L_code); // Load the cr + ci, dr + di as cr,ci,dr,di - - yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di - - x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br - - tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - - x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br - - tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - - z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - - z_L = _mm_add_ps(z_L, z); // Add the complex multiplication results together - - /*pointer increment*/ - _carrier += 2; - _input += 2; - //_input_BB += 2; - _E_code += 2; - _P_code += 2; - _L_code +=2; - } - + { + // carrier wipe-off (vector point-to-point product) + x = _mm_load_ps((float*)_input); // Load the ar + ai, br + bi as ar,ai,br,bi + y = _mm_load_ps((float*)_carrier); // Load the cr + ci, dr + di as cr,ci,dr,di + + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + + tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + //_mm_storeu_ps((float*)_input_BB,z); // Store the results back into the _input_BB container + + // correlation E,P,L (3x vector scalar product) + // Early + //x = _mm_load_ps((float*)_input_BB); // Load the 
ar + ai, br + bi as ar,ai,br,bi + x = z; + + y = _mm_load_ps((float*)_E_code); // Load the cr + ci, dr + di as cr,ci,dr,di + + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + + tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + z_E = _mm_add_ps(z_E, z); // Add the complex multiplication results together + + // Prompt + //x = _mm_load_ps((float*)_input_BB); // Load the ar + ai, br + bi as ar,ai,br,bi + y = _mm_load_ps((float*)_P_code); // Load the cr + ci, dr + di as cr,ci,dr,di + + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di + + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + + tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + + tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + z_P = _mm_add_ps(z_P, z); // Add the complex multiplication results together + + // Late + //x = _mm_load_ps((float*)_input_BB); // Load the ar + ai, br + bi as ar,ai,br,bi + y = _mm_load_ps((float*)_L_code); // Load the cr + ci, dr + di as cr,ci,dr,di + + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di + + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + + tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + + tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + z_L = _mm_add_ps(z_L, z); // Add the complex multiplication results together + + /*pointer increment*/ + _carrier += 2; + _input += 2; + //_input_BB += 2; + _E_code += 2; + _P_code += 2; + _L_code +=2; + } + __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_E[2]; __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_P[2]; __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_L[2]; //__VOLK_ATTR_ALIGNED(16) lv_32fc_t _input_BB; - + _mm_store_ps((float*)dotProductVector_E,z_E); // Store the results back into the dot product vector _mm_store_ps((float*)dotProductVector_P,z_P); // Store the results back into the dot product vector _mm_store_ps((float*)dotProductVector_L,z_L); // Store the results back into the dot product vector - + dotProduct_E += ( dotProductVector_E[0] + dotProductVector_E[1] ); dotProduct_P += ( dotProductVector_P[0] + dotProductVector_P[1] ); dotProduct_L += ( dotProductVector_L[0] + dotProductVector_L[1] ); - + if((num_points % 2) != 0) - { - //_input_BB = (*_input) * (*_carrier); - dotProduct_E += (*_input) * (*_E_code)*(*_carrier); - dotProduct_P += (*_input) * (*_P_code)*(*_carrier); - dotProduct_L += (*_input) * (*_L_code)*(*_carrier); - } - + { + //_input_BB = (*_input) * (*_carrier); + dotProduct_E += (*_input) * (*_E_code)*(*_carrier); + dotProduct_P += (*_input) * (*_P_code)*(*_carrier); + dotProduct_L += (*_input) * (*_L_code)*(*_carrier); + } + *E_out = dotProduct_E; *P_out = dotProduct_P; *L_out = dotProduct_L; @@ -432,22 +432,22 @@ static inline void volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_a_sse3(lv_32fc_t* E_ static inline void 
volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_a_generic(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, unsigned int num_points) { lv_32fc_t bb_signal_sample; - + bb_signal_sample = lv_cmake(0, 0); - + *E_out = 0; *P_out = 0; *L_out = 0; // perform Early, Prompt and Late correlation for(unsigned int i=0; i < num_points; ++i) - { - //Perform the carrier wipe-off - bb_signal_sample = input[i] * carrier[i]; - // Now get early, late, and prompt values for each - *E_out += bb_signal_sample * E_code[i]; - *P_out += bb_signal_sample * P_code[i]; - *L_out += bb_signal_sample * L_code[i]; - } + { + //Perform the carrier wipe-off + bb_signal_sample = input[i] * carrier[i]; + // Now get early, late, and prompt values for each + *E_out += bb_signal_sample * E_code[i]; + *P_out += bb_signal_sample * P_code[i]; + *L_out += bb_signal_sample * L_code[i]; + } } #endif /* LV_HAVE_GENERIC */ diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5.h index 75540a897..5f4ab6710 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5.h @@ -2,8 +2,8 @@ * \file volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5 * \brief Volk protokernel: performs the carrier wipe-off mixing and the VE, Early, Prompt, Late and VL correlation with 64 bits vectors * \authors
- *          <li> Javier Arribas, 2011. jarribas(at)cttc.es
- *          <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+ *          <li> Javier Arribas, 2011. jarribas(at)cttc.es
+ *          <li> Andres Cecilia, 2014. a.cecilia.luque(at)gmail.com
 *
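
In scalar form, the five-correlator loop that the AVX and SSE3 versions of this kernel vectorise reduces to one carrier wipe-off followed by five accumulations per sample. A compact reference sketch (vepl_corr_scalar is an illustrative name; lv_32fc_t has the layout of a C99 float complex) is:

#include <complex.h>

/* Scalar reference for the VE/E/P/L/VL correlator: wipe off the
   carrier once per sample, then accumulate against the five replicas. */
static void vepl_corr_scalar(float complex* VE, float complex* E,
                             float complex* P, float complex* L,
                             float complex* VL,
                             const float complex* in,
                             const float complex* carrier,
                             const float complex* ve_c, const float complex* e_c,
                             const float complex* p_c, const float complex* l_c,
                             const float complex* vl_c, unsigned int n)
{
    *VE = *E = *P = *L = *VL = 0;
    for (unsigned int i = 0; i < n; i++)
        {
            float complex bb = in[i] * carrier[i]; /* carrier wipe-off */
            *VE += bb * ve_c[i];
            *E  += bb * e_c[i];
            *P  += bb * p_c[i];
            *L  += bb * l_c[i];
            *VL += bb * vl_c[i];
        }
}
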
* * Volk protokernel that performs the carrier wipe-off mixing and the @@ -35,7 +35,7 @@ * GNSS-SDR is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or - * at your option) any later version. + * (at your option) any later version. * * GNSS-SDR is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of @@ -77,25 +77,25 @@ */ static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_u_avx(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* VE_code, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, const lv_32fc_t* VL_code, unsigned int num_points) { - unsigned int number = 0; + unsigned int number = 0; const unsigned int halfPoints = num_points / 4; - + lv_32fc_t dotProduct_VE; lv_32fc_t dotProduct_E; lv_32fc_t dotProduct_P; lv_32fc_t dotProduct_L; lv_32fc_t dotProduct_VL; - + // Aux vars __m256 x, y, yl, yh, z, tmp1, tmp2, z_VE, z_E, z_P, z_L, z_VL; __m256 bb_signal_sample, bb_signal_sample_shuffled; - + z_VE = _mm256_setzero_ps(); z_E = _mm256_setzero_ps(); z_P = _mm256_setzero_ps(); z_L = _mm256_setzero_ps(); z_VL = _mm256_setzero_ps(); - + //input and output vectors const lv_32fc_t* _input = input; const lv_32fc_t* _carrier = carrier; @@ -104,123 +104,123 @@ static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_u_avx(lv_32fc_t* VE const lv_32fc_t* _P_code = P_code; const lv_32fc_t* _L_code = L_code; const lv_32fc_t* _VL_code = VL_code; - + for(;number < halfPoints; number++) - { - // carrier wipe-off (vector point-to-point product) - x = _mm256_loadu_ps((float*)_input); // Load the ar + ai, br + bi as ar,ai,br,bi - y = _mm256_loadu_ps((float*)_carrier); // Load the cr + ci, dr + di as cr,ci,dr,di - - yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr - yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di - - tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - - x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br - - tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - - bb_signal_sample = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - bb_signal_sample_shuffled = _mm256_shuffle_ps(bb_signal_sample,bb_signal_sample,0xB1); // Re-arrange bb_signal_sample to be ai,ar,bi,br - - // correlation VE,E,P,L,VL (5x vector scalar product) - // VE - y = _mm256_loadu_ps((float*)_VE_code); // Load the cr + ci, dr + di as cr,ci,dr,di - - yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr - yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di - - tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - - z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - z_VE = _mm256_add_ps(z_VE, z); // Add the complex multiplication results together - - // Early - y = _mm256_loadu_ps((float*)_E_code); // Load the cr + ci, dr + di as cr,ci,dr,di - - yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr - yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di - - tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - - z = 
_mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - z_E = _mm256_add_ps(z_E, z); // Add the complex multiplication results together - - // Prompt - y = _mm256_loadu_ps((float*)_P_code); // Load the cr + ci, dr + di as cr,ci,dr,di - - yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr - yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di - - tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - - z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - z_P = _mm256_add_ps(z_P, z); // Add the complex multiplication results together - - // Late - y = _mm256_loadu_ps((float*)_L_code); // Load the cr + ci, dr + di as cr,ci,dr,di - - yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr - yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di - - tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - - z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - z_L = _mm256_add_ps(z_L, z); // Add the complex multiplication results together - - // VL - y = _mm256_loadu_ps((float*)_VL_code); // Load the cr + ci, dr + di as cr,ci,dr,di - - yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr - yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di - - tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - - z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - z_VL = _mm256_add_ps(z_VL, z); // Add the complex multiplication results together - - /*pointer increment*/ - _carrier += 4; - _input += 4; - _VE_code += 4; - _E_code += 4; - _P_code += 4; - _L_code += 4; - _VL_code += 4; - } - + { + // carrier wipe-off (vector point-to-point product) + x = _mm256_loadu_ps((float*)_input); // Load the ar + ai, br + bi as ar,ai,br,bi + y = _mm256_loadu_ps((float*)_carrier); // Load the cr + ci, dr + di as cr,ci,dr,di + + yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + + x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + + tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + bb_signal_sample = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + bb_signal_sample_shuffled = _mm256_shuffle_ps(bb_signal_sample,bb_signal_sample,0xB1); // Re-arrange bb_signal_sample to be ai,ar,bi,br + + // correlation VE,E,P,L,VL (5x vector scalar product) + // VE + y = _mm256_loadu_ps((float*)_VE_code); // Load the cr + ci, dr + di as cr,ci,dr,di + + yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + z_VE = _mm256_add_ps(z_VE, z); // Add the complex multiplication results together + + // Early + y = _mm256_loadu_ps((float*)_E_code); // Load the cr + ci, dr + di as cr,ci,dr,di + + yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = 
_mm256_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + z_E = _mm256_add_ps(z_E, z); // Add the complex multiplication results together + + // Prompt + y = _mm256_loadu_ps((float*)_P_code); // Load the cr + ci, dr + di as cr,ci,dr,di + + yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + z_P = _mm256_add_ps(z_P, z); // Add the complex multiplication results together + + // Late + y = _mm256_loadu_ps((float*)_L_code); // Load the cr + ci, dr + di as cr,ci,dr,di + + yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + z_L = _mm256_add_ps(z_L, z); // Add the complex multiplication results together + + // VL + y = _mm256_loadu_ps((float*)_VL_code); // Load the cr + ci, dr + di as cr,ci,dr,di + + yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + z_VL = _mm256_add_ps(z_VL, z); // Add the complex multiplication results together + + /*pointer increment*/ + _carrier += 4; + _input += 4; + _VE_code += 4; + _E_code += 4; + _P_code += 4; + _L_code += 4; + _VL_code += 4; + } + __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_VE[4]; __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_E[4]; __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_P[4]; __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_L[4]; __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_VL[4]; - + _mm256_storeu_ps((float*)dotProductVector_VE,z_VE); // Store the results back into the dot product vector _mm256_storeu_ps((float*)dotProductVector_E,z_E); // Store the results back into the dot product vector _mm256_storeu_ps((float*)dotProductVector_P,z_P); // Store the results back into the dot product vector _mm256_storeu_ps((float*)dotProductVector_L,z_L); // Store the results back into the dot product vector _mm256_storeu_ps((float*)dotProductVector_VL,z_VL); // Store the results back into the dot product vector - + dotProduct_VE = ( dotProductVector_VE[0] + dotProductVector_VE[1] + dotProductVector_VE[2] + dotProductVector_VE[3] ); dotProduct_E = ( dotProductVector_E[0] + dotProductVector_E[1] + dotProductVector_E[2] + dotProductVector_E[3] ); dotProduct_P = ( dotProductVector_P[0] + dotProductVector_P[1] + dotProductVector_P[2] + dotProductVector_P[3] ); dotProduct_L = ( dotProductVector_L[0] + dotProductVector_L[1] + dotProductVector_L[2] + dotProductVector_L[3] ); dotProduct_VL = ( dotProductVector_VL[0] + 
dotProductVector_VL[1] + dotProductVector_VL[2] + dotProductVector_VL[3] ); - + for (unsigned int i = 0; i<(num_points % 4); ++i) - { - dotProduct_VE += (*_input) * (*_VE_code++) * (*_carrier); - dotProduct_E += (*_input) * (*_E_code++) * (*_carrier); - dotProduct_P += (*_input) * (*_P_code++) * (*_carrier); - dotProduct_L += (*_input) * (*_L_code++) * (*_carrier); - dotProduct_VL += (*_input++) * (*_VL_code++) * (*_carrier++); - } - + { + dotProduct_VE += (*_input) * (*_VE_code++) * (*_carrier); + dotProduct_E += (*_input) * (*_E_code++) * (*_carrier); + dotProduct_P += (*_input) * (*_P_code++) * (*_carrier); + dotProduct_L += (*_input) * (*_L_code++) * (*_carrier); + dotProduct_VL += (*_input++) * (*_VL_code++) * (*_carrier++); + } + *VE_out = dotProduct_VE; *E_out = dotProduct_E; *P_out = dotProduct_P; @@ -231,7 +231,7 @@ static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_u_avx(lv_32fc_t* VE #ifdef LV_HAVE_SSE3 #include - /*! +/*! \brief Performs the carrier wipe-off mixing and the VE, Early, Prompt, Late and VL correlation \param input The input signal input \param carrier The carrier signal input @@ -246,10 +246,10 @@ static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_u_avx(lv_32fc_t* VE \param L_out Early correlation output \param VL_out VL correlation output \param num_points The number of complex values in vectors - */ + */ static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_u_sse3(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* VE_code, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, const lv_32fc_t* VL_code, unsigned int num_points) { - unsigned int number = 0; + unsigned int number = 0; const unsigned int halfPoints = num_points / 2; lv_32fc_t dotProduct_VE; @@ -278,94 +278,94 @@ static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_u_sse3(lv_32fc_t* V const lv_32fc_t* _VL_code = VL_code; for(;number < halfPoints; number++) - { - // carrier wipe-off (vector point-to-point product) - x = _mm_loadu_ps((float*)_input); // Load the ar + ai, br + bi as ar,ai,br,bi - y = _mm_loadu_ps((float*)_carrier); // Load the cr + ci, dr + di as cr,ci,dr,di + { + // carrier wipe-off (vector point-to-point product) + x = _mm_loadu_ps((float*)_input); // Load the ar + ai, br + bi as ar,ai,br,bi + y = _mm_loadu_ps((float*)_carrier); // Load the cr + ci, dr + di as cr,ci,dr,di - yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - bb_signal_sample = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - bb_signal_sample_shuffled = _mm_shuffle_ps(bb_signal_sample,bb_signal_sample,0xB1); // Re-arrange bb_signal_sample to be ai,ar,bi,br + bb_signal_sample = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + bb_signal_sample_shuffled = _mm_shuffle_ps(bb_signal_sample,bb_signal_sample,0xB1); // Re-arrange bb_signal_sample to be ai,ar,bi,br - // 
correlation VE,E,P,L,VL (5x vector scalar product) - // VE - y = _mm_loadu_ps((float*)_VE_code); // Load the cr + ci, dr + di as cr,ci,dr,di - - yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di - - tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - - z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - z_VE = _mm_add_ps(z_VE, z); // Add the complex multiplication results together - - // Early - y = _mm_loadu_ps((float*)_E_code); // Load the cr + ci, dr + di as cr,ci,dr,di - - yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di - - tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - - z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - z_E = _mm_add_ps(z_E, z); // Add the complex multiplication results together + // correlation VE,E,P,L,VL (5x vector scalar product) + // VE + y = _mm_loadu_ps((float*)_VE_code); // Load the cr + ci, dr + di as cr,ci,dr,di - // Prompt - y = _mm_loadu_ps((float*)_P_code); // Load the cr + ci, dr + di as cr,ci,dr,di + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di - yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - - z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - z_P = _mm_add_ps(z_P, z); // Add the complex multiplication results together + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + z_VE = _mm_add_ps(z_VE, z); // Add the complex multiplication results together - // Late - y = _mm_loadu_ps((float*)_L_code); // Load the cr + ci, dr + di as cr,ci,dr,di + // Early + y = _mm_loadu_ps((float*)_E_code); // Load the cr + ci, dr + di as cr,ci,dr,di - yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - - z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - z_L = _mm_add_ps(z_L, z); // Add the complex multiplication results together - - // VL - //x = _mm_load_ps((float*)_input_BB); // Load the ar + ai, br + bi as ar,ai,br,bi - y = _mm_loadu_ps((float*)_VL_code); // Load the cr + ci, dr + di as cr,ci,dr,di - - yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di - - tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - - z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - z_VL = _mm_add_ps(z_VL, z); // Add the complex 
multiplication results together + tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - /*pointer increment*/ - _carrier += 2; - _input += 2; - _VE_code += 2; - _E_code += 2; - _P_code += 2; - _L_code +=2; - _VL_code +=2; - } + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + z_E = _mm_add_ps(z_E, z); // Add the complex multiplication results together + + // Prompt + y = _mm_loadu_ps((float*)_P_code); // Load the cr + ci, dr + di as cr,ci,dr,di + + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + z_P = _mm_add_ps(z_P, z); // Add the complex multiplication results together + + // Late + y = _mm_loadu_ps((float*)_L_code); // Load the cr + ci, dr + di as cr,ci,dr,di + + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + z_L = _mm_add_ps(z_L, z); // Add the complex multiplication results together + + // VL + //x = _mm_load_ps((float*)_input_BB); // Load the ar + ai, br + bi as ar,ai,br,bi + y = _mm_loadu_ps((float*)_VL_code); // Load the cr + ci, dr + di as cr,ci,dr,di + + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + z_VL = _mm_add_ps(z_VL, z); // Add the complex multiplication results together + + /*pointer increment*/ + _carrier += 2; + _input += 2; + _VE_code += 2; + _E_code += 2; + _P_code += 2; + _L_code +=2; + _VL_code +=2; + } __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_VE[2]; __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_E[2]; @@ -386,13 +386,13 @@ static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_u_sse3(lv_32fc_t* V dotProduct_VL = ( dotProductVector_VL[0] + dotProductVector_VL[1] ); if((num_points % 2) != 0) - { - dotProduct_VE += (*_input) * (*_VE_code)*(*_carrier); - dotProduct_E += (*_input) * (*_E_code)*(*_carrier); - dotProduct_P += (*_input) * (*_P_code)*(*_carrier); - dotProduct_L += (*_input) * (*_L_code)*(*_carrier); - dotProduct_VL += (*_input) * (*_VL_code)*(*_carrier); - } + { + dotProduct_VE += (*_input) * (*_VE_code)*(*_carrier); + dotProduct_E += (*_input) * (*_E_code)*(*_carrier); + dotProduct_P += (*_input) * (*_P_code)*(*_carrier); + dotProduct_L += (*_input) * (*_L_code)*(*_carrier); + dotProduct_VL += (*_input) * (*_VL_code)*(*_carrier); + } *VE_out = dotProduct_VE; *E_out = dotProduct_E; @@ -422,9 +422,9 @@ static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_u_sse3(lv_32fc_t* V static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_32fc_t* input, const 
lv_32fc_t* carrier, const lv_32fc_t* VE_code, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, const lv_32fc_t* VL_code, unsigned int num_points) { lv_32fc_t bb_signal_sample; - + bb_signal_sample = lv_cmake(0, 0); - + *VE_out = 0; *E_out = 0; *P_out = 0; @@ -432,16 +432,16 @@ static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_generic(lv_32fc_t* *VL_out = 0; // perform Early, Prompt and Late correlation for(unsigned int i=0; i < num_points; ++i) - { - //Perform the carrier wipe-off - bb_signal_sample = input[i] * carrier[i]; - // Now get early, late, and prompt values for each - *VE_out += bb_signal_sample * VE_code[i]; - *E_out += bb_signal_sample * E_code[i]; - *P_out += bb_signal_sample * P_code[i]; - *L_out += bb_signal_sample * L_code[i]; - *VL_out += bb_signal_sample * VL_code[i]; - } + { + //Perform the carrier wipe-off + bb_signal_sample = input[i] * carrier[i]; + // Now get early, late, and prompt values for each + *VE_out += bb_signal_sample * VE_code[i]; + *E_out += bb_signal_sample * E_code[i]; + *P_out += bb_signal_sample * P_code[i]; + *L_out += bb_signal_sample * L_code[i]; + *VL_out += bb_signal_sample * VL_code[i]; + } } #endif /* LV_HAVE_GENERIC */ @@ -478,25 +478,25 @@ static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_generic(lv_32fc_t* */ static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_a_avx(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* VE_code, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, const lv_32fc_t* VL_code, unsigned int num_points) { - unsigned int number = 0; + unsigned int number = 0; const unsigned int halfPoints = num_points / 4; - + lv_32fc_t dotProduct_VE; lv_32fc_t dotProduct_E; lv_32fc_t dotProduct_P; lv_32fc_t dotProduct_L; lv_32fc_t dotProduct_VL; - + // Aux vars __m256 x, y, yl, yh, z, tmp1, tmp2, z_VE, z_E, z_P, z_L, z_VL; __m256 bb_signal_sample, bb_signal_sample_shuffled; - + z_VE = _mm256_setzero_ps(); z_E = _mm256_setzero_ps(); z_P = _mm256_setzero_ps(); z_L = _mm256_setzero_ps(); z_VL = _mm256_setzero_ps(); - + //input and output vectors const lv_32fc_t* _input = input; const lv_32fc_t* _carrier = carrier; @@ -505,108 +505,108 @@ static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_a_avx(lv_32fc_t* VE const lv_32fc_t* _P_code = P_code; const lv_32fc_t* _L_code = L_code; const lv_32fc_t* _VL_code = VL_code; - + for(;number < halfPoints; number++) - { - // carrier wipe-off (vector point-to-point product) - x = _mm256_load_ps((float*)_input); // Load the ar + ai, br + bi as ar,ai,br,bi - y = _mm256_load_ps((float*)_carrier); // Load the cr + ci, dr + di as cr,ci,dr,di - - yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr - yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di - - tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - - x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br - - tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - - bb_signal_sample = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - bb_signal_sample_shuffled = _mm256_shuffle_ps(bb_signal_sample,bb_signal_sample,0xB1); // Re-arrange bb_signal_sample to be ai,ar,bi,br - - // correlation VE,E,P,L,VL (5x vector scalar product) - // VE - y = _mm256_load_ps((float*)_VE_code); // Load the cr + ci, dr + di as cr,ci,dr,di - - yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr - yh = 
_mm256_movehdup_ps(y); // Load yh with ci,ci,di,di - - tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - - z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - z_VE = _mm256_add_ps(z_VE, z); // Add the complex multiplication results together - - // Early - y = _mm256_load_ps((float*)_E_code); // Load the cr + ci, dr + di as cr,ci,dr,di - - yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr - yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di - - tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - - z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - z_E = _mm256_add_ps(z_E, z); // Add the complex multiplication results together - - // Prompt - y = _mm256_load_ps((float*)_P_code); // Load the cr + ci, dr + di as cr,ci,dr,di - - yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr - yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di - - tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - - z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - z_P = _mm256_add_ps(z_P, z); // Add the complex multiplication results together - - // Late - y = _mm256_load_ps((float*)_L_code); // Load the cr + ci, dr + di as cr,ci,dr,di - - yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr - yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di - - tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - - z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - z_L = _mm256_add_ps(z_L, z); // Add the complex multiplication results together - - // VL - y = _mm256_load_ps((float*)_VL_code); // Load the cr + ci, dr + di as cr,ci,dr,di - - yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr - yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di - - tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - - z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - z_VL = _mm256_add_ps(z_VL, z); // Add the complex multiplication results together - - /*pointer increment*/ - _carrier += 4; - _input += 4; - _VE_code += 4; - _E_code += 4; - _P_code += 4; - _L_code += 4; - _VL_code += 4; - } - + { + // carrier wipe-off (vector point-to-point product) + x = _mm256_load_ps((float*)_input); // Load the ar + ai, br + bi as ar,ai,br,bi + y = _mm256_load_ps((float*)_carrier); // Load the cr + ci, dr + di as cr,ci,dr,di + + yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + + x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + + tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + bb_signal_sample = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + bb_signal_sample_shuffled = _mm256_shuffle_ps(bb_signal_sample,bb_signal_sample,0xB1); // Re-arrange bb_signal_sample to be ai,ar,bi,br 
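// The four-instruction sequence above is the standard SSE3/AVX complex-multiply
// idiom, applied here to input*carrier and below to each code replica. Isolated
// as a helper it reads as follows (sketch only; the helper name is invented and
// is not part of this patch; requires immintrin.h and AVX support):
static inline __m256 avx_complex_mul(__m256 x, __m256 y)
{
    __m256 yl = _mm256_moveldup_ps(y);           // duplicate real parts: cr,cr,dr,dr,...
    __m256 yh = _mm256_movehdup_ps(y);           // duplicate imaginary parts: ci,ci,di,di,...
    __m256 tmp1 = _mm256_mul_ps(x, yl);          // ar*cr, ai*cr, br*dr, bi*dr,...
    __m256 xs = _mm256_shuffle_ps(x, x, 0xB1);   // swap re/im inside each pair: ai,ar,bi,br,...
    __m256 tmp2 = _mm256_mul_ps(xs, yh);         // ai*ci, ar*ci, bi*di, br*di,...
    return _mm256_addsub_ps(tmp1, tmp2);         // ar*cr-ai*ci, ai*cr+ar*ci,...: one complex product per re,im pair
}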
+ + // correlation VE,E,P,L,VL (5x vector scalar product) + // VE + y = _mm256_load_ps((float*)_VE_code); // Load the cr + ci, dr + di as cr,ci,dr,di + + yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + z_VE = _mm256_add_ps(z_VE, z); // Add the complex multiplication results together + + // Early + y = _mm256_load_ps((float*)_E_code); // Load the cr + ci, dr + di as cr,ci,dr,di + + yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + z_E = _mm256_add_ps(z_E, z); // Add the complex multiplication results together + + // Prompt + y = _mm256_load_ps((float*)_P_code); // Load the cr + ci, dr + di as cr,ci,dr,di + + yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + z_P = _mm256_add_ps(z_P, z); // Add the complex multiplication results together + + // Late + y = _mm256_load_ps((float*)_L_code); // Load the cr + ci, dr + di as cr,ci,dr,di + + yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + z_L = _mm256_add_ps(z_L, z); // Add the complex multiplication results together + + // VL + y = _mm256_load_ps((float*)_VL_code); // Load the cr + ci, dr + di as cr,ci,dr,di + + yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + z_VL = _mm256_add_ps(z_VL, z); // Add the complex multiplication results together + + /*pointer increment*/ + _carrier += 4; + _input += 4; + _VE_code += 4; + _E_code += 4; + _P_code += 4; + _L_code += 4; + _VL_code += 4; + } + __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_VE[4]; __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_E[4]; __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_P[4]; __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_L[4]; __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_VL[4]; - + _mm256_store_ps((float*)dotProductVector_VE,z_VE); // Store the results back into the dot product vector _mm256_store_ps((float*)dotProductVector_E,z_E); // Store the results back into the dot product vector _mm256_store_ps((float*)dotProductVector_P,z_P); // Store the results 
back into the dot product vector _mm256_store_ps((float*)dotProductVector_L,z_L); // Store the results back into the dot product vector _mm256_store_ps((float*)dotProductVector_VL,z_VL); // Store the results back into the dot product vector - + dotProduct_VE = ( dotProductVector_VE[0] + dotProductVector_VE[1] + dotProductVector_VE[2] + dotProductVector_VE[3] ); dotProduct_E = ( dotProductVector_E[0] + dotProductVector_E[1] + dotProductVector_E[2] + dotProductVector_E[3] ); dotProduct_P = ( dotProductVector_P[0] + dotProductVector_P[1] + dotProductVector_P[2] + dotProductVector_P[3] ); @@ -614,14 +614,14 @@ static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_a_avx(lv_32fc_t* VE dotProduct_VL = ( dotProductVector_VL[0] + dotProductVector_VL[1] + dotProductVector_VL[2] + dotProductVector_VL[3] ); for (unsigned int i = 0; i<(num_points % 4); ++i) - { - dotProduct_VE += (*_input) * (*_VE_code++) * (*_carrier); - dotProduct_E += (*_input) * (*_E_code++) * (*_carrier); - dotProduct_P += (*_input) * (*_P_code++) * (*_carrier); - dotProduct_L += (*_input) * (*_L_code++) * (*_carrier); - dotProduct_VL += (*_input++) * (*_VL_code++) * (*_carrier++); - } - + { + dotProduct_VE += (*_input) * (*_VE_code++) * (*_carrier); + dotProduct_E += (*_input) * (*_E_code++) * (*_carrier); + dotProduct_P += (*_input) * (*_P_code++) * (*_carrier); + dotProduct_L += (*_input) * (*_L_code++) * (*_carrier); + dotProduct_VL += (*_input++) * (*_VL_code++) * (*_carrier++); + } + *VE_out = dotProduct_VE; *E_out = dotProduct_E; *P_out = dotProduct_P; @@ -650,25 +650,25 @@ static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_a_avx(lv_32fc_t* VE */ static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_a_sse3(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* VE_code, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, const lv_32fc_t* VL_code, unsigned int num_points) { - unsigned int number = 0; + unsigned int number = 0; const unsigned int halfPoints = num_points / 2; - + lv_32fc_t dotProduct_VE; lv_32fc_t dotProduct_E; lv_32fc_t dotProduct_P; lv_32fc_t dotProduct_L; lv_32fc_t dotProduct_VL; - + // Aux vars __m128 x, y, yl, yh, z, tmp1, tmp2, z_VE, z_E, z_P, z_L, z_VL; __m128 bb_signal_sample, bb_signal_sample_shuffled; - + z_VE = _mm_setzero_ps(); z_E = _mm_setzero_ps(); z_P = _mm_setzero_ps(); z_L = _mm_setzero_ps(); z_VL = _mm_setzero_ps(); - + //input and output vectors const lv_32fc_t* _input = input; const lv_32fc_t* _carrier = carrier; @@ -677,124 +677,124 @@ static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_a_sse3(lv_32fc_t* V const lv_32fc_t* _P_code = P_code; const lv_32fc_t* _L_code = L_code; const lv_32fc_t* _VL_code = VL_code; - + for(;number < halfPoints; number++) - { - // carrier wipe-off (vector point-to-point product) - x = _mm_load_ps((float*)_input); // Load the ar + ai, br + bi as ar,ai,br,bi - y = _mm_load_ps((float*)_carrier); // Load the cr + ci, dr + di as cr,ci,dr,di - - yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di - - tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - - x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br - - tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - - bb_signal_sample = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - bb_signal_sample_shuffled = 
_mm_shuffle_ps(bb_signal_sample,bb_signal_sample,0xB1); // Re-arrange bb_signal_sample to be ai,ar,bi,br - - // correlation VE,E,P,L,VL (5x vector scalar product) - // VE - y = _mm_load_ps((float*)_VE_code); // Load the cr + ci, dr + di as cr,ci,dr,di - - yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di - - tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - - z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - z_VE = _mm_add_ps(z_VE, z); // Add the complex multiplication results together - - // Early - y = _mm_load_ps((float*)_E_code); // Load the cr + ci, dr + di as cr,ci,dr,di - - yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di - - tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - - z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - z_E = _mm_add_ps(z_E, z); // Add the complex multiplication results together - - // Prompt - y = _mm_load_ps((float*)_P_code); // Load the cr + ci, dr + di as cr,ci,dr,di - - yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di - - tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - - z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - z_P = _mm_add_ps(z_P, z); // Add the complex multiplication results together - - // Late - y = _mm_load_ps((float*)_L_code); // Load the cr + ci, dr + di as cr,ci,dr,di - - yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di - - tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - - z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - z_L = _mm_add_ps(z_L, z); // Add the complex multiplication results together - - // VL - //x = _mm_load_ps((float*)_input_BB); // Load the ar + ai, br + bi as ar,ai,br,bi - y = _mm_load_ps((float*)_VL_code); // Load the cr + ci, dr + di as cr,ci,dr,di - - yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di - - tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - - z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - z_VL = _mm_add_ps(z_VL, z); // Add the complex multiplication results together - - /*pointer increment*/ - _carrier += 2; - _input += 2; - _VE_code += 2; - _E_code += 2; - _P_code += 2; - _L_code +=2; - _VL_code +=2; - } - + { + // carrier wipe-off (vector point-to-point product) + x = _mm_load_ps((float*)_input); // Load the ar + ai, br + bi as ar,ai,br,bi + y = _mm_load_ps((float*)_carrier); // Load the cr + ci, dr + di as cr,ci,dr,di + + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + + tmp2 = _mm_mul_ps(x,yh); 
// tmp2 = ai*ci,ar*ci,bi*di,br*di + + bb_signal_sample = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + bb_signal_sample_shuffled = _mm_shuffle_ps(bb_signal_sample,bb_signal_sample,0xB1); // Re-arrange bb_signal_sample to be ai,ar,bi,br + + // correlation VE,E,P,L,VL (5x vector scalar product) + // VE + y = _mm_load_ps((float*)_VE_code); // Load the cr + ci, dr + di as cr,ci,dr,di + + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + z_VE = _mm_add_ps(z_VE, z); // Add the complex multiplication results together + + // Early + y = _mm_load_ps((float*)_E_code); // Load the cr + ci, dr + di as cr,ci,dr,di + + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + z_E = _mm_add_ps(z_E, z); // Add the complex multiplication results together + + // Prompt + y = _mm_load_ps((float*)_P_code); // Load the cr + ci, dr + di as cr,ci,dr,di + + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + z_P = _mm_add_ps(z_P, z); // Add the complex multiplication results together + + // Late + y = _mm_load_ps((float*)_L_code); // Load the cr + ci, dr + di as cr,ci,dr,di + + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + z_L = _mm_add_ps(z_L, z); // Add the complex multiplication results together + + // VL + //x = _mm_load_ps((float*)_input_BB); // Load the ar + ai, br + bi as ar,ai,br,bi + y = _mm_load_ps((float*)_VL_code); // Load the cr + ci, dr + di as cr,ci,dr,di + + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + z_VL = _mm_add_ps(z_VL, z); // Add the complex multiplication results together + + /*pointer increment*/ + _carrier += 2; + _input += 2; + _VE_code += 2; + _E_code += 2; + _P_code += 2; + _L_code +=2; + _VL_code +=2; + } + __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_VE[2]; __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_E[2]; __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_P[2]; __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_L[2]; __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_VL[2]; - + 
_mm_store_ps((float*)dotProductVector_VE,z_VE); // Store the results back into the dot product vector _mm_store_ps((float*)dotProductVector_E,z_E); // Store the results back into the dot product vector _mm_store_ps((float*)dotProductVector_P,z_P); // Store the results back into the dot product vector _mm_store_ps((float*)dotProductVector_L,z_L); // Store the results back into the dot product vector _mm_store_ps((float*)dotProductVector_VL,z_VL); // Store the results back into the dot product vector - + dotProduct_VE = ( dotProductVector_VE[0] + dotProductVector_VE[1] ); dotProduct_E = ( dotProductVector_E[0] + dotProductVector_E[1] ); dotProduct_P = ( dotProductVector_P[0] + dotProductVector_P[1] ); dotProduct_L = ( dotProductVector_L[0] + dotProductVector_L[1] ); dotProduct_VL = ( dotProductVector_VL[0] + dotProductVector_VL[1] ); - + if((num_points % 2) != 0) - { - dotProduct_VE += (*_input) * (*_VE_code)*(*_carrier); - dotProduct_E += (*_input) * (*_E_code)*(*_carrier); - dotProduct_P += (*_input) * (*_P_code)*(*_carrier); - dotProduct_L += (*_input) * (*_L_code)*(*_carrier); - dotProduct_VL += (*_input) * (*_VL_code)*(*_carrier); - } + { + dotProduct_VE += (*_input) * (*_VE_code)*(*_carrier); + dotProduct_E += (*_input) * (*_E_code)*(*_carrier); + dotProduct_P += (*_input) * (*_P_code)*(*_carrier); + dotProduct_L += (*_input) * (*_L_code)*(*_carrier); + dotProduct_VL += (*_input) * (*_VL_code)*(*_carrier); + } + *VE_out = dotProduct_VE; *E_out = dotProduct_E; *P_out = dotProduct_P; @@ -823,9 +823,9 @@ static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_a_sse3(lv_32fc_t* V static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_a_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* VE_code, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, const lv_32fc_t* VL_code, unsigned int num_points) { lv_32fc_t bb_signal_sample; - + bb_signal_sample = lv_cmake(0, 0); - + *VE_out = 0; *E_out = 0; *P_out = 0; @@ -833,16 +833,16 @@ static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_a_generic(lv_32fc_t *VL_out = 0; // perform Early, Prompt and Late correlation for(unsigned int i=0; i < num_points; ++i) - { - //Perform the carrier wipe-off - bb_signal_sample = input[i] * carrier[i]; - // Now get early, late, and prompt values for each - *VE_out += bb_signal_sample * VE_code[i]; - *E_out += bb_signal_sample * E_code[i]; - *P_out += bb_signal_sample * P_code[i]; - *L_out += bb_signal_sample * L_code[i]; - *VL_out += bb_signal_sample * VL_code[i]; - } + { + //Perform the carrier wipe-off + bb_signal_sample = input[i] * carrier[i]; + // Now get early, late, and prompt values for each + *VE_out += bb_signal_sample * VE_code[i]; + *E_out += bb_signal_sample * E_code[i]; + *P_out += bb_signal_sample * P_code[i]; + *L_out += bb_signal_sample * L_code[i]; + *VL_out += bb_signal_sample * VL_code[i]; + } } #endif /* LV_HAVE_GENERIC */ #endif /* INCLUDED_gnsssdr_volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_a_H */
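Every kernel in this file computes the same five complex dot products, VE/E/P/L/VL = sum over n of input[n] * carrier[n] * code[n]; the variants differ only in instruction set (AVX, SSE3, generic C) and in whether the buffers must be aligned, with a scalar tail loop picking up the samples left over by the vector iterations. A minimal call-site sketch against the generic version (variable names invented, not from the patch):

    /* One integration: accumulate the five correlator outputs over
       num_points complex samples; all arrays hold lv_32fc_t values. */
    lv_32fc_t ve, e, p, l, vl;
    volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_generic(&ve, &e, &p, &l, &vl,
            input, carrier, ve_code, e_code, p_code, l_code, vl_code, num_points);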
diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_64f_accumulator_64f.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_64f_accumulator_64f.h index 225da74c5..3da9318da 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_64f_accumulator_64f.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_64f_accumulator_64f.h @@ -2,7 +2,7 @@ * \file volk_gnsssdr_64f_accumulator_64f.h * \brief Volk protokernel: 64 bits (double) scalar accumulator * \authors
- * Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+ * Andres Cecilia, 2014. a.cecilia.luque(at)gmail.com
*
* * Volk protokernel that implements an accumulator of char values @@ -19,7 +19,7 @@ * GNSS-SDR is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or - * at your option) any later version. + * (at your option) any later version. * * GNSS-SDR is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_accumulator_s8i.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_accumulator_s8i.h index 4908d9a89..88be168d7 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_accumulator_s8i.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_accumulator_s8i.h @@ -2,7 +2,7 @@ * \file volk_gnsssdr_8i_accumulator_s8i.h * \brief Volk protokernel: 8 bits (char) scalar accumulator * \authors
- * Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+ * Andres Cecilia, 2014. a.cecilia.luque(at)gmail.com
*
* * Volk protokernel that implements an accumulator of char values @@ -19,7 +19,7 @@ * GNSS-SDR is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or - * at your option) any later version. + * (at your option) any later version. * * GNSS-SDR is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of @@ -47,31 +47,35 @@ \param inputBuffer The buffer of data to be accumulated \param num_points The number of values in inputBuffer to be accumulated */ -static inline void volk_gnsssdr_8i_accumulator_s8i_u_sse3(char* result, const char* inputBuffer, unsigned int num_points){ +static inline void volk_gnsssdr_8i_accumulator_s8i_u_sse3(char* result, const char* inputBuffer, unsigned int num_points) +{ char returnValue = 0; const unsigned int sse_iters = num_points / 16; - + const char* aPtr = inputBuffer; - + __VOLK_ATTR_ALIGNED(16) char tempBuffer[16]; __m128i accumulator = _mm_setzero_si128(); __m128i aVal = _mm_setzero_si128(); - - for(unsigned int number = 0; number < sse_iters; number++){ - aVal = _mm_lddqu_si128((__m128i*)aPtr); - accumulator = _mm_add_epi8(accumulator, aVal); - aPtr += 16; - } + + for(unsigned int number = 0; number < sse_iters; number++) + { + aVal = _mm_lddqu_si128((__m128i*)aPtr); + accumulator = _mm_add_epi8(accumulator, aVal); + aPtr += 16; + } _mm_storeu_si128((__m128i*)tempBuffer,accumulator); - - for(unsigned int i = 0; i<16; ++i){ - returnValue += tempBuffer[i]; - } - - for(unsigned int i = 0; i<(num_points % 16); ++i){ - returnValue += (*aPtr++); - } - + + for(unsigned int i = 0; i<16; ++i) + { + returnValue += tempBuffer[i]; + } + + for(unsigned int i = 0; i<(num_points % 16); ++i) + { + returnValue += (*aPtr++); + } + *result = returnValue; } #endif /* LV_HAVE_SSE3 */ @@ -83,13 +87,15 @@ static inline void volk_gnsssdr_8i_accumulator_s8i_u_sse3(char* result, const ch \param inputBuffer The buffer of data to be accumulated \param num_points The number of values in inputBuffer to be accumulated */ -static inline void volk_gnsssdr_8i_accumulator_s8i_generic(char* result, const char* inputBuffer, unsigned int num_points){ +static inline void volk_gnsssdr_8i_accumulator_s8i_generic(char* result, const char* inputBuffer, unsigned int num_points) +{ const char* aPtr = inputBuffer; char returnValue = 0; - - for(unsigned int number = 0;number < num_points; number++){ - returnValue += (*aPtr++); - } + + for(unsigned int number = 0;number < num_points; number++) + { + returnValue += (*aPtr++); + } *result = returnValue; } #endif /* LV_HAVE_GENERIC */ @@ -112,31 +118,34 @@ static inline void volk_gnsssdr_8i_accumulator_s8i_generic(char* result, const c \param inputBuffer The buffer of data to be accumulated \param num_points The number of values in inputBuffer to be accumulated */ -static inline void volk_gnsssdr_8i_accumulator_s8i_a_sse3(char* result, const char* inputBuffer, unsigned int num_points){ +static inline void volk_gnsssdr_8i_accumulator_s8i_a_sse3(char* result, const char* inputBuffer, unsigned int num_points) +{ char returnValue = 0; const unsigned int sse_iters = num_points / 16; - + const char* aPtr = inputBuffer; - + __VOLK_ATTR_ALIGNED(16) char tempBuffer[16]; __m128i accumulator = _mm_setzero_si128(); __m128i aVal = _mm_setzero_si128(); - - for(unsigned int number = 0; number < sse_iters; number++){ - aVal = _mm_load_si128((__m128i*)aPtr); - accumulator = 
_mm_add_epi8(accumulator, aVal); - aPtr += 16; - } + + for(unsigned int number = 0; number < sse_iters; number++) + { + aVal = _mm_load_si128((__m128i*)aPtr); + accumulator = _mm_add_epi8(accumulator, aVal); + aPtr += 16; + } _mm_store_si128((__m128i*)tempBuffer,accumulator); - + for(unsigned int i = 0; i<16; ++i){ - returnValue += tempBuffer[i]; + returnValue += tempBuffer[i]; } - - for(unsigned int i = 0; i<(num_points % 16); ++i){ - returnValue += (*aPtr++); - } - + + for(unsigned int i = 0; i<(num_points % 16); ++i) + { + returnValue += (*aPtr++); + } + *result = returnValue; } #endif /* LV_HAVE_SSE3 */ @@ -148,13 +157,15 @@ static inline void volk_gnsssdr_8i_accumulator_s8i_a_sse3(char* result, const ch \param inputBuffer The buffer of data to be accumulated \param num_points The number of values in inputBuffer to be accumulated */ -static inline void volk_gnsssdr_8i_accumulator_s8i_a_generic(char* result, const char* inputBuffer, unsigned int num_points){ +static inline void volk_gnsssdr_8i_accumulator_s8i_a_generic(char* result, const char* inputBuffer, unsigned int num_points) +{ const char* aPtr = inputBuffer; char returnValue = 0; - - for(unsigned int number = 0;number < num_points; number++){ - returnValue += (*aPtr++); - } + + for(unsigned int number = 0;number < num_points; number++) + { + returnValue += (*aPtr++); + } *result = returnValue; } #endif /* LV_HAVE_GENERIC */ @@ -167,14 +178,15 @@ static inline void volk_gnsssdr_8i_accumulator_s8i_a_generic(char* result, const \param num_points The number of values in inputBuffer to be accumulated */ extern void volk_gnsssdr_8i_accumulator_s8i_a_orc_impl(short* result, const char* inputBuffer, unsigned int num_points); -static inline void volk_gnsssdr_8i_accumulator_s8i_u_orc(char* result, const char* inputBuffer, unsigned int num_points){ - + +static inline void volk_gnsssdr_8i_accumulator_s8i_u_orc(char* result, const char* inputBuffer, unsigned int num_points) +{ short res = 0; char* resc = (char*)&res; resc++; - + volk_gnsssdr_8i_accumulator_s8i_a_orc_impl(&res, inputBuffer, num_points); - + *result = *resc; } #endif /* LV_HAVE_ORC */ diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_index_max_16u.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_index_max_16u.h index 8d96476da..b493f8b1d 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_index_max_16u.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_index_max_16u.h @@ -2,7 +2,7 @@ * \file volk_gnsssdr_8i_index_max_16u.h * \brief Volk protokernel: calculates the index of the maximum value in a group of 8 bits (char) scalars * \authors
- * Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+ * Andres Cecilia, 2014. a.cecilia.luque(at)gmail.com
*
* * Volk protokernel that returns the index of the maximum value of a group of 8 bits (char) scalars @@ -19,7 +19,7 @@ * GNSS-SDR is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or - * at your option) any later version. + * (at your option) any later version. * * GNSS-SDR is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_max_s8i.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_max_s8i.h index 68ba8fe3b..54eb7e7e2 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_max_s8i.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_max_s8i.h @@ -2,7 +2,7 @@ * \file volk_gnsssdr_8i_max_s8i.h * \brief Volk protokernel: calculates the maximum value in a group of 8 bits (char) scalars * \authors
- * Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+ * Andres Cecilia, 2014. a.cecilia.luque(at)gmail.com
*
* * Volk protokernel that returns the maximum value of a group of 8 bits (char) scalars @@ -19,7 +19,7 @@ * GNSS-SDR is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or - * at your option) any later version. + * (at your option) any later version. * * GNSS-SDR is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of @@ -47,44 +47,46 @@ \param src0 The buffer of data to be analysed \param num_points The number of values in src0 to be analysed */ -static inline void volk_gnsssdr_8i_max_s8i_u_sse4_1(char target, const char* src0, unsigned int num_points) { - if(num_points > 0){ - const unsigned int sse_iters = num_points / 16; - - char* inputPtr = (char*)src0; - char max = src0[0]; - __VOLK_ATTR_ALIGNED(16) char maxValuesBuffer[16]; - __m128i maxValues, compareResults, currentValues; - - maxValues = _mm_set1_epi8(max); - - for(unsigned int number = 0; number < sse_iters; number++) +static inline void volk_gnsssdr_8i_max_s8i_u_sse4_1(char target, const char* src0, unsigned int num_points) +{ + if(num_points > 0) { - currentValues = _mm_loadu_si128((__m128i*)inputPtr); - compareResults = _mm_cmpgt_epi8(maxValues, currentValues); - maxValues = _mm_blendv_epi8(currentValues, maxValues, compareResults); - inputPtr += 16; + const unsigned int sse_iters = num_points / 16; + + char* inputPtr = (char*)src0; + char max = src0[0]; + __VOLK_ATTR_ALIGNED(16) char maxValuesBuffer[16]; + __m128i maxValues, compareResults, currentValues; + + maxValues = _mm_set1_epi8(max); + + for(unsigned int number = 0; number < sse_iters; number++) + { + currentValues = _mm_loadu_si128((__m128i*)inputPtr); + compareResults = _mm_cmpgt_epi8(maxValues, currentValues); + maxValues = _mm_blendv_epi8(currentValues, maxValues, compareResults); + inputPtr += 16; + } + + _mm_storeu_si128((__m128i*)maxValuesBuffer, maxValues); + + for(unsigned int i = 0; i<16; ++i) + { + if(maxValuesBuffer[i] > max) + { + max = maxValuesBuffer[i]; + } + } + + for(unsigned int i = 0; i<(num_points % 16); ++i) + { + if(src0[i] > max) + { + max = src0[i]; + } + } + target = max; } - - _mm_storeu_si128((__m128i*)maxValuesBuffer, maxValues); - - for(unsigned int i = 0; i<16; ++i) - { - if(maxValuesBuffer[i] > max) - { - max = maxValuesBuffer[i]; - } - } - - for(unsigned int i = 0; i<(num_points % 16); ++i) - { - if(src0[i] > max) - { - max = src0[i]; - } - } - target = max; - } } #endif /*LV_HAVE_SSE4_1*/ @@ -97,55 +99,57 @@ static inline void volk_gnsssdr_8i_max_s8i_u_sse4_1(char target, const char* src \param src0 The buffer of data to be analysed \param num_points The number of values in src0 to be analysed */ -static inline void volk_gnsssdr_8i_max_s8i_u_sse2(char target, const char* src0, unsigned int num_points) { - if(num_points > 0){ - const unsigned int sse_iters = num_points / 16; - - char* inputPtr = (char*)src0; - char max = src0[0]; - unsigned short mask; - __VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16]; - __m128i maxValues, compareResults, currentValues; - - maxValues = _mm_set1_epi8(max); - - for(unsigned int number = 0; number < sse_iters; number++) +static inline void volk_gnsssdr_8i_max_s8i_u_sse2(char target, const char* src0, unsigned int num_points) +{ + if(num_points > 0) { - currentValues = _mm_loadu_si128((__m128i*)inputPtr); - compareResults = _mm_cmpgt_epi8(maxValues, currentValues); - mask = 
_mm_movemask_epi8(compareResults); - - if (mask != 0xFFFF) - { - _mm_storeu_si128((__m128i*)&currentValuesBuffer, currentValues); - mask = ~mask; - int i = 0; - while (mask > 0) + const unsigned int sse_iters = num_points / 16; + + char* inputPtr = (char*)src0; + char max = src0[0]; + unsigned short mask; + __VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16]; + __m128i maxValues, compareResults, currentValues; + + maxValues = _mm_set1_epi8(max); + + for(unsigned int number = 0; number < sse_iters; number++) { - if ((mask & 1) == 1) - { - if(currentValuesBuffer[i] > max) + currentValues = _mm_loadu_si128((__m128i*)inputPtr); + compareResults = _mm_cmpgt_epi8(maxValues, currentValues); + mask = _mm_movemask_epi8(compareResults); + + if (mask != 0xFFFF) { - max = currentValuesBuffer[i]; + _mm_storeu_si128((__m128i*)&currentValuesBuffer, currentValues); + mask = ~mask; + int i = 0; + while (mask > 0) + { + if ((mask & 1) == 1) + { + if(currentValuesBuffer[i] > max) + { + max = currentValuesBuffer[i]; + } + } + i++; + mask >>= 1; + } + maxValues = _mm_set1_epi8(max); } - } - i++; - mask >>= 1; + inputPtr += 16; } - maxValues = _mm_set1_epi8(max); - } - inputPtr += 16; + + for(unsigned int i = 0; i<(num_points % 16); ++i) + { + if(src0[i] > max) + { + max = src0[i]; + } + } + target = max; } - - for(unsigned int i = 0; i<(num_points % 16); ++i) - { - if(src0[i] > max) - { - max = src0[i]; - } - } - target = max; - } } #endif /*LV_HAVE_SSE2*/ @@ -157,20 +161,21 @@ static inline void volk_gnsssdr_8i_max_s8i_u_sse2(char target, const char* src0, \param src0 The buffer of data to be analysed \param num_points The number of values in src0 to be analysed */ -static inline void volk_gnsssdr_8i_max_s8i_generic(char target, const char* src0, unsigned int num_points) { +static inline void volk_gnsssdr_8i_max_s8i_generic(char target, const char* src0, unsigned int num_points) +{ if(num_points > 0) - { - char max = src0[0]; - - for(unsigned int i = 1; i < num_points; ++i) { - if(src0[i] > max) - { - max = src0[i]; - } + char max = src0[0]; + + for(unsigned int i = 1; i < num_points; ++i) + { + if(src0[i] > max) + { + max = src0[i]; + } + } + target = max; } - target = max; - } } #endif /*LV_HAVE_GENERIC*/
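The SSE4.1 variants in this file keep a lane-wise running maximum with _mm_cmpgt_epi8 plus _mm_blendv_epi8, while the SSE2 variants, lacking a signed byte maximum instruction, test the comparison movemask and drop to a scalar rescan of the offending lanes only when some lane beats the running maximum. On SSE4.1 the same reduction can also be written more compactly with _mm_max_epi8; a sketch under stated assumptions (helper name invented, not part of the patch; assumes num_points > 0):

    #include <smmintrin.h>  /* SSE4.1 */

    static inline char max_s8_sse41_sketch(const char* src, unsigned int n)
    {
        __m128i vmax = _mm_set1_epi8(src[0]);
        unsigned int i;
        for (i = 0; i + 16 <= n; i += 16)
            {
                /* lane-wise signed-byte maximum; no mask bookkeeping needed */
                vmax = _mm_max_epi8(vmax, _mm_loadu_si128((const __m128i*)(src + i)));
            }
        __VOLK_ATTR_ALIGNED(16) char buf[16];
        _mm_store_si128((__m128i*)buf, vmax);
        char m = buf[0];
        for (int k = 1; k < 16; ++k)
            {
                if (buf[k] > m) m = buf[k];
            }
        for (; i < n; ++i)
            {
                if (src[i] > m) m = src[i];  /* scalar tail */
            }
        return m;
    }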
@@ -193,44 +198,46 @@ static inline void volk_gnsssdr_8i_max_s8i_generic(char target, const char* src0 \param src0 The buffer of data to be analysed \param num_points The number of values in src0 to be analysed */ -static inline void volk_gnsssdr_8i_max_s8i_a_sse4_1(char target, const char* src0, unsigned int num_points) { - if(num_points > 0){ - const unsigned int sse_iters = num_points / 16; - - char* inputPtr = (char*)src0; - char max = src0[0]; - __VOLK_ATTR_ALIGNED(16) char maxValuesBuffer[16]; - __m128i maxValues, compareResults, currentValues; - - maxValues = _mm_set1_epi8(max); - - for(unsigned int number = 0; number < sse_iters; number++) +static inline void volk_gnsssdr_8i_max_s8i_a_sse4_1(char target, const char* src0, unsigned int num_points) +{ + if(num_points > 0) { - currentValues = _mm_load_si128((__m128i*)inputPtr); - compareResults = _mm_cmpgt_epi8(maxValues, currentValues); - maxValues = _mm_blendv_epi8(currentValues, maxValues, compareResults); - inputPtr += 16; + const unsigned int sse_iters = num_points / 16; + + char* inputPtr = (char*)src0; + char max = src0[0]; + __VOLK_ATTR_ALIGNED(16) char maxValuesBuffer[16]; + __m128i maxValues, compareResults, currentValues; + + maxValues = _mm_set1_epi8(max); + + for(unsigned int number = 0; number < sse_iters; number++) + { + currentValues = _mm_load_si128((__m128i*)inputPtr); + compareResults = _mm_cmpgt_epi8(maxValues, currentValues); + maxValues = _mm_blendv_epi8(currentValues, maxValues, compareResults); + inputPtr += 16; + } + + _mm_store_si128((__m128i*)maxValuesBuffer, maxValues); + + for(unsigned int i = 0; i<16; ++i) + { + if(maxValuesBuffer[i] > max) + { + max = maxValuesBuffer[i]; + } + } + + for(unsigned int i = 0; i<(num_points % 16); ++i) + { + if(src0[i] > max) + { + max = src0[i]; + } + } + target = max; } - - _mm_store_si128((__m128i*)maxValuesBuffer, maxValues); - - for(unsigned int i = 0; i<16; ++i) - { - if(maxValuesBuffer[i] > max) - { - max = maxValuesBuffer[i]; - } - } - - for(unsigned int i = 0; i<(num_points % 16); ++i) - { - if(src0[i] > max) - { - max = src0[i]; - } - } - target = max; - } } #endif /*LV_HAVE_SSE4_1*/ @@ -243,55 +250,57 @@ static inline void volk_gnsssdr_8i_max_s8i_a_sse4_1(char target, const char* src \param src0 The buffer of data to be analysed \param num_points The number of values in src0 to be analysed */ -static inline void volk_gnsssdr_8i_max_s8i_a_sse2(char target, const char* src0, unsigned int num_points) { - if(num_points > 0){ - const unsigned int sse_iters = num_points / 16; - - char* inputPtr = (char*)src0; - char max = src0[0]; - unsigned short mask; - __VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16]; - __m128i maxValues, compareResults, currentValues; - - maxValues = _mm_set1_epi8(max); - - for(unsigned int number = 0; number < sse_iters; number++) +static inline void volk_gnsssdr_8i_max_s8i_a_sse2(char target, const char* src0, unsigned int num_points) +{ + if(num_points > 0) { - currentValues = _mm_load_si128((__m128i*)inputPtr); - compareResults = _mm_cmpgt_epi8(maxValues, currentValues); - mask = _mm_movemask_epi8(compareResults); - - if (mask != 0xFFFF) - { - _mm_store_si128((__m128i*)&currentValuesBuffer, currentValues); - mask = ~mask; - int i = 0; - while (mask > 0) + const unsigned int sse_iters = num_points / 16; + + char* inputPtr = (char*)src0; + char max = src0[0]; + unsigned short mask; + __VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16]; + __m128i maxValues, compareResults, currentValues; + + maxValues = _mm_set1_epi8(max); + + for(unsigned int number = 0; number < sse_iters; number++) { - if ((mask & 1) == 1) - { - if(currentValuesBuffer[i] > max) + currentValues = _mm_load_si128((__m128i*)inputPtr); + compareResults = _mm_cmpgt_epi8(maxValues, currentValues); + mask = _mm_movemask_epi8(compareResults); + + if (mask != 0xFFFF) { - max = currentValuesBuffer[i]; + _mm_store_si128((__m128i*)&currentValuesBuffer, currentValues); + mask = ~mask; + int i = 0; + while (mask > 0) + { + if ((mask & 1) == 1) + { + if(currentValuesBuffer[i] > max) + { + max = currentValuesBuffer[i]; + } + } + i++; + mask >>= 1; + } + maxValues = _mm_set1_epi8(max); } - } - i++; - mask >>= 1; + inputPtr += 16; } - maxValues = _mm_set1_epi8(max); - } - inputPtr += 16; + + for(unsigned int i = 0; i<(num_points % 16); ++i) + { + if(src0[i] > max) + { + max = src0[i]; + } + } + target = max; } - - for(unsigned int i = 0; i<(num_points % 16); ++i) - { - if(src0[i] > max) - { - max = src0[i]; - } - } - target = max; - } } #endif /*LV_HAVE_SSE2*/ @@ -303,23 +312,23 @@ static inline void volk_gnsssdr_8i_max_s8i_a_sse2(char target, const char* src0, \param src0 The buffer of data to be analysed \param num_points The number of values in src0 to be analysed */ -static inline void 
volk_gnsssdr_8i_max_s8i_a_generic(char target, const char* src0, unsigned int num_points) +{ if(num_points > 0) - { - if(num_points > 0) { - char max = src0[0]; - - for(unsigned int i = 1; i < num_points; ++i) - { - if(src0[i] > max) + if(num_points > 0) { - max = src0[i]; + char max = src0[0]; + for(unsigned int i = 1; i < num_points; ++i) + { + if(src0[i] > max) + { + max = src0[i]; + } + } + target = max; } - } - target = max; } - } } #endif /*LV_HAVE_GENERIC*/ diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_x2_add_8i.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_x2_add_8i.h index bdfde476c..2e49acf25 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_x2_add_8i.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_x2_add_8i.h @@ -2,7 +2,7 @@ * \file volk_gnsssdr_8i_x2_add_8i.h * \brief Volk protokernel: adds pairs of 8 bits (char) scalars * \authors
- * Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+ * Andres Cecilia, 2014. a.cecilia.luque(at)gmail.com
 *
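 * A minimal usage sketch (illustrative only; the buffer size and contents are
 * made up, and real callers normally go through the volk_gnsssdr dispatcher
 * rather than invoking a protokernel directly):
 *
 *     char a[32], b[32], c[32];
 *     // ... fill a and b ...
 *     volk_gnsssdr_8i_x2_add_8i_generic(c, a, b, 32);
 *
 * Note that the additions are modular: _mm_add_epi8 wraps on overflow, with
 * no saturation.
 *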
* * Volk protokernel that adds pairs of 8 bits (char) scalars @@ -19,7 +19,7 @@ * GNSS-SDR is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or - * at your option) any later version. + * (at your option) any later version. * * GNSS-SDR is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of @@ -47,34 +47,34 @@ \param bVector One of the vectors to be added \param num_points The number of values in aVector and bVector to be added together and stored into cVector */ -static inline void volk_gnsssdr_8i_x2_add_8i_u_sse2(char* cVector, const char* aVector, const char* bVector, unsigned int num_points){ - +static inline void volk_gnsssdr_8i_x2_add_8i_u_sse2(char* cVector, const char* aVector, const char* bVector, unsigned int num_points) +{ const unsigned int sse_iters = num_points / 16; - + char* cPtr = cVector; const char* aPtr = aVector; const char* bPtr= bVector; - + __m128i aVal, bVal, cVal; - - for(unsigned int number = 0; number < sse_iters; number++){ - - aVal = _mm_loadu_si128((__m128i*)aPtr); - bVal = _mm_loadu_si128((__m128i*)bPtr); - - cVal = _mm_add_epi8(aVal, bVal); - - _mm_storeu_si128((__m128i*)cPtr,cVal); // Store the results back into the C container - - aPtr += 16; - bPtr += 16; - cPtr += 16; - } - + + for(unsigned int number = 0; number < sse_iters; number++) + { + aVal = _mm_loadu_si128((__m128i*)aPtr); + bVal = _mm_loadu_si128((__m128i*)bPtr); + + cVal = _mm_add_epi8(aVal, bVal); + + _mm_storeu_si128((__m128i*)cPtr,cVal); // Store the results back into the C container + + aPtr += 16; + bPtr += 16; + cPtr += 16; + } + for(unsigned int i = 0; i<(num_points % 16); ++i) - { - *cPtr++ = (*aPtr++) + (*bPtr++); - } + { + *cPtr++ = (*aPtr++) + (*bPtr++); + } } #endif /* LV_HAVE_SSE2 */ @@ -86,15 +86,17 @@ static inline void volk_gnsssdr_8i_x2_add_8i_u_sse2(char* cVector, const char* a \param bVector One of the vectors to be added \param num_points The number of values in aVector and bVector to be added together and stored into cVector */ -static inline void volk_gnsssdr_8i_x2_add_8i_generic(char* cVector, const char* aVector, const char* bVector, unsigned int num_points){ +static inline void volk_gnsssdr_8i_x2_add_8i_generic(char* cVector, const char* aVector, const char* bVector, unsigned int num_points) +{ char* cPtr = cVector; const char* aPtr = aVector; const char* bPtr= bVector; unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = (*aPtr++) + (*bPtr++); - } + + for(number = 0; number < num_points; number++) + { + *cPtr++ = (*aPtr++) + (*bPtr++); + } } #endif /* LV_HAVE_GENERIC */ @@ -116,34 +118,34 @@ static inline void volk_gnsssdr_8i_x2_add_8i_generic(char* cVector, const char* \param bVector One of the vectors to be added \param num_points The number of values in aVector and bVector to be added together and stored into cVector */ -static inline void volk_gnsssdr_8i_x2_add_8i_a_sse2(char* cVector, const char* aVector, const char* bVector, unsigned int num_points){ - +static inline void volk_gnsssdr_8i_x2_add_8i_a_sse2(char* cVector, const char* aVector, const char* bVector, unsigned int num_points) +{ const unsigned int sse_iters = num_points / 16; - + char* cPtr = cVector; const char* aPtr = aVector; const char* bPtr= bVector; - + __m128i aVal, bVal, cVal; - - for(unsigned int number = 0; number < sse_iters; number++){ - - aVal = 
_mm_load_si128((__m128i*)aPtr); - bVal = _mm_load_si128((__m128i*)bPtr); - - cVal = _mm_add_epi8(aVal, bVal); - - _mm_store_si128((__m128i*)cPtr,cVal); // Store the results back into the C container - - aPtr += 16; - bPtr += 16; - cPtr += 16; - } - + + for(unsigned int number = 0; number < sse_iters; number++) + { + aVal = _mm_load_si128((__m128i*)aPtr); + bVal = _mm_load_si128((__m128i*)bPtr); + + cVal = _mm_add_epi8(aVal, bVal); + + _mm_store_si128((__m128i*)cPtr,cVal); // Store the results back into the C container + + aPtr += 16; + bPtr += 16; + cPtr += 16; + } + for(unsigned int i = 0; i<(num_points % 16); ++i) - { - *cPtr++ = (*aPtr++) + (*bPtr++); - } + { + *cPtr++ = (*aPtr++) + (*bPtr++); + } } #endif /* LV_HAVE_SSE2 */ @@ -155,15 +157,17 @@ static inline void volk_gnsssdr_8i_x2_add_8i_a_sse2(char* cVector, const char* a \param bVector One of the vectors to be added \param num_points The number of values in aVector and bVector to be added together and stored into cVector */ -static inline void volk_gnsssdr_8i_x2_add_8i_a_generic(char* cVector, const char* aVector, const char* bVector, unsigned int num_points){ +static inline void volk_gnsssdr_8i_x2_add_8i_a_generic(char* cVector, const char* aVector, const char* bVector, unsigned int num_points) +{ char* cPtr = cVector; const char* aPtr = aVector; const char* bPtr= bVector; unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = (*aPtr++) + (*bPtr++); - } + + for(number = 0; number < num_points; number++) + { + *cPtr++ = (*aPtr++) + (*bPtr++); + } } #endif /* LV_HAVE_GENERIC */ @@ -176,7 +180,8 @@ static inline void volk_gnsssdr_8i_x2_add_8i_a_generic(char* cVector, const char \param num_points The number of values in aVector and bVector to be added together and stored into cVector */ extern void volk_gnsssdr_8i_x2_add_8i_a_orc_impl(char* cVector, const char* aVector, const char* bVector, unsigned int num_points); -static inline void volk_gnsssdr_8i_x2_add_8i_u_orc(char* cVector, const char* aVector, const char* bVector, unsigned int num_points){ +static inline void volk_gnsssdr_8i_x2_add_8i_u_orc(char* cVector, const char* aVector, const char* bVector, unsigned int num_points) +{ volk_gnsssdr_8i_x2_add_8i_a_orc_impl(cVector, aVector, bVector, num_points); } #endif /* LV_HAVE_ORC */ diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_conjugate_8ic.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_conjugate_8ic.h index ce8dad8c4..97618a15c 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_conjugate_8ic.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_conjugate_8ic.h @@ -2,7 +2,7 @@ * \file volk_gnsssdr_8ic_conjugate_8ic.h * \brief Volk protokernel: calculates the conjugate of a 16 bits vector * \authors
- * Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+ * Andres Cecilia, 2014. a.cecilia.luque(at)gmail.com
 *
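 * For instance, an input sample 3 + 4j is written to the output as 3 - 4j.
 * One corner case worth keeping in mind: an imaginary part of -128 has no
 * positive 8-bit counterpart, so it cannot be negated exactly.
 *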
* * Volk protokernel that calculates the conjugate of a @@ -20,7 +20,7 @@ * GNSS-SDR is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or - * at your option) any later version. + * (at your option) any later version. * * GNSS-SDR is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of @@ -48,37 +48,38 @@ \param aVector Vector to be conjugated \param num_points The number of unsigned char values in aVector to be conjugated and stored into cVector */ -static inline void volk_gnsssdr_8ic_conjugate_8ic_u_avx(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){ +static inline void volk_gnsssdr_8ic_conjugate_8ic_u_avx(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points) +{ const unsigned int sse_iters = num_points / 16; - + lv_8sc_t* c = cVector; const lv_8sc_t* a = aVector; - + __m256 tmp; __m128i tmp128lo, tmp128hi; __m256 conjugator1 = _mm256_castsi256_ps(_mm256_setr_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255)); __m128i conjugator2 = _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1); - + for (unsigned int i = 0; i < sse_iters; ++i) - { - tmp = _mm256_loadu_ps((float*)a); - tmp = _mm256_xor_ps(tmp, conjugator1); - tmp128lo = _mm256_castsi256_si128(_mm256_castps_si256(tmp)); - tmp128lo = _mm_add_epi8(tmp128lo, conjugator2); - tmp128hi = _mm256_extractf128_si256(_mm256_castps_si256(tmp),1); - tmp128hi = _mm_add_epi8(tmp128hi, conjugator2); - //tmp = _mm256_set_m128i(tmp128hi , tmp128lo); //not defined in some versions of immintrin.h - tmp = _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(tmp128lo),(tmp128hi),1)); - _mm256_storeu_ps((float*)c, tmp); - - a += 16; - c += 16; - } - + { + tmp = _mm256_loadu_ps((float*)a); + tmp = _mm256_xor_ps(tmp, conjugator1); + tmp128lo = _mm256_castsi256_si128(_mm256_castps_si256(tmp)); + tmp128lo = _mm_add_epi8(tmp128lo, conjugator2); + tmp128hi = _mm256_extractf128_si256(_mm256_castps_si256(tmp),1); + tmp128hi = _mm_add_epi8(tmp128hi, conjugator2); + //tmp = _mm256_set_m128i(tmp128hi , tmp128lo); //not defined in some versions of immintrin.h + tmp = _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(tmp128lo),(tmp128hi),1)); + _mm256_storeu_ps((float*)c, tmp); + + a += 16; + c += 16; + } + for (unsigned int i = 0; i<(num_points % 16); ++i) - { - *c++ = lv_conj(*a++); - } + { + *c++ = lv_conj(*a++); + } } #endif /* LV_HAVE_AVX */ @@ -90,29 +91,30 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_u_avx(lv_8sc_t* cVector, const \param aVector Vector to be conjugated \param num_points The number of unsigned char values in aVector to be conjugated and stored into cVector */ -static inline void volk_gnsssdr_8ic_conjugate_8ic_u_ssse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){ +static inline void volk_gnsssdr_8ic_conjugate_8ic_u_ssse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points) +{ const unsigned int sse_iters = num_points / 8; - + lv_8sc_t* c = cVector; const lv_8sc_t* a = aVector; __m128i tmp; - + __m128i conjugator = _mm_setr_epi8(1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1); - + for (unsigned int i = 0; i < sse_iters; ++i) - { - tmp = _mm_lddqu_si128((__m128i*)a); - tmp = _mm_sign_epi8(tmp, conjugator); - _mm_storeu_si128((__m128i*)c, 
tmp); - a += 8; - c += 8; - } - + { + tmp = _mm_lddqu_si128((__m128i*)a); + tmp = _mm_sign_epi8(tmp, conjugator); + _mm_storeu_si128((__m128i*)c, tmp); + a += 8; + c += 8; + } + for (unsigned int i = 0; i<(num_points % 8); ++i) - { - *c++ = lv_conj(*a++); - } - + { + *c++ = lv_conj(*a++); + } + } #endif /* LV_HAVE_SSSE3 */ @@ -124,31 +126,32 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_u_ssse3(lv_8sc_t* cVector, con \param aVector Vector to be conjugated \param num_points The number of unsigned char values in aVector to be conjugated and stored into cVector */ -static inline void volk_gnsssdr_8ic_conjugate_8ic_u_sse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){ +static inline void volk_gnsssdr_8ic_conjugate_8ic_u_sse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points) +{ const unsigned int sse_iters = num_points / 8; - + lv_8sc_t* c = cVector; const lv_8sc_t* a = aVector; __m128i tmp; - + __m128i conjugator1 = _mm_setr_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); __m128i conjugator2 = _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1); - + for (unsigned int i = 0; i < sse_iters; ++i) - { - tmp = _mm_lddqu_si128((__m128i*)a); - tmp = _mm_xor_si128(tmp, conjugator1); - tmp = _mm_add_epi8(tmp, conjugator2); - _mm_storeu_si128((__m128i*)c, tmp); - a += 8; - c += 8; - } - + { + tmp = _mm_lddqu_si128((__m128i*)a); + tmp = _mm_xor_si128(tmp, conjugator1); + tmp = _mm_add_epi8(tmp, conjugator2); + _mm_storeu_si128((__m128i*)c, tmp); + a += 8; + c += 8; + } + for (unsigned int i = 0; i<(num_points % 8); ++i) - { - *c++ = lv_conj(*a++); - } - + { + *c++ = lv_conj(*a++); + } + } #endif /* LV_HAVE_SSE3 */ @@ -159,14 +162,16 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_u_sse3(lv_8sc_t* cVector, cons \param aVector Vector to be conjugated \param num_points The number of unsigned char values in aVector to be conjugated and stored into cVector */ -static inline void volk_gnsssdr_8ic_conjugate_8ic_generic(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){ +static inline void volk_gnsssdr_8ic_conjugate_8ic_generic(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points) +{ lv_8sc_t* cPtr = cVector; const lv_8sc_t* aPtr = aVector; unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = lv_conj(*aPtr++); - } + + for(number = 0; number < num_points; number++) + { + *cPtr++ = lv_conj(*aPtr++); + } } #endif /* LV_HAVE_GENERIC */ @@ -188,37 +193,38 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_generic(lv_8sc_t* cVector, con \param aVector Vector to be conjugated \param num_points The number of unsigned char values in aVector to be conjugated and stored into cVector */ -static inline void volk_gnsssdr_8ic_conjugate_8ic_a_avx(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){ +static inline void volk_gnsssdr_8ic_conjugate_8ic_a_avx(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points) +{ const unsigned int sse_iters = num_points / 16; - + lv_8sc_t* c = cVector; const lv_8sc_t* a = aVector; - + __m256 tmp; __m128i tmp128lo, tmp128hi; __m256 conjugator1 = _mm256_castsi256_ps(_mm256_setr_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255)); __m128i conjugator2 = _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1); - + for (unsigned int i = 0; i < sse_iters; ++i) - { - tmp = _mm256_load_ps((float*)a); - tmp = _mm256_xor_ps(tmp, 
conjugator1); - tmp128lo = _mm256_castsi256_si128(_mm256_castps_si256(tmp)); - tmp128lo = _mm_add_epi8(tmp128lo, conjugator2); - tmp128hi = _mm256_extractf128_si256(_mm256_castps_si256(tmp),1); - tmp128hi = _mm_add_epi8(tmp128hi, conjugator2); - //tmp = _mm256_set_m128i(tmp128hi , tmp128lo); //not defined in some versions of immintrin.h - tmp = _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(tmp128lo),(tmp128hi),1)); - _mm256_store_ps((float*)c, tmp); - - a += 16; - c += 16; - } - + { + tmp = _mm256_load_ps((float*)a); + tmp = _mm256_xor_ps(tmp, conjugator1); + tmp128lo = _mm256_castsi256_si128(_mm256_castps_si256(tmp)); + tmp128lo = _mm_add_epi8(tmp128lo, conjugator2); + tmp128hi = _mm256_extractf128_si256(_mm256_castps_si256(tmp),1); + tmp128hi = _mm_add_epi8(tmp128hi, conjugator2); + //tmp = _mm256_set_m128i(tmp128hi , tmp128lo); //not defined in some versions of immintrin.h + tmp = _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(tmp128lo),(tmp128hi),1)); + _mm256_store_ps((float*)c, tmp); + + a += 16; + c += 16; + } + for (unsigned int i = 0; i<(num_points % 16); ++i) - { - *c++ = lv_conj(*a++); - } + { + *c++ = lv_conj(*a++); + } } #endif /* LV_HAVE_AVX */ @@ -230,29 +236,30 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_a_avx(lv_8sc_t* cVector, const \param aVector Vector to be conjugated \param num_points The number of unsigned char values in aVector to be conjugated and stored into cVector */ -static inline void volk_gnsssdr_8ic_conjugate_8ic_a_ssse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){ +static inline void volk_gnsssdr_8ic_conjugate_8ic_a_ssse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points) +{ const unsigned int sse_iters = num_points / 8; - + lv_8sc_t* c = cVector; const lv_8sc_t* a = aVector; __m128i tmp; - + __m128i conjugator = _mm_setr_epi8(1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1); - + for (unsigned int i = 0; i < sse_iters; ++i) - { - tmp = _mm_load_si128((__m128i*)a); - tmp = _mm_sign_epi8(tmp, conjugator); - _mm_store_si128((__m128i*)c, tmp); - a += 8; - c += 8; - } - + { + tmp = _mm_load_si128((__m128i*)a); + tmp = _mm_sign_epi8(tmp, conjugator); + _mm_store_si128((__m128i*)c, tmp); + a += 8; + c += 8; + } + for (unsigned int i = 0; i<(num_points % 8); ++i) - { - *c++ = lv_conj(*a++); - } - + { + *c++ = lv_conj(*a++); + } + } #endif /* LV_HAVE_SSSE3 */ @@ -264,31 +271,32 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_a_ssse3(lv_8sc_t* cVector, con \param aVector Vector to be conjugated \param num_points The number of unsigned char values in aVector to be conjugated and stored into cVector */ -static inline void volk_gnsssdr_8ic_conjugate_8ic_a_sse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){ +static inline void volk_gnsssdr_8ic_conjugate_8ic_a_sse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points) +{ const unsigned int sse_iters = num_points / 8; - + lv_8sc_t* c = cVector; const lv_8sc_t* a = aVector; __m128i tmp; - + __m128i conjugator1 = _mm_setr_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); __m128i conjugator2 = _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1); - + for (unsigned int i = 0; i < sse_iters; ++i) - { - tmp = _mm_load_si128((__m128i*)a); - tmp = _mm_xor_si128(tmp, conjugator1); - tmp = _mm_add_epi8(tmp, conjugator2); - _mm_store_si128((__m128i*)c, tmp); - a += 8; - c += 8; - } - + { + tmp = _mm_load_si128((__m128i*)a); + tmp = _mm_xor_si128(tmp, conjugator1); 
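+ // Clarifying note: XOR with conjugator1 (0xFF in each imaginary byte) flips
+ // every bit of the imaginary parts; adding conjugator2 (0x01 in the same
+ // bytes) then completes the two's complement negation, since
+ // -x == (x ^ 0xFF) + 1 for any 8-bit x.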
+ tmp = _mm_add_epi8(tmp, conjugator2); + _mm_store_si128((__m128i*)c, tmp); + a += 8; + c += 8; + } + for (unsigned int i = 0; i<(num_points % 8); ++i) - { - *c++ = lv_conj(*a++); - } - + { + *c++ = lv_conj(*a++); + } + } #endif /* LV_HAVE_SSE3 */ @@ -299,14 +307,16 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_a_sse3(lv_8sc_t* cVector, cons \param aVector Vector to be conjugated \param num_points The number of unsigned char values in aVector to be conjugated and stored into cVector */ -static inline void volk_gnsssdr_8ic_conjugate_8ic_a_generic(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){ +static inline void volk_gnsssdr_8ic_conjugate_8ic_a_generic(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points) +{ lv_8sc_t* cPtr = cVector; const lv_8sc_t* aPtr = aVector; unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = lv_conj(*aPtr++); - } + + for(number = 0; number < num_points; number++) + { + *cPtr++ = lv_conj(*aPtr++); + } } #endif /* LV_HAVE_GENERIC */ @@ -318,7 +328,8 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_a_generic(lv_8sc_t* cVector, c \param num_points The number of unsigned char values in aVector to be conjugated and stored into cVector */ extern void volk_gnsssdr_8ic_conjugate_8ic_a_orc_impl(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points); -static inline void volk_gnsssdr_8ic_conjugate_8ic_u_orc(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){ +static inline void volk_gnsssdr_8ic_conjugate_8ic_u_orc(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points) +{ volk_gnsssdr_8ic_conjugate_8ic_a_orc_impl(cVector, aVector, num_points); } #endif /* LV_HAVE_ORC */ diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_magnitude_squared_8i.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_magnitude_squared_8i.h index f7de8420f..de8ea80bb 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_magnitude_squared_8i.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_magnitude_squared_8i.h @@ -2,7 +2,7 @@ * \file volk_gnsssdr_8ic_magnitude_squared_8i.h * \brief Volk protokernel: calculates the magnitude squared of a 16 bits vector * \authors
- * Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+ * Andres Cecilia, 2014. a.cecilia.luque(at)gmail.com
 *
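 * For instance, the sample 3 + 4j produces 3*3 + 4*4 = 25. Keep in mind that
 * the output is 8 bits wide: only the low byte of each 16-bit sum survives,
 * so squared magnitudes above 127 are not representable.
 *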
* * Volk protokernel that calculates the magnitude squared of a @@ -21,7 +21,7 @@ * GNSS-SDR is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or - * at your option) any later version. + * (at your option) any later version. * * GNSS-SDR is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of @@ -49,56 +49,54 @@ \param magnitudeVector The vector containing the real output values \param num_points The number of complex values in complexVector to be calculated and stored into cVector */ -static inline void volk_gnsssdr_8ic_magnitude_squared_8i_u_sse3(char* magnitudeVector, const lv_8sc_t* complexVector, unsigned int num_points){ - +static inline void volk_gnsssdr_8ic_magnitude_squared_8i_u_sse3(char* magnitudeVector, const lv_8sc_t* complexVector, unsigned int num_points) +{ const unsigned int sse_iters = num_points / 16; - + const char* complexVectorPtr = (char*)complexVector; char* magnitudeVectorPtr = magnitudeVector; - + __m128i zero, result8; __m128i avector, avectorhi, avectorlo, avectorlomult, avectorhimult, aadded, maska; __m128i bvector, bvectorhi, bvectorlo, bvectorlomult, bvectorhimult, badded, maskb; - + zero = _mm_setzero_si128(); maska = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); maskb = _mm_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); - + for(unsigned int number = 0;number < sse_iters; number++) - { - avector = _mm_lddqu_si128((__m128i*)complexVectorPtr); - avectorlo = _mm_unpacklo_epi8 (avector, zero); - avectorhi = _mm_unpackhi_epi8 (avector, zero); - avectorlomult = _mm_mullo_epi16 (avectorlo, avectorlo); - avectorhimult = _mm_mullo_epi16 (avectorhi, avectorhi); - aadded = _mm_hadd_epi16 (avectorlomult, avectorhimult); - - complexVectorPtr += 16; - - bvector = _mm_lddqu_si128((__m128i*)complexVectorPtr); - bvectorlo = _mm_unpacklo_epi8 (bvector, zero); - bvectorhi = _mm_unpackhi_epi8 (bvector, zero); - bvectorlomult = _mm_mullo_epi16 (bvectorlo, bvectorlo); - bvectorhimult = _mm_mullo_epi16 (bvectorhi, bvectorhi); - badded = _mm_hadd_epi16 (bvectorlomult, bvectorhimult); - - complexVectorPtr += 16; - - result8 = _mm_or_si128(_mm_shuffle_epi8(aadded, maska), _mm_shuffle_epi8(badded, maskb)); - - _mm_storeu_si128((__m128i*)magnitudeVectorPtr, result8); - - magnitudeVectorPtr += 16; - - - } - + { + avector = _mm_lddqu_si128((__m128i*)complexVectorPtr); + avectorlo = _mm_unpacklo_epi8 (avector, zero); + avectorhi = _mm_unpackhi_epi8 (avector, zero); + avectorlomult = _mm_mullo_epi16 (avectorlo, avectorlo); + avectorhimult = _mm_mullo_epi16 (avectorhi, avectorhi); + aadded = _mm_hadd_epi16 (avectorlomult, avectorhimult); + + complexVectorPtr += 16; + + bvector = _mm_lddqu_si128((__m128i*)complexVectorPtr); + bvectorlo = _mm_unpacklo_epi8 (bvector, zero); + bvectorhi = _mm_unpackhi_epi8 (bvector, zero); + bvectorlomult = _mm_mullo_epi16 (bvectorlo, bvectorlo); + bvectorhimult = _mm_mullo_epi16 (bvectorhi, bvectorhi); + badded = _mm_hadd_epi16 (bvectorlomult, bvectorhimult); + + complexVectorPtr += 16; + + result8 = _mm_or_si128(_mm_shuffle_epi8(aadded, maska), _mm_shuffle_epi8(badded, maskb)); + + _mm_storeu_si128((__m128i*)magnitudeVectorPtr, result8); + + magnitudeVectorPtr += 16; + } + for (unsigned int i = 0; i<(num_points % 16); ++i) - { - const char valReal = *complexVectorPtr++; - const 
char valImag = *complexVectorPtr++; - *magnitudeVectorPtr++ = (valReal * valReal) + (valImag * valImag); - } + { + const char valReal = *complexVectorPtr++; + const char valImag = *complexVectorPtr++; + *magnitudeVectorPtr++ = (valReal * valReal) + (valImag * valImag); + } } #endif /* LV_HAVE_SSSE3 */ @@ -155,15 +153,17 @@ static inline void volk_gnsssdr_8ic_magnitude_squared_8i_u_sse3(char* magnitudeV \param magnitudeVector The vector containing the real output values \param num_points The number of complex values in complexVector to be calculated and stored into cVector */ -static inline void volk_gnsssdr_8ic_magnitude_squared_8i_generic(char* magnitudeVector, const lv_8sc_t* complexVector, unsigned int num_points){ +static inline void volk_gnsssdr_8ic_magnitude_squared_8i_generic(char* magnitudeVector, const lv_8sc_t* complexVector, unsigned int num_points) +{ const char* complexVectorPtr = (char*)complexVector; char* magnitudeVectorPtr = magnitudeVector; - - for(unsigned int number = 0; number < num_points; number++){ - const char real = *complexVectorPtr++; - const char imag = *complexVectorPtr++; - *magnitudeVectorPtr++ = (real*real) + (imag*imag); - } + + for(unsigned int number = 0; number < num_points; number++) + { + const char real = *complexVectorPtr++; + const char imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = (real*real) + (imag*imag); + } } #endif /* LV_HAVE_GENERIC */ @@ -185,56 +185,54 @@ static inline void volk_gnsssdr_8ic_magnitude_squared_8i_generic(char* magnitude \param magnitudeVector The vector containing the real output values \param num_points The number of complex values in complexVector to be calculated and stored into cVector */ -static inline void volk_gnsssdr_8ic_magnitude_squared_8i_a_sse3(char* magnitudeVector, const lv_8sc_t* complexVector, unsigned int num_points){ - +static inline void volk_gnsssdr_8ic_magnitude_squared_8i_a_sse3(char* magnitudeVector, const lv_8sc_t* complexVector, unsigned int num_points) +{ const unsigned int sse_iters = num_points / 16; - + const char* complexVectorPtr = (char*)complexVector; char* magnitudeVectorPtr = magnitudeVector; - + __m128i zero, result8; __m128i avector, avectorhi, avectorlo, avectorlomult, avectorhimult, aadded, maska; __m128i bvector, bvectorhi, bvectorlo, bvectorlomult, bvectorhimult, badded, maskb; - + zero = _mm_setzero_si128(); maska = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); maskb = _mm_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); - + for(unsigned int number = 0;number < sse_iters; number++) - { - avector = _mm_load_si128((__m128i*)complexVectorPtr); - avectorlo = _mm_unpacklo_epi8 (avector, zero); - avectorhi = _mm_unpackhi_epi8 (avector, zero); - avectorlomult = _mm_mullo_epi16 (avectorlo, avectorlo); - avectorhimult = _mm_mullo_epi16 (avectorhi, avectorhi); - aadded = _mm_hadd_epi16 (avectorlomult, avectorhimult); - - complexVectorPtr += 16; - - bvector = _mm_load_si128((__m128i*)complexVectorPtr); - bvectorlo = _mm_unpacklo_epi8 (bvector, zero); - bvectorhi = _mm_unpackhi_epi8 (bvector, zero); - bvectorlomult = _mm_mullo_epi16 (bvectorlo, bvectorlo); - bvectorhimult = _mm_mullo_epi16 (bvectorhi, bvectorhi); - badded = _mm_hadd_epi16 (bvectorlomult, bvectorhimult); - - complexVectorPtr += 16; - - result8 = _mm_or_si128(_mm_shuffle_epi8(aadded, maska), _mm_shuffle_epi8(badded, maskb)); - - _mm_store_si128((__m128i*)magnitudeVectorPtr, result8); - - magnitudeVectorPtr += 16; - - - } - + { + avector = 
_mm_load_si128((__m128i*)complexVectorPtr); + avectorlo = _mm_unpacklo_epi8 (avector, zero); + avectorhi = _mm_unpackhi_epi8 (avector, zero); + avectorlomult = _mm_mullo_epi16 (avectorlo, avectorlo); + avectorhimult = _mm_mullo_epi16 (avectorhi, avectorhi); + aadded = _mm_hadd_epi16 (avectorlomult, avectorhimult); + + complexVectorPtr += 16; + + bvector = _mm_load_si128((__m128i*)complexVectorPtr); + bvectorlo = _mm_unpacklo_epi8 (bvector, zero); + bvectorhi = _mm_unpackhi_epi8 (bvector, zero); + bvectorlomult = _mm_mullo_epi16 (bvectorlo, bvectorlo); + bvectorhimult = _mm_mullo_epi16 (bvectorhi, bvectorhi); + badded = _mm_hadd_epi16 (bvectorlomult, bvectorhimult); + + complexVectorPtr += 16; + + result8 = _mm_or_si128(_mm_shuffle_epi8(aadded, maska), _mm_shuffle_epi8(badded, maskb)); + + _mm_store_si128((__m128i*)magnitudeVectorPtr, result8); + + magnitudeVectorPtr += 16; + } + for (unsigned int i = 0; i<(num_points % 16); ++i) - { - const char valReal = *complexVectorPtr++; - const char valImag = *complexVectorPtr++; - *magnitudeVectorPtr++ = (valReal * valReal) + (valImag * valImag); - } + { + const char valReal = *complexVectorPtr++; + const char valImag = *complexVectorPtr++; + *magnitudeVectorPtr++ = (valReal * valReal) + (valImag * valImag); + } } #endif /* LV_HAVE_SSSE3 */ @@ -291,15 +289,17 @@ static inline void volk_gnsssdr_8ic_magnitude_squared_8i_a_sse3(char* magnitudeV \param magnitudeVector The vector containing the real output values \param num_points The number of complex values in complexVector to be calculated and stored into cVector */ -static inline void volk_gnsssdr_8ic_magnitude_squared_8i_a_generic(char* magnitudeVector, const lv_8sc_t* complexVector, unsigned int num_points){ +static inline void volk_gnsssdr_8ic_magnitude_squared_8i_a_generic(char* magnitudeVector, const lv_8sc_t* complexVector, unsigned int num_points) +{ const char* complexVectorPtr = (char*)complexVector; char* magnitudeVectorPtr = magnitudeVector; - - for(unsigned int number = 0; number < num_points; number++){ - const char real = *complexVectorPtr++; - const char imag = *complexVectorPtr++; - *magnitudeVectorPtr++ = (real*real) + (imag*imag); - } + + for(unsigned int number = 0; number < num_points; number++) + { + const char real = *complexVectorPtr++; + const char imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = (real*real) + (imag*imag); + } } #endif /* LV_HAVE_GENERIC */ @@ -311,7 +311,8 @@ static inline void volk_gnsssdr_8ic_magnitude_squared_8i_a_generic(char* magnitu \param num_points The number of complex values in complexVector to be calculated and stored into cVector */ extern void volk_gnsssdr_8ic_magnitude_squared_8i_a_orc_impl(char* magnitudeVector, const lv_8sc_t* complexVector, unsigned int num_points); -static inline void volk_gnsssdr_8ic_magnitude_squared_8i_u_orc(char* magnitudeVector, const lv_8sc_t* complexVector, unsigned int num_points){ +static inline void volk_gnsssdr_8ic_magnitude_squared_8i_u_orc(char* magnitudeVector, const lv_8sc_t* complexVector, unsigned int num_points) +{ volk_gnsssdr_8ic_magnitude_squared_8i_a_orc_impl(magnitudeVector, complexVector, num_points); } #endif /* LV_HAVE_ORC */ diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_s8ic_multiply_8ic.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_s8ic_multiply_8ic.h index ae96ed6c6..129796077 100644 --- 
a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_s8ic_multiply_8ic.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_s8ic_multiply_8ic.h @@ -2,7 +2,7 @@ * \file volk_gnsssdr_8ic_s8ic_multiply_8ic.h * \brief Volk protokernel: multiplies a group of 16 bits vectors by one constant vector * \authors
- * Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+ * Andres Cecilia, 2014. a.cecilia.luque(at)gmail.com
 *
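 * The product follows the usual complex rule (a + bj)(c + dj) =
 * (ac - bd) + (ad + bc)j; for instance, multiplying the sample 2 + 1j by the
 * scalar 0 - 1j yields 1 - 2j.
 *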
* * Volk protokernel that multiplies a group of 16 bits vectors @@ -20,7 +20,7 @@ * GNSS-SDR is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or - * at your option) any later version. + * (at your option) any later version. * * GNSS-SDR is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of @@ -50,54 +50,54 @@ \param scalar The complex scalar to multiply aVector \param num_points The number of complex values in aVector to be multiplied by sacalar and stored into cVector */ -static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_u_sse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t scalar, unsigned int num_points){ - +static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_u_sse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t scalar, unsigned int num_points) +{ const unsigned int sse_iters = num_points / 8; - + __m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc; - + lv_8sc_t* c = cVector; const lv_8sc_t* a = aVector; - + mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); - + y = _mm_set1_epi16 (*(short*)&scalar); imagy = _mm_srli_si128 (y, 1); imagy = _mm_and_si128 (imagy, mult1); realy = _mm_and_si128 (y, mult1); - - for(unsigned int number = 0;number < sse_iters; number++){ - - x = _mm_lddqu_si128((__m128i*)a); - - imagx = _mm_srli_si128 (x, 1); - imagx = _mm_and_si128 (imagx, mult1); - realx = _mm_and_si128 (x, mult1); - - realx_mult_realy = _mm_mullo_epi16 (realx, realy); - imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); - realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); - imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); - - realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); - realc = _mm_and_si128 (realc, mult1); - imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); - imagc = _mm_and_si128 (imagc, mult1); - imagc = _mm_slli_si128 (imagc, 1); - - totalc = _mm_or_si128 (realc, imagc); - - _mm_storeu_si128((__m128i*)c, totalc); - - a += 8; - c += 8; - } - + + for(unsigned int number = 0;number < sse_iters; number++) + { + x = _mm_lddqu_si128((__m128i*)a); + + imagx = _mm_srli_si128 (x, 1); + imagx = _mm_and_si128 (imagx, mult1); + realx = _mm_and_si128 (x, mult1); + + realx_mult_realy = _mm_mullo_epi16 (realx, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); + realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); + + realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + realc = _mm_and_si128 (realc, mult1); + imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + imagc = _mm_and_si128 (imagc, mult1); + imagc = _mm_slli_si128 (imagc, 1); + + totalc = _mm_or_si128 (realc, imagc); + + _mm_storeu_si128((__m128i*)c, totalc); + + a += 8; + c += 8; + } + for (unsigned int i = 0; i<(num_points % 8); ++i) - { - *c++ = (*a++) * scalar; - } - + { + *c++ = (*a++) * scalar; + } + } #endif /* LV_HAVE_SSE3 */ @@ -109,33 +109,34 @@ static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_u_sse3(lv_8sc_t* cVector, \param scalar The complex scalar to multiply aVector \param num_points The number of complex values in aVector to be multiplied by sacalar and stored into cVector */ -static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_generic(lv_8sc_t* cVector, const 
lv_8sc_t* aVector, const lv_8sc_t scalar, unsigned int num_points){ - +static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_generic(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t scalar, unsigned int num_points) +{ /*lv_8sc_t* cPtr = cVector; const lv_8sc_t* aPtr = aVector; - + for (int i = 0; i= 8){ - *cPtr++ = (*aPtr++) * scalar; - *cPtr++ = (*aPtr++) * scalar; - *cPtr++ = (*aPtr++) * scalar; - *cPtr++ = (*aPtr++) * scalar; - *cPtr++ = (*aPtr++) * scalar; - *cPtr++ = (*aPtr++) * scalar; - *cPtr++ = (*aPtr++) * scalar; - *cPtr++ = (*aPtr++) * scalar; - number -= 8; - } - + while (number >= 8) + { + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + number -= 8; + } + // clean up any remaining while (number-- > 0) *cPtr++ = *aPtr++ * scalar; @@ -162,54 +163,54 @@ static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_generic(lv_8sc_t* cVector, \param scalar The complex scalar to multiply aVector \param num_points The number of complex values in aVector to be multiplied by sacalar and stored into cVector */ -static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_a_sse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t scalar, unsigned int num_points){ - +static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_a_sse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t scalar, unsigned int num_points) +{ const unsigned int sse_iters = num_points / 8; - + __m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc; - + lv_8sc_t* c = cVector; const lv_8sc_t* a = aVector; - + mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); - + y = _mm_set1_epi16 (*(short*)&scalar); imagy = _mm_srli_si128 (y, 1); imagy = _mm_and_si128 (imagy, mult1); realy = _mm_and_si128 (y, mult1); - - for(unsigned int number = 0;number < sse_iters; number++){ - - x = _mm_load_si128((__m128i*)a); - - imagx = _mm_srli_si128 (x, 1); - imagx = _mm_and_si128 (imagx, mult1); - realx = _mm_and_si128 (x, mult1); - - realx_mult_realy = _mm_mullo_epi16 (realx, realy); - imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); - realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); - imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); - - realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); - realc = _mm_and_si128 (realc, mult1); - imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); - imagc = _mm_and_si128 (imagc, mult1); - imagc = _mm_slli_si128 (imagc, 1); - - totalc = _mm_or_si128 (realc, imagc); - - _mm_store_si128((__m128i*)c, totalc); - - a += 8; - c += 8; - } - + + for(unsigned int number = 0;number < sse_iters; number++) + { + x = _mm_load_si128((__m128i*)a); + + imagx = _mm_srli_si128 (x, 1); + imagx = _mm_and_si128 (imagx, mult1); + realx = _mm_and_si128 (x, mult1); + + realx_mult_realy = _mm_mullo_epi16 (realx, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); + realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); + + realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + realc = _mm_and_si128 (realc, mult1); + imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + imagc = _mm_and_si128 (imagc, mult1); + imagc = _mm_slli_si128 (imagc, 1); + + totalc = _mm_or_si128 (realc, imagc); + + 
_mm_store_si128((__m128i*)c, totalc); + + a += 8; + c += 8; + } + for (unsigned int i = 0; i<(num_points % 8); ++i) - { - *c++ = (*a++) * scalar; - } - + { + *c++ = (*a++) * scalar; + } + } #endif /* LV_HAVE_SSE3 */ @@ -221,33 +222,33 @@ static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_a_sse3(lv_8sc_t* cVector, \param scalar The complex scalar to multiply aVector \param num_points The number of complex values in aVector to be multiplied by sacalar and stored into cVector */ -static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_a_generic(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t scalar, unsigned int num_points){ - +static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_a_generic(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t scalar, unsigned int num_points) +{ /*lv_8sc_t* cPtr = cVector; const lv_8sc_t* aPtr = aVector; - + for (int i = 0; i= 8){ - *cPtr++ = (*aPtr++) * scalar; - *cPtr++ = (*aPtr++) * scalar; - *cPtr++ = (*aPtr++) * scalar; - *cPtr++ = (*aPtr++) * scalar; - *cPtr++ = (*aPtr++) * scalar; - *cPtr++ = (*aPtr++) * scalar; - *cPtr++ = (*aPtr++) * scalar; - *cPtr++ = (*aPtr++) * scalar; - number -= 8; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + number -= 8; } - + // clean up any remaining while (number-- > 0) *cPtr++ = *aPtr++ * scalar; @@ -263,7 +264,8 @@ static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_a_generic(lv_8sc_t* cVecto \param num_points The number of complex values in aVector to be multiplied by sacalar and stored into cVector */ extern void volk_gnsssdr_8ic_s8ic_multiply_8ic_a_orc_impl(lv_8sc_t* cVector, const lv_8sc_t* aVector, const char scalarreal, const char scalarimag, unsigned int num_points); -static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_u_orc(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t scalar, unsigned int num_points){ +static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_u_orc(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t scalar, unsigned int num_points) +{ volk_gnsssdr_8ic_s8ic_multiply_8ic_a_orc_impl(cVector, aVector, lv_creal(scalar), lv_cimag(scalar), num_points); } #endif /* LV_HAVE_ORC */ diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x2_dot_prod_8ic.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x2_dot_prod_8ic.h index ba416ed9b..5721ca9e7 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x2_dot_prod_8ic.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x2_dot_prod_8ic.h @@ -2,7 +2,7 @@ * \file volk_gnsssdr_8ic_x2_dot_prod_8ic.h * \brief Volk protokernel: multiplies two 16 bits vectors and accumulates them * \authors
- * Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+ * Andres Cecilia, 2014. a.cecilia.luque(at)gmail.com
 *
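 * In other words, result = sum over i of input[i] * taps[i]. A minimal usage
 * sketch (illustrative only; the buffer size and contents are made up):
 *
 *     lv_8sc_t in[16], taps[16], result;
 *     // ... fill in and taps ...
 *     volk_gnsssdr_8ic_x2_dot_prod_8ic_generic(&result, in, taps, 16);
 *
 * The accumulators are 8 bits wide, so they overflow quickly; num_points must
 * stay small for exact results.
 *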
* * Volk protokernel that multiplies two 16 bits vectors (8 bits the real part @@ -20,7 +20,7 @@ * GNSS-SDR is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or - * at your option) any later version. + * (at your option) any later version. * * GNSS-SDR is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of @@ -49,43 +49,45 @@ \param bVector One of the vectors to be multiplied and accumulated \param num_points The number of complex values in aVector and bVector to be multiplied together, accumulated and stored into cVector */ -static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_generic(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points) { - +static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_generic(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points) +{ /*lv_8sc_t* cPtr = result; const lv_8sc_t* aPtr = input; const lv_8sc_t* bPtr = taps; - + for(int number = 0; number < num_points; number++){ *cPtr += (*aPtr++) * (*bPtr++); }*/ - + char * res = (char*) result; char * in = (char*) input; char * tp = (char*) taps; unsigned int n_2_ccomplex_blocks = num_points/2; unsigned int isodd = num_points & 1; - + char sum0[2] = {0,0}; char sum1[2] = {0,0}; unsigned int i = 0; - - for(i = 0; i < n_2_ccomplex_blocks; ++i) { - sum0[0] += in[0] * tp[0] - in[1] * tp[1]; - sum0[1] += in[0] * tp[1] + in[1] * tp[0]; - sum1[0] += in[2] * tp[2] - in[3] * tp[3]; - sum1[1] += in[2] * tp[3] + in[3] * tp[2]; - - in += 4; - tp += 4; - } - + + for(i = 0; i < n_2_ccomplex_blocks; ++i) + { + sum0[0] += in[0] * tp[0] - in[1] * tp[1]; + sum0[1] += in[0] * tp[1] + in[1] * tp[0]; + sum1[0] += in[2] * tp[2] - in[3] * tp[3]; + sum1[1] += in[2] * tp[3] + in[3] * tp[2]; + + in += 4; + tp += 4; + } + res[0] = sum0[0] + sum1[0]; res[1] = sum0[1] + sum1[1]; - + // Cleanup if we had an odd number of points - for(i = 0; i < isodd; ++i) { - *result += input[num_points - 1] * taps[num_points - 1]; - } + for(i = 0; i < isodd; ++i) + { + *result += input[num_points - 1] * taps[num_points - 1]; + } } #endif /*LV_HAVE_GENERIC*/ @@ -99,73 +101,73 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_generic(lv_8sc_t* result, co \param bVector One of the vectors to be multiplied and accumulated \param num_points The number of complex values in aVector and bVector to be multiplied together, accumulated and stored into cVector */ -static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_sse2(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points) { - +static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_sse2(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points) +{ lv_8sc_t dotProduct; memset(&dotProduct, 0x0, 2*sizeof(char)); - + const lv_8sc_t* a = input; const lv_8sc_t* b = taps; - + const unsigned int sse_iters = num_points/8; - + if (sse_iters>0) - { - __m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc, realcacc, imagcacc; - - mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); - realcacc = _mm_setzero_si128(); - imagcacc = _mm_setzero_si128(); - - for(unsigned int number = 0; number < sse_iters; number++){ - - x = _mm_loadu_si128((__m128i*)a); - y = _mm_loadu_si128((__m128i*)b); - 
- imagx = _mm_srli_si128 (x, 1); - imagx = _mm_and_si128 (imagx, mult1); - realx = _mm_and_si128 (x, mult1); - - imagy = _mm_srli_si128 (y, 1); - imagy = _mm_and_si128 (imagy, mult1); - realy = _mm_and_si128 (y, mult1); - - realx_mult_realy = _mm_mullo_epi16 (realx, realy); - imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); - realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); - imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); - - realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); - imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); - - realcacc = _mm_add_epi16 (realcacc, realc); - imagcacc = _mm_add_epi16 (imagcacc, imagc); - - a += 8; - b += 8; - } - - realcacc = _mm_and_si128 (realcacc, mult1); - imagcacc = _mm_and_si128 (imagcacc, mult1); - imagcacc = _mm_slli_si128 (imagcacc, 1); - - totalc = _mm_or_si128 (realcacc, imagcacc); - - __VOLK_ATTR_ALIGNED(16) lv_8sc_t dotProductVector[8]; - - _mm_storeu_si128((__m128i*)dotProductVector,totalc); // Store the results back into the dot product vector - - for (int i = 0; i<8; ++i) { - dotProduct += dotProductVector[i]; + __m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc, realcacc, imagcacc; + + mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); + realcacc = _mm_setzero_si128(); + imagcacc = _mm_setzero_si128(); + + for(unsigned int number = 0; number < sse_iters; number++) + { + x = _mm_loadu_si128((__m128i*)a); + y = _mm_loadu_si128((__m128i*)b); + + imagx = _mm_srli_si128 (x, 1); + imagx = _mm_and_si128 (imagx, mult1); + realx = _mm_and_si128 (x, mult1); + + imagy = _mm_srli_si128 (y, 1); + imagy = _mm_and_si128 (imagy, mult1); + realy = _mm_and_si128 (y, mult1); + + realx_mult_realy = _mm_mullo_epi16 (realx, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); + realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); + + realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + realcacc = _mm_add_epi16 (realcacc, realc); + imagcacc = _mm_add_epi16 (imagcacc, imagc); + + a += 8; + b += 8; + } + + realcacc = _mm_and_si128 (realcacc, mult1); + imagcacc = _mm_and_si128 (imagcacc, mult1); + imagcacc = _mm_slli_si128 (imagcacc, 1); + + totalc = _mm_or_si128 (realcacc, imagcacc); + + __VOLK_ATTR_ALIGNED(16) lv_8sc_t dotProductVector[8]; + + _mm_storeu_si128((__m128i*)dotProductVector,totalc); // Store the results back into the dot product vector + + for (int i = 0; i<8; ++i) + { + dotProduct += dotProductVector[i]; + } } - } - + for (unsigned int i = 0; i<(num_points % 8); ++i) - { - dotProduct += (*a++) * (*b++); - } - + { + dotProduct += (*a++) * (*b++); + } + *result = dotProduct; } @@ -180,71 +182,71 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_sse2(lv_8sc_t* result, con \param bVector One of the vectors to be multiplied and accumulated \param num_points The number of complex values in aVector and bVector to be multiplied together, accumulated and stored into cVector */ -static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_sse4_1(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points) { - +static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_sse4_1(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points) +{ lv_8sc_t dotProduct; memset(&dotProduct, 0x0, 2*sizeof(char)); - + const lv_8sc_t* a = input; 
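 // Strategy of the vectorized path below (a clarifying note): mult1 keeps the
 // low byte of every 16-bit lane, so an AND extracts the real parts, and a
 // one-byte right shift followed by the same AND extracts the imaginary parts;
 // the four cross products are then formed with 16-bit lane multiplies and
 // accumulated, and _mm_blendv_epi8 re-packs the result at the end.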
const lv_8sc_t* b = taps; - + const unsigned int sse_iters = num_points/8; - + if (sse_iters>0) - { - __m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc, realcacc, imagcacc; - - mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); - realcacc = _mm_setzero_si128(); - imagcacc = _mm_setzero_si128(); - - for(unsigned int number = 0; number < sse_iters; number++){ - - x = _mm_lddqu_si128((__m128i*)a); - y = _mm_lddqu_si128((__m128i*)b); - - imagx = _mm_srli_si128 (x, 1); - imagx = _mm_and_si128 (imagx, mult1); - realx = _mm_and_si128 (x, mult1); - - imagy = _mm_srli_si128 (y, 1); - imagy = _mm_and_si128 (imagy, mult1); - realy = _mm_and_si128 (y, mult1); - - realx_mult_realy = _mm_mullo_epi16 (realx, realy); - imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); - realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); - imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); - - realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); - imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); - - realcacc = _mm_add_epi16 (realcacc, realc); - imagcacc = _mm_add_epi16 (imagcacc, imagc); - - a += 8; - b += 8; - } - - imagcacc = _mm_slli_si128 (imagcacc, 1); - - totalc = _mm_blendv_epi8 (imagcacc, realcacc, mult1); - - __VOLK_ATTR_ALIGNED(16) lv_8sc_t dotProductVector[8]; - - _mm_storeu_si128((__m128i*)dotProductVector,totalc); // Store the results back into the dot product vector - - for (unsigned int i = 0; i<8; ++i) { - dotProduct += dotProductVector[i]; + __m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc, realcacc, imagcacc; + + mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); + realcacc = _mm_setzero_si128(); + imagcacc = _mm_setzero_si128(); + + for(unsigned int number = 0; number < sse_iters; number++) + { + x = _mm_lddqu_si128((__m128i*)a); + y = _mm_lddqu_si128((__m128i*)b); + + imagx = _mm_srli_si128 (x, 1); + imagx = _mm_and_si128 (imagx, mult1); + realx = _mm_and_si128 (x, mult1); + + imagy = _mm_srli_si128 (y, 1); + imagy = _mm_and_si128 (imagy, mult1); + realy = _mm_and_si128 (y, mult1); + + realx_mult_realy = _mm_mullo_epi16 (realx, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); + realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); + + realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + realcacc = _mm_add_epi16 (realcacc, realc); + imagcacc = _mm_add_epi16 (imagcacc, imagc); + + a += 8; + b += 8; + } + + imagcacc = _mm_slli_si128 (imagcacc, 1); + + totalc = _mm_blendv_epi8 (imagcacc, realcacc, mult1); + + __VOLK_ATTR_ALIGNED(16) lv_8sc_t dotProductVector[8]; + + _mm_storeu_si128((__m128i*)dotProductVector,totalc); // Store the results back into the dot product vector + + for (unsigned int i = 0; i<8; ++i) + { + dotProduct += dotProductVector[i]; + } } - } - + for (unsigned int i = 0; i<(num_points % 8); ++i) - { - dotProduct += (*a++) * (*b++); - } - + { + dotProduct += (*a++) * (*b++); + } + *result = dotProduct; } @@ -270,43 +272,45 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_sse4_1(lv_8sc_t* result, c \param bVector One of the vectors to be multiplied and accumulated \param num_points The number of complex values in aVector and bVector to be multiplied together, accumulated and stored 
into cVector */ -static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_generic(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points) { - +static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_generic(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points) +{ /*lv_8sc_t* cPtr = result; const lv_8sc_t* aPtr = input; const lv_8sc_t* bPtr = taps; - + for(int number = 0; number < num_points; number++){ *cPtr += (*aPtr++) * (*bPtr++); }*/ - + char * res = (char*) result; char * in = (char*) input; char * tp = (char*) taps; unsigned int n_2_ccomplex_blocks = num_points/2; unsigned int isodd = num_points & 1; - + char sum0[2] = {0,0}; char sum1[2] = {0,0}; unsigned int i = 0; - - for(i = 0; i < n_2_ccomplex_blocks; ++i) { - sum0[0] += in[0] * tp[0] - in[1] * tp[1]; - sum0[1] += in[0] * tp[1] + in[1] * tp[0]; - sum1[0] += in[2] * tp[2] - in[3] * tp[3]; - sum1[1] += in[2] * tp[3] + in[3] * tp[2]; - - in += 4; - tp += 4; - } - + + for(i = 0; i < n_2_ccomplex_blocks; ++i) + { + sum0[0] += in[0] * tp[0] - in[1] * tp[1]; + sum0[1] += in[0] * tp[1] + in[1] * tp[0]; + sum1[0] += in[2] * tp[2] - in[3] * tp[3]; + sum1[1] += in[2] * tp[3] + in[3] * tp[2]; + + in += 4; + tp += 4; + } + res[0] = sum0[0] + sum1[0]; res[1] = sum0[1] + sum1[1]; - + // Cleanup if we had an odd number of points - for(i = 0; i < isodd; ++i) { - *result += input[num_points - 1] * taps[num_points - 1]; - } + for(i = 0; i < isodd; ++i) + { + *result += input[num_points - 1] * taps[num_points - 1]; + } } #endif /*LV_HAVE_GENERIC*/ @@ -320,73 +324,73 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_generic(lv_8sc_t* result, \param bVector One of the vectors to be multiplied and accumulated \param num_points The number of complex values in aVector and bVector to be multiplied together, accumulated and stored into cVector */ -static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_sse2(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points) { - +static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_sse2(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points) +{ lv_8sc_t dotProduct; memset(&dotProduct, 0x0, 2*sizeof(char)); - + const lv_8sc_t* a = input; const lv_8sc_t* b = taps; - + const unsigned int sse_iters = num_points/8; - + if (sse_iters>0) - { - __m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc, realcacc, imagcacc; - - mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); - realcacc = _mm_setzero_si128(); - imagcacc = _mm_setzero_si128(); - - for(unsigned int number = 0; number < sse_iters; number++){ - - x = _mm_load_si128((__m128i*)a); - y = _mm_load_si128((__m128i*)b); - - imagx = _mm_srli_si128 (x, 1); - imagx = _mm_and_si128 (imagx, mult1); - realx = _mm_and_si128 (x, mult1); - - imagy = _mm_srli_si128 (y, 1); - imagy = _mm_and_si128 (imagy, mult1); - realy = _mm_and_si128 (y, mult1); - - realx_mult_realy = _mm_mullo_epi16 (realx, realy); - imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); - realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); - imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); - - realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); - imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); - - realcacc = _mm_add_epi16 (realcacc, realc); - imagcacc = _mm_add_epi16 (imagcacc, imagc); - - a += 8; - b += 8; - } - - realcacc = _mm_and_si128 
(realcacc, mult1); - imagcacc = _mm_and_si128 (imagcacc, mult1); - imagcacc = _mm_slli_si128 (imagcacc, 1); - - totalc = _mm_or_si128 (realcacc, imagcacc); - - __VOLK_ATTR_ALIGNED(16) lv_8sc_t dotProductVector[8]; - - _mm_store_si128((__m128i*)dotProductVector,totalc); // Store the results back into the dot product vector - - for (unsigned int i = 0; i<8; ++i) { - dotProduct += dotProductVector[i]; + __m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc, realcacc, imagcacc; + + mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); + realcacc = _mm_setzero_si128(); + imagcacc = _mm_setzero_si128(); + + for(unsigned int number = 0; number < sse_iters; number++) + { + x = _mm_load_si128((__m128i*)a); + y = _mm_load_si128((__m128i*)b); + + imagx = _mm_srli_si128 (x, 1); + imagx = _mm_and_si128 (imagx, mult1); + realx = _mm_and_si128 (x, mult1); + + imagy = _mm_srli_si128 (y, 1); + imagy = _mm_and_si128 (imagy, mult1); + realy = _mm_and_si128 (y, mult1); + + realx_mult_realy = _mm_mullo_epi16 (realx, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); + realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); + + realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + realcacc = _mm_add_epi16 (realcacc, realc); + imagcacc = _mm_add_epi16 (imagcacc, imagc); + + a += 8; + b += 8; + } + + realcacc = _mm_and_si128 (realcacc, mult1); + imagcacc = _mm_and_si128 (imagcacc, mult1); + imagcacc = _mm_slli_si128 (imagcacc, 1); + + totalc = _mm_or_si128 (realcacc, imagcacc); + + __VOLK_ATTR_ALIGNED(16) lv_8sc_t dotProductVector[8]; + + _mm_store_si128((__m128i*)dotProductVector,totalc); // Store the results back into the dot product vector + + for (unsigned int i = 0; i<8; ++i) + { + dotProduct += dotProductVector[i]; + } } - } - + for (unsigned int i = 0; i<(num_points % 8); ++i) - { - dotProduct += (*a++) * (*b++); - } - + { + dotProduct += (*a++) * (*b++); + } + *result = dotProduct; } @@ -401,71 +405,71 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_sse2(lv_8sc_t* result, con \param bVector One of the vectors to be multiplied and accumulated \param num_points The number of complex values in aVector and bVector to be multiplied together, accumulated and stored into cVector */ -static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_sse4_1(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points) { - +static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_sse4_1(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points) +{ lv_8sc_t dotProduct; memset(&dotProduct, 0x0, 2*sizeof(char)); - + const lv_8sc_t* a = input; const lv_8sc_t* b = taps; - + const unsigned int sse_iters = num_points/8; - + if (sse_iters>0) - { - __m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc, realcacc, imagcacc; - - mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); - realcacc = _mm_setzero_si128(); - imagcacc = _mm_setzero_si128(); - - for(unsigned int number = 0; number < sse_iters; number++){ - - x = _mm_load_si128((__m128i*)a); - y = _mm_load_si128((__m128i*)b); - - imagx = _mm_srli_si128 (x, 1); - imagx = _mm_and_si128 (imagx, mult1); - realx = _mm_and_si128 (x, mult1); - - imagy = _mm_srli_si128 
(y, 1); - imagy = _mm_and_si128 (imagy, mult1); - realy = _mm_and_si128 (y, mult1); - - realx_mult_realy = _mm_mullo_epi16 (realx, realy); - imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); - realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); - imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); - - realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); - imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); - - realcacc = _mm_add_epi16 (realcacc, realc); - imagcacc = _mm_add_epi16 (imagcacc, imagc); - - a += 8; - b += 8; - } - - imagcacc = _mm_slli_si128 (imagcacc, 1); - - totalc = _mm_blendv_epi8 (imagcacc, realcacc, mult1); - - __VOLK_ATTR_ALIGNED(16) lv_8sc_t dotProductVector[8]; - - _mm_store_si128((__m128i*)dotProductVector,totalc); // Store the results back into the dot product vector - - for (unsigned int i = 0; i<8; ++i) { - dotProduct += dotProductVector[i]; + __m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc, realcacc, imagcacc; + + mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); + realcacc = _mm_setzero_si128(); + imagcacc = _mm_setzero_si128(); + + for(unsigned int number = 0; number < sse_iters; number++) + { + x = _mm_load_si128((__m128i*)a); + y = _mm_load_si128((__m128i*)b); + + imagx = _mm_srli_si128 (x, 1); + imagx = _mm_and_si128 (imagx, mult1); + realx = _mm_and_si128 (x, mult1); + + imagy = _mm_srli_si128 (y, 1); + imagy = _mm_and_si128 (imagy, mult1); + realy = _mm_and_si128 (y, mult1); + + realx_mult_realy = _mm_mullo_epi16 (realx, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); + realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); + + realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + realcacc = _mm_add_epi16 (realcacc, realc); + imagcacc = _mm_add_epi16 (imagcacc, imagc); + + a += 8; + b += 8; + } + + imagcacc = _mm_slli_si128 (imagcacc, 1); + + totalc = _mm_blendv_epi8 (imagcacc, realcacc, mult1); + + __VOLK_ATTR_ALIGNED(16) lv_8sc_t dotProductVector[8]; + + _mm_store_si128((__m128i*)dotProductVector,totalc); // Store the results back into the dot product vector + + for (unsigned int i = 0; i<8; ++i) + { + dotProduct += dotProductVector[i]; + } } - } - + for (unsigned int i = 0; i<(num_points % 8); ++i) - { - dotProduct += (*a++) * (*b++); - } - + { + dotProduct += (*a++) * (*b++); + } + *result = dotProduct; } @@ -480,18 +484,18 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_sse4_1(lv_8sc_t* result, c \param num_points The number of complex values in aVector and bVector to be multiplied together, accumulated and stored into cVector */ extern void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_orc_impl(short* resRealShort, short* resImagShort, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points); -static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_orc(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points){ - +static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_orc(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points) +{ short resReal = 0; char* resRealChar = (char*)&resReal; resRealChar++; - + short resImag = 0; char* resImagChar = (char*)&resImag; resImagChar++; - + volk_gnsssdr_8ic_x2_dot_prod_8ic_a_orc_impl(&resReal, &resImag, input, taps, num_points); - + *result = lv_cmake(*resRealChar, 
*resImagChar); } #endif /* LV_HAVE_ORC */ diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x2_multiply_8ic.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x2_multiply_8ic.h index b5c551b93..1cc37d09b 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x2_multiply_8ic.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x2_multiply_8ic.h @@ -2,7 +2,7 @@ * \file volk_gnsssdr_8ic_x2_multiply_8ic.h * \brief Volk protokernel: multiplies two 16 bits vectors * \authors
- *          <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+ *          <li> Andres Cecilia, 2014. a.cecilia.luque(at)gmail.com
 *          </ul>
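 *
 * Implementation note: every SIMD version below isolates the real and
 * imaginary 8-bit lanes with the 0x00FF byte mask (mult1) and a one-byte
 * shift, computes the four partial products with _mm_mullo_epi16, and then
 * recombines them into the interleaved complex result.
 *
 * A minimal usage sketch (illustrative only: buffer size and contents are
 * hypothetical; the kernel called is the generic one defined in this file):
 * \code
 * lv_8sc_t a[64], b[64], c[64];
 * // ... fill a and b with interleaved 8-bit complex samples ...
 * volk_gnsssdr_8ic_x2_multiply_8ic_generic(c, a, b, 64);
 * \endcode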
* * Volk protokernel that multiplies two 16 bits vectors (8 bits the real part @@ -20,7 +20,7 @@ * GNSS-SDR is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or - * at your option) any later version. + * (at your option) any later version. * * GNSS-SDR is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of @@ -49,54 +49,54 @@ \param bVector One of the vectors to be multiplied \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector */ -static inline void volk_gnsssdr_8ic_x2_multiply_8ic_u_sse2(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ - +static inline void volk_gnsssdr_8ic_x2_multiply_8ic_u_sse2(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points) +{ const unsigned int sse_iters = num_points / 8; - + __m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc; lv_8sc_t* c = cVector; const lv_8sc_t* a = aVector; const lv_8sc_t* b = bVector; - + mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); - - for(unsigned int number = 0;number < sse_iters; number++){ - - x = _mm_loadu_si128((__m128i*)a); - y = _mm_loadu_si128((__m128i*)b); - - imagx = _mm_srli_si128 (x, 1); - imagx = _mm_and_si128 (imagx, mult1); - realx = _mm_and_si128 (x, mult1); - - imagy = _mm_srli_si128 (y, 1); - imagy = _mm_and_si128 (imagy, mult1); - realy = _mm_and_si128 (y, mult1); - - realx_mult_realy = _mm_mullo_epi16 (realx, realy); - imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); - realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); - imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); - - realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); - realc = _mm_and_si128 (realc, mult1); - imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); - imagc = _mm_and_si128 (imagc, mult1); - imagc = _mm_slli_si128 (imagc, 1); - - totalc = _mm_or_si128 (realc, imagc); - - _mm_storeu_si128((__m128i*)c, totalc); - - a += 8; - b += 8; - c += 8; - } - + + for(unsigned int number = 0;number < sse_iters; number++) + { + x = _mm_loadu_si128((__m128i*)a); + y = _mm_loadu_si128((__m128i*)b); + + imagx = _mm_srli_si128 (x, 1); + imagx = _mm_and_si128 (imagx, mult1); + realx = _mm_and_si128 (x, mult1); + + imagy = _mm_srli_si128 (y, 1); + imagy = _mm_and_si128 (imagy, mult1); + realy = _mm_and_si128 (y, mult1); + + realx_mult_realy = _mm_mullo_epi16 (realx, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); + realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); + + realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + realc = _mm_and_si128 (realc, mult1); + imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + imagc = _mm_and_si128 (imagc, mult1); + imagc = _mm_slli_si128 (imagc, 1); + + totalc = _mm_or_si128 (realc, imagc); + + _mm_storeu_si128((__m128i*)c, totalc); + + a += 8; + b += 8; + c += 8; + } + for (unsigned int i = 0; i<(num_points % 8); ++i) - { - *c++ = (*a++) * (*b++); - } + { + *c++ = (*a++) * (*b++); + } } #endif /* LV_HAVE_SSE2 */ @@ -109,54 +109,54 @@ static inline void volk_gnsssdr_8ic_x2_multiply_8ic_u_sse2(lv_8sc_t* cVector, co \param bVector One of the vectors to be 
multiplied \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector */ -static inline void volk_gnsssdr_8ic_x2_multiply_8ic_u_sse4_1(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ - +static inline void volk_gnsssdr_8ic_x2_multiply_8ic_u_sse4_1(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points) +{ const unsigned int sse_iters = num_points / 8; - + __m128i x, y, zero; __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc; lv_8sc_t* c = cVector; const lv_8sc_t* a = aVector; const lv_8sc_t* b = bVector; - + zero = _mm_setzero_si128(); mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); - - for(unsigned int number = 0;number < sse_iters; number++){ - - x = _mm_lddqu_si128((__m128i*)a); - y = _mm_lddqu_si128((__m128i*)b); - - imagx = _mm_srli_si128 (x, 1); - imagx = _mm_and_si128 (imagx, mult1); - realx = _mm_and_si128 (x, mult1); - - imagy = _mm_srli_si128 (y, 1); - imagy = _mm_and_si128 (imagy, mult1); - realy = _mm_and_si128 (y, mult1); - - realx_mult_realy = _mm_mullo_epi16 (realx, realy); - imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); - realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); - imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); - - realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); - imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); - imagc = _mm_slli_si128 (imagc, 1); - - totalc = _mm_blendv_epi8 (imagc, realc, mult1); - - _mm_storeu_si128((__m128i*)c, totalc); - - a += 8; - b += 8; - c += 8; - } - + + for(unsigned int number = 0;number < sse_iters; number++) + { + x = _mm_lddqu_si128((__m128i*)a); + y = _mm_lddqu_si128((__m128i*)b); + + imagx = _mm_srli_si128 (x, 1); + imagx = _mm_and_si128 (imagx, mult1); + realx = _mm_and_si128 (x, mult1); + + imagy = _mm_srli_si128 (y, 1); + imagy = _mm_and_si128 (imagy, mult1); + realy = _mm_and_si128 (y, mult1); + + realx_mult_realy = _mm_mullo_epi16 (realx, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); + realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); + + realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + imagc = _mm_slli_si128 (imagc, 1); + + totalc = _mm_blendv_epi8 (imagc, realc, mult1); + + _mm_storeu_si128((__m128i*)c, totalc); + + a += 8; + b += 8; + c += 8; + } + for (unsigned int i = 0; i<(num_points % 8); ++i) - { - *c++ = (*a++) * (*b++); - } + { + *c++ = (*a++) * (*b++); + } } #endif /* LV_HAVE_SSE4_1 */ @@ -168,14 +168,16 @@ static inline void volk_gnsssdr_8ic_x2_multiply_8ic_u_sse4_1(lv_8sc_t* cVector, \param bVector One of the vectors to be multiplied \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector */ -static inline void volk_gnsssdr_8ic_x2_multiply_8ic_generic(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ +static inline void volk_gnsssdr_8ic_x2_multiply_8ic_generic(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points) +{ lv_8sc_t* cPtr = cVector; const lv_8sc_t* aPtr = aVector; const lv_8sc_t* bPtr = bVector; - - for(unsigned int number = 0; number < num_points; number++){ - *cPtr++ = (*aPtr++) * (*bPtr++); - } + + for(unsigned int 
number = 0; number < num_points; number++) + { + *cPtr++ = (*aPtr++) * (*bPtr++); + } } #endif /* LV_HAVE_GENERIC */ @@ -198,54 +200,54 @@ static inline void volk_gnsssdr_8ic_x2_multiply_8ic_generic(lv_8sc_t* cVector, c \param bVector One of the vectors to be multiplied \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector */ -static inline void volk_gnsssdr_8ic_x2_multiply_8ic_a_sse2(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ - +static inline void volk_gnsssdr_8ic_x2_multiply_8ic_a_sse2(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points) +{ const unsigned int sse_iters = num_points / 8; - + __m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc; lv_8sc_t* c = cVector; const lv_8sc_t* a = aVector; const lv_8sc_t* b = bVector; - + mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); - - for(unsigned int number = 0;number < sse_iters; number++){ - - x = _mm_load_si128((__m128i*)a); - y = _mm_load_si128((__m128i*)b); - - imagx = _mm_srli_si128 (x, 1); - imagx = _mm_and_si128 (imagx, mult1); - realx = _mm_and_si128 (x, mult1); - - imagy = _mm_srli_si128 (y, 1); - imagy = _mm_and_si128 (imagy, mult1); - realy = _mm_and_si128 (y, mult1); - - realx_mult_realy = _mm_mullo_epi16 (realx, realy); - imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); - realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); - imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); - - realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); - realc = _mm_and_si128 (realc, mult1); - imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); - imagc = _mm_and_si128 (imagc, mult1); - imagc = _mm_slli_si128 (imagc, 1); - - totalc = _mm_or_si128 (realc, imagc); - - _mm_store_si128((__m128i*)c, totalc); - - a += 8; - b += 8; - c += 8; - } - + + for(unsigned int number = 0;number < sse_iters; number++) + { + x = _mm_load_si128((__m128i*)a); + y = _mm_load_si128((__m128i*)b); + + imagx = _mm_srli_si128 (x, 1); + imagx = _mm_and_si128 (imagx, mult1); + realx = _mm_and_si128 (x, mult1); + + imagy = _mm_srli_si128 (y, 1); + imagy = _mm_and_si128 (imagy, mult1); + realy = _mm_and_si128 (y, mult1); + + realx_mult_realy = _mm_mullo_epi16 (realx, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); + realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); + + realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + realc = _mm_and_si128 (realc, mult1); + imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + imagc = _mm_and_si128 (imagc, mult1); + imagc = _mm_slli_si128 (imagc, 1); + + totalc = _mm_or_si128 (realc, imagc); + + _mm_store_si128((__m128i*)c, totalc); + + a += 8; + b += 8; + c += 8; + } + for (unsigned int i = 0; i<(num_points % 8); ++i) - { - *c++ = (*a++) * (*b++); - } + { + *c++ = (*a++) * (*b++); + } } #endif /* LV_HAVE_SSE2 */ @@ -258,54 +260,54 @@ static inline void volk_gnsssdr_8ic_x2_multiply_8ic_a_sse2(lv_8sc_t* cVector, co \param bVector One of the vectors to be multiplied \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector */ -static inline void volk_gnsssdr_8ic_x2_multiply_8ic_a_sse4_1(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ - +static inline void 
volk_gnsssdr_8ic_x2_multiply_8ic_a_sse4_1(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points) +{ const unsigned int sse_iters = num_points / 8; - + __m128i x, y, zero; __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc; lv_8sc_t* c = cVector; const lv_8sc_t* a = aVector; const lv_8sc_t* b = bVector; - + zero = _mm_setzero_si128(); mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); - - for(unsigned int number = 0;number < sse_iters; number++){ - - x = _mm_load_si128((__m128i*)a); - y = _mm_load_si128((__m128i*)b); - - imagx = _mm_srli_si128 (x, 1); - imagx = _mm_and_si128 (imagx, mult1); - realx = _mm_and_si128 (x, mult1); - - imagy = _mm_srli_si128 (y, 1); - imagy = _mm_and_si128 (imagy, mult1); - realy = _mm_and_si128 (y, mult1); - - realx_mult_realy = _mm_mullo_epi16 (realx, realy); - imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); - realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); - imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); - - realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); - imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); - imagc = _mm_slli_si128 (imagc, 1); - - totalc = _mm_blendv_epi8 (imagc, realc, mult1); - - _mm_store_si128((__m128i*)c, totalc); - - a += 8; - b += 8; - c += 8; - } - + + for(unsigned int number = 0;number < sse_iters; number++) + { + x = _mm_load_si128((__m128i*)a); + y = _mm_load_si128((__m128i*)b); + + imagx = _mm_srli_si128 (x, 1); + imagx = _mm_and_si128 (imagx, mult1); + realx = _mm_and_si128 (x, mult1); + + imagy = _mm_srli_si128 (y, 1); + imagy = _mm_and_si128 (imagy, mult1); + realy = _mm_and_si128 (y, mult1); + + realx_mult_realy = _mm_mullo_epi16 (realx, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); + realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); + + realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + imagc = _mm_slli_si128 (imagc, 1); + + totalc = _mm_blendv_epi8 (imagc, realc, mult1); + + _mm_store_si128((__m128i*)c, totalc); + + a += 8; + b += 8; + c += 8; + } + for (unsigned int i = 0; i<(num_points % 8); ++i) - { - *c++ = (*a++) * (*b++); - } + { + *c++ = (*a++) * (*b++); + } } #endif /* LV_HAVE_SSE4_1 */ @@ -317,15 +319,17 @@ static inline void volk_gnsssdr_8ic_x2_multiply_8ic_a_sse4_1(lv_8sc_t* cVector, \param bVector One of the vectors to be multiplied \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector */ -static inline void volk_gnsssdr_8ic_x2_multiply_8ic_a_generic(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ +static inline void volk_gnsssdr_8ic_x2_multiply_8ic_a_generic(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points) +{ lv_8sc_t* cPtr = cVector; const lv_8sc_t* aPtr = aVector; const lv_8sc_t* bPtr = bVector; - - for(unsigned int number = 0; number < num_points; number++){ - *cPtr++ = (*aPtr++) * (*bPtr++); - } - + + for(unsigned int number = 0; number < num_points; number++) + { + *cPtr++ = (*aPtr++) * (*bPtr++); + } + } #endif /* LV_HAVE_GENERIC */ @@ -338,7 +342,8 @@ static inline void volk_gnsssdr_8ic_x2_multiply_8ic_a_generic(lv_8sc_t* cVector, \param num_points The number of complex values in aVector and bVector to be multiplied together 
and stored into cVector */ extern void volk_gnsssdr_8ic_x2_multiply_8ic_a_orc_impl(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points); -static inline void volk_gnsssdr_8ic_x2_multiply_8ic_u_orc(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ +static inline void volk_gnsssdr_8ic_x2_multiply_8ic_u_orc(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points) +{ volk_gnsssdr_8ic_x2_multiply_8ic_a_orc_impl(cVector, aVector, bVector, num_points); } #endif /* LV_HAVE_ORC */ diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3.h index 134f24d99..490d4657d 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3.h @@ -2,7 +2,7 @@ * \file volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3.h * \brief Volk protokernel: performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation with 16 bits vectors, and accumulates the results into float32. * \authors
- *          <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+ *          <li> Andres Cecilia, 2014. a.cecilia.luque(at)gmail.com
 *          </ul>
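 *
 * A minimal usage sketch (illustrative only: vector length and contents are
 * hypothetical; the call is to the generic protokernel defined in this file):
 * \code
 * lv_8sc_t in[1000], carr[1000], early[1000], prompt[1000], late[1000];
 * lv_32fc_t E, P, L;
 * // ... fill the input samples, the local carrier and the E/P/L code replicas ...
 * volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_generic(&E, &P, &L, in, carr, early, prompt, late, 1000);
 * \endcode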
* * Volk protokernel that performs the carrier wipe-off mixing and the @@ -31,7 +31,7 @@ * GNSS-SDR is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or - * at your option) any later version. + * (at your option) any later version. * * GNSS-SDR is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of @@ -57,6 +57,7 @@ #include #include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h" #include "CommonMacros/CommonMacros.h" + /*! \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation \param input The input signal input @@ -72,101 +73,101 @@ static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_u_sse4_1(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points) { const unsigned int sse_iters = num_points / 8; - + __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample; __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output; - + __m128 E_code_acc, P_code_acc, L_code_acc; __m128i input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2; __m128 output_ps; - + const lv_8sc_t* input_ptr = input; const lv_8sc_t* carrier_ptr = carrier; - + const lv_8sc_t* E_code_ptr = E_code; lv_32fc_t* E_out_ptr = E_out; const lv_8sc_t* L_code_ptr = L_code; lv_32fc_t* L_out_ptr = L_out; const lv_8sc_t* P_code_ptr = P_code; lv_32fc_t* P_out_ptr = P_out; - + *E_out_ptr = 0; *P_out_ptr = 0; *L_out_ptr = 0; - + mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); - + E_code_acc = _mm_setzero_ps(); L_code_acc = _mm_setzero_ps(); P_code_acc = _mm_setzero_ps(); - + if (sse_iters>0) - { - for(int number = 0;number < sse_iters; number++){ - - //Perform the carrier wipe-off - x = _mm_lddqu_si128((__m128i*)input_ptr); - y = _mm_lddqu_si128((__m128i*)carrier_ptr); - - CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx) - CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy) - - CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample) - - //Get early values - y = _mm_lddqu_si128((__m128i*)E_code_ptr); - - CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps) - - E_code_acc = _mm_add_ps (E_code_acc, output_ps); - - //Get prompt values - y = _mm_lddqu_si128((__m128i*)P_code_ptr); - - CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps) - - P_code_acc = _mm_add_ps (P_code_acc, output_ps); - - //Get late values - y = _mm_lddqu_si128((__m128i*)L_code_ptr); - - CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, 
imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps) - - L_code_acc = _mm_add_ps (L_code_acc, output_ps); - - input_ptr += 8; - carrier_ptr += 8; - E_code_ptr += 8; - P_code_ptr += 8; - L_code_ptr += 8; - } - - __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2]; - __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2]; - __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2]; - - _mm_storeu_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector - _mm_storeu_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector - _mm_storeu_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector - - for (int i = 0; i<2; ++i) { - *E_out_ptr += E_dotProductVector[i]; - *P_out_ptr += P_dotProductVector[i]; - *L_out_ptr += L_dotProductVector[i]; + for(int number = 0;number < sse_iters; number++) + { + //Perform the carrier wipe-off + x = _mm_lddqu_si128((__m128i*)input_ptr); + y = _mm_lddqu_si128((__m128i*)carrier_ptr); + + CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx) + CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy) + + CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample) + + //Get early values + y = _mm_lddqu_si128((__m128i*)E_code_ptr); + + CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps) + + E_code_acc = _mm_add_ps (E_code_acc, output_ps); + + //Get prompt values + y = _mm_lddqu_si128((__m128i*)P_code_ptr); + + CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps) + + P_code_acc = _mm_add_ps (P_code_acc, output_ps); + + //Get late values + y = _mm_lddqu_si128((__m128i*)L_code_ptr); + + CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps) + + L_code_acc = _mm_add_ps (L_code_acc, output_ps); + + input_ptr += 8; + carrier_ptr += 8; + E_code_ptr += 8; + P_code_ptr += 8; + L_code_ptr += 8; + } + + __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2]; + __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2]; + __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2]; + + _mm_storeu_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector + + for (int i = 0; i<2; ++i) + { + *E_out_ptr += E_dotProductVector[i]; + *P_out_ptr += P_dotProductVector[i]; + *L_out_ptr += L_dotProductVector[i]; + } } - } - + lv_8sc_t bb_signal_sample; for(int i=0; i < num_points%8; ++i) - { - //Perform the carrier wipe-off - 
bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); - // Now get early, late, and prompt values for each - *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); - *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); - *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); - } + { + //Perform the carrier wipe-off + bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); + // Now get early, late, and prompt values for each + *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); + *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); + *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); + } } #endif /* LV_HAVE_SSE4_1 */ @@ -189,104 +190,104 @@ static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_u_sse4_1(lv_32fc_t* E static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_u_sse2(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points) { const unsigned int sse_iters = num_points / 8; - + __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample; __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output; - + __m128 E_code_acc, P_code_acc, L_code_acc; __m128i input_i_1, input_i_2, output_i32; __m128 output_ps_1, output_ps_2; - + const lv_8sc_t* input_ptr = input; const lv_8sc_t* carrier_ptr = carrier; - + const lv_8sc_t* E_code_ptr = E_code; lv_32fc_t* E_out_ptr = E_out; const lv_8sc_t* L_code_ptr = L_code; lv_32fc_t* L_out_ptr = L_out; const lv_8sc_t* P_code_ptr = P_code; lv_32fc_t* P_out_ptr = P_out; - + *E_out_ptr = 0; *P_out_ptr = 0; *L_out_ptr = 0; - + mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); - + E_code_acc = _mm_setzero_ps(); L_code_acc = _mm_setzero_ps(); P_code_acc = _mm_setzero_ps(); - - if (sse_iters>0) - { - for(unsigned int number = 0;number < sse_iters; number++){ - - //Perform the carrier wipe-off - x = _mm_loadu_si128((__m128i*)input_ptr); - y = _mm_loadu_si128((__m128i*)carrier_ptr); - - CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx) - CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy) - - CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample) - - //Get early values - y = _mm_loadu_si128((__m128i*)E_code_ptr); - - CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2) - - E_code_acc = _mm_add_ps (E_code_acc, output_ps_1); - E_code_acc = _mm_add_ps (E_code_acc, output_ps_2); - - //Get prompt values - y = _mm_loadu_si128((__m128i*)P_code_ptr); - - CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2) - - P_code_acc = _mm_add_ps (P_code_acc, output_ps_1); - P_code_acc = _mm_add_ps (P_code_acc, output_ps_2); - - //Get late values - y = _mm_loadu_si128((__m128i*)L_code_ptr); - - CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, 
real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2) - - L_code_acc = _mm_add_ps (L_code_acc, output_ps_1); - L_code_acc = _mm_add_ps (L_code_acc, output_ps_2); - input_ptr += 8; - carrier_ptr += 8; - E_code_ptr += 8; - P_code_ptr += 8; - L_code_ptr += 8; - } - - __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2]; - __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2]; - __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2]; - - _mm_storeu_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector - _mm_storeu_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector - _mm_storeu_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector - - for (unsigned int i = 0; i<2; ++i) + if (sse_iters>0) { - *E_out_ptr += E_dotProductVector[i]; - *P_out_ptr += P_dotProductVector[i]; - *L_out_ptr += L_dotProductVector[i]; + for(unsigned int number = 0;number < sse_iters; number++) + { + //Perform the carrier wipe-off + x = _mm_loadu_si128((__m128i*)input_ptr); + y = _mm_loadu_si128((__m128i*)carrier_ptr); + + CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx) + CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy) + + CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample) + + //Get early values + y = _mm_loadu_si128((__m128i*)E_code_ptr); + + CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2) + + E_code_acc = _mm_add_ps (E_code_acc, output_ps_1); + E_code_acc = _mm_add_ps (E_code_acc, output_ps_2); + + //Get prompt values + y = _mm_loadu_si128((__m128i*)P_code_ptr); + + CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2) + + P_code_acc = _mm_add_ps (P_code_acc, output_ps_1); + P_code_acc = _mm_add_ps (P_code_acc, output_ps_2); + + //Get late values + y = _mm_loadu_si128((__m128i*)L_code_ptr); + + CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2) + + L_code_acc = _mm_add_ps (L_code_acc, output_ps_1); + L_code_acc = _mm_add_ps (L_code_acc, output_ps_2); + + input_ptr += 8; + carrier_ptr += 8; + E_code_ptr += 8; + P_code_ptr += 8; + L_code_ptr += 8; + } + + __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2]; + __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2]; + __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2]; + + _mm_storeu_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot 
product vector + + for (unsigned int i = 0; i<2; ++i) + { + *E_out_ptr += E_dotProductVector[i]; + *P_out_ptr += P_dotProductVector[i]; + *L_out_ptr += L_dotProductVector[i]; + } } - } - + lv_8sc_t bb_signal_sample; for(unsigned int i=0; i < num_points%8; ++i) - { - //Perform the carrier wipe-off - bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); - // Now get early, late, and prompt values for each - *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); - *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); - *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); - } + { + //Perform the carrier wipe-off + bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); + // Now get early, late, and prompt values for each + *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); + *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); + *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); + } } #endif /* LV_HAVE_SSE2 */ @@ -306,22 +307,22 @@ static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_u_sse2(lv_32fc_t* E_o static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_generic(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points) { lv_8sc_t bb_signal_sample; - + bb_signal_sample = lv_cmake(0, 0); - + *E_out = 0; *P_out = 0; *L_out = 0; // perform Early, Prompt and Late correlation for(unsigned int i=0; i < num_points; ++i) - { - //Perform the carrier wipe-off - bb_signal_sample = input[i] * carrier[i]; - // Now get early, late, and prompt values for each - *E_out += (lv_32fc_t) (bb_signal_sample * E_code[i]); - *P_out += (lv_32fc_t) (bb_signal_sample * P_code[i]); - *L_out += (lv_32fc_t) (bb_signal_sample * L_code[i]); - } + { + //Perform the carrier wipe-off + bb_signal_sample = input[i] * carrier[i]; + // Now get early, late, and prompt values for each + *E_out += (lv_32fc_t) (bb_signal_sample * E_code[i]); + *P_out += (lv_32fc_t) (bb_signal_sample * P_code[i]); + *L_out += (lv_32fc_t) (bb_signal_sample * L_code[i]); + } } #endif /* LV_HAVE_GENERIC */ @@ -357,101 +358,101 @@ static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_generic(lv_32fc_t* E_ static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_a_sse4_1(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points) { const unsigned int sse_iters = num_points / 8; - + __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample; __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output; - + __m128 E_code_acc, P_code_acc, L_code_acc; __m128i input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2; __m128 output_ps; - + const lv_8sc_t* input_ptr = input; const lv_8sc_t* carrier_ptr = carrier; - + const lv_8sc_t* E_code_ptr = E_code; lv_32fc_t* E_out_ptr = E_out; const lv_8sc_t* L_code_ptr = L_code; lv_32fc_t* L_out_ptr = L_out; const lv_8sc_t* P_code_ptr = P_code; lv_32fc_t* P_out_ptr = P_out; - + *E_out_ptr = 0; *P_out_ptr = 0; *L_out_ptr = 0; - + mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); - + E_code_acc = _mm_setzero_ps(); L_code_acc = _mm_setzero_ps(); P_code_acc = _mm_setzero_ps(); - + if (sse_iters>0) - { - for(int number = 0;number < sse_iters; 
number++){ - - //Perform the carrier wipe-off - x = _mm_load_si128((__m128i*)input_ptr); - y = _mm_load_si128((__m128i*)carrier_ptr); - - CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx) - CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy) - - CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample) - - //Get early values - y = _mm_load_si128((__m128i*)E_code_ptr); - - CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps) - - E_code_acc = _mm_add_ps (E_code_acc, output_ps); - - //Get prompt values - y = _mm_load_si128((__m128i*)P_code_ptr); - - CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps) - - P_code_acc = _mm_add_ps (P_code_acc, output_ps); - - //Get late values - y = _mm_load_si128((__m128i*)L_code_ptr); - - CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps) - - L_code_acc = _mm_add_ps (L_code_acc, output_ps); - - input_ptr += 8; - carrier_ptr += 8; - E_code_ptr += 8; - P_code_ptr += 8; - L_code_ptr += 8; - } - - __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2]; - __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2]; - __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2]; - - _mm_store_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector - _mm_store_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector - _mm_store_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector - - for (int i = 0; i<2; ++i) { - *E_out_ptr += E_dotProductVector[i]; - *P_out_ptr += P_dotProductVector[i]; - *L_out_ptr += L_dotProductVector[i]; + for(int number = 0;number < sse_iters; number++) + { + //Perform the carrier wipe-off + x = _mm_load_si128((__m128i*)input_ptr); + y = _mm_load_si128((__m128i*)carrier_ptr); + + CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx) + CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy) + + CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample) + + //Get early values + y = _mm_load_si128((__m128i*)E_code_ptr); + + CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps) + + E_code_acc = _mm_add_ps (E_code_acc, output_ps); + + //Get prompt values + y = _mm_load_si128((__m128i*)P_code_ptr); + + CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, 
imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps) + + P_code_acc = _mm_add_ps (P_code_acc, output_ps); + + //Get late values + y = _mm_load_si128((__m128i*)L_code_ptr); + + CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps) + + L_code_acc = _mm_add_ps (L_code_acc, output_ps); + + input_ptr += 8; + carrier_ptr += 8; + E_code_ptr += 8; + P_code_ptr += 8; + L_code_ptr += 8; + } + + __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2]; + __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2]; + __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2]; + + _mm_store_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector + _mm_store_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector + _mm_store_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector + + for (int i = 0; i<2; ++i) + { + *E_out_ptr += E_dotProductVector[i]; + *P_out_ptr += P_dotProductVector[i]; + *L_out_ptr += L_dotProductVector[i]; + } } - } - + lv_8sc_t bb_signal_sample; for(int i=0; i < num_points%8; ++i) - { - //Perform the carrier wipe-off - bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); - // Now get early, late, and prompt values for each - *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); - *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); - *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); - } + { + //Perform the carrier wipe-off + bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); + // Now get early, late, and prompt values for each + *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); + *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); + *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); + } } #endif /* LV_HAVE_SSE4_1 */ @@ -474,104 +475,104 @@ static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_a_sse4_1(lv_32fc_t* E static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_a_sse2(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points) { const unsigned int sse_iters = num_points / 8; - + __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample; __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output; - + __m128 E_code_acc, P_code_acc, L_code_acc; __m128i input_i_1, input_i_2, output_i32; __m128 output_ps_1, output_ps_2; - + const lv_8sc_t* input_ptr = input; const lv_8sc_t* carrier_ptr = carrier; - + const lv_8sc_t* E_code_ptr = E_code; lv_32fc_t* E_out_ptr = E_out; const lv_8sc_t* L_code_ptr = L_code; lv_32fc_t* L_out_ptr = L_out; const lv_8sc_t* P_code_ptr = P_code; lv_32fc_t* P_out_ptr = P_out; - + *E_out_ptr = 0; *P_out_ptr = 0; *L_out_ptr = 0; - + mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); - + E_code_acc = _mm_setzero_ps(); L_code_acc = _mm_setzero_ps(); P_code_acc = _mm_setzero_ps(); - + if (sse_iters>0) - { - for(unsigned int number = 0;number < 
sse_iters; number++){ - - //Perform the carrier wipe-off - x = _mm_load_si128((__m128i*)input_ptr); - y = _mm_load_si128((__m128i*)carrier_ptr); - - CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx) - CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy) - - CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample) - - //Get early values - y = _mm_load_si128((__m128i*)E_code_ptr); - - CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2) - - E_code_acc = _mm_add_ps (E_code_acc, output_ps_1); - E_code_acc = _mm_add_ps (E_code_acc, output_ps_2); - - //Get prompt values - y = _mm_load_si128((__m128i*)P_code_ptr); - - CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2) - - P_code_acc = _mm_add_ps (P_code_acc, output_ps_1); - P_code_acc = _mm_add_ps (P_code_acc, output_ps_2); - - //Get late values - y = _mm_load_si128((__m128i*)L_code_ptr); - - CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2) - - L_code_acc = _mm_add_ps (L_code_acc, output_ps_1); - L_code_acc = _mm_add_ps (L_code_acc, output_ps_2); - - input_ptr += 8; - carrier_ptr += 8; - E_code_ptr += 8; - P_code_ptr += 8; - L_code_ptr += 8; - } - - __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2]; - __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2]; - __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2]; - - _mm_store_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector - _mm_store_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector - _mm_store_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector - - for (unsigned int i = 0; i<2; ++i) { - *E_out_ptr += E_dotProductVector[i]; - *P_out_ptr += P_dotProductVector[i]; - *L_out_ptr += L_dotProductVector[i]; + for(unsigned int number = 0;number < sse_iters; number++) + { + //Perform the carrier wipe-off + x = _mm_load_si128((__m128i*)input_ptr); + y = _mm_load_si128((__m128i*)carrier_ptr); + + CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx) + CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy) + + CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample) + + //Get early values + y = _mm_load_si128((__m128i*)E_code_ptr); + + CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2) + + E_code_acc = _mm_add_ps (E_code_acc, output_ps_1); + E_code_acc = _mm_add_ps (E_code_acc, 
output_ps_2); + + //Get prompt values + y = _mm_load_si128((__m128i*)P_code_ptr); + + CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2) + + P_code_acc = _mm_add_ps (P_code_acc, output_ps_1); + P_code_acc = _mm_add_ps (P_code_acc, output_ps_2); + + //Get late values + y = _mm_load_si128((__m128i*)L_code_ptr); + + CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2) + + L_code_acc = _mm_add_ps (L_code_acc, output_ps_1); + L_code_acc = _mm_add_ps (L_code_acc, output_ps_2); + + input_ptr += 8; + carrier_ptr += 8; + E_code_ptr += 8; + P_code_ptr += 8; + L_code_ptr += 8; + } + + __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2]; + __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2]; + __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2]; + + _mm_store_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector + _mm_store_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector + _mm_store_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector + + for (unsigned int i = 0; i<2; ++i) + { + *E_out_ptr += E_dotProductVector[i]; + *P_out_ptr += P_dotProductVector[i]; + *L_out_ptr += L_dotProductVector[i]; + } } - } - + lv_8sc_t bb_signal_sample; for(unsigned int i=0; i < num_points%8; ++i) - { - //Perform the carrier wipe-off - bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); - // Now get early, late, and prompt values for each - *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); - *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); - *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); - } + { + //Perform the carrier wipe-off + bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); + // Now get early, late, and prompt values for each + *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); + *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); + *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); + } } #endif /* LV_HAVE_SSE2 */ @@ -591,22 +592,22 @@ static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_a_sse2(lv_32fc_t* E_o static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_a_generic(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points) { lv_8sc_t bb_signal_sample; - + bb_signal_sample = lv_cmake(0, 0); - + *E_out = 0; *P_out = 0; *L_out = 0; // perform Early, Prompt and Late correlation for(unsigned int i=0; i < num_points; ++i) - { - //Perform the carrier wipe-off - bb_signal_sample = input[i] * carrier[i]; - // Now get early, late, and prompt values for each - *E_out += (lv_32fc_t) (bb_signal_sample * E_code[i]); - *P_out += (lv_32fc_t) (bb_signal_sample * P_code[i]); - *L_out += (lv_32fc_t) (bb_signal_sample * L_code[i]); - } + { + //Perform the carrier wipe-off + bb_signal_sample = input[i] * carrier[i]; + // Now get early, late, and prompt values for each + *E_out += (lv_32fc_t) (bb_signal_sample * E_code[i]); + *P_out += 
(lv_32fc_t) (bb_signal_sample * P_code[i]); + *L_out += (lv_32fc_t) (bb_signal_sample * L_code[i]); + } } #endif /* LV_HAVE_GENERIC */ diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3.h index 729d0d08e..a2ca4b874 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3.h @@ -2,7 +2,7 @@ * \file volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3.h * \brief Volk protokernel: performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation with 16 bits vectors * \authors
- *          <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+ *          <li> Andres Cecilia, 2014. a.cecilia.luque(at)gmail.com
 *          </ul>
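 *
 * A minimal usage sketch (illustrative only: vector length and contents are
 * hypothetical; the call is to the unaligned SSE2 protokernel defined in this
 * file). Note that the outputs are 8-bit complex accumulators, so long
 * integration times may overflow them:
 * \code
 * lv_8sc_t in[128], carr[128], early[128], prompt[128], late[128];
 * lv_8sc_t E, P, L;
 * // ... fill the input samples, the local carrier and the E/P/L code replicas ...
 * volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_u_sse2(&E, &P, &L, in, carr, early, prompt, late, 128);
 * \endcode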
* * Volk protokernel that performs the carrier wipe-off mixing and the @@ -30,7 +30,7 @@ * GNSS-SDR is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or - * at your option) any later version. + * (at your option) any later version. * * GNSS-SDR is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of @@ -54,8 +54,8 @@ #ifdef LV_HAVE_SSE4_1 #include - /*! - \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation + +/*! \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation \param input The input signal input \param carrier The carrier signal input \param E_code Early PRN code replica input @@ -65,30 +65,30 @@ \param P_out Early correlation output \param L_out Early correlation output \param num_points The number of complex values in vectors - */ + */ static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_u_sse4_1(lv_8sc_t* E_out, lv_8sc_t* P_out, lv_8sc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points) { const unsigned int sse_iters = num_points / 8; - + __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample, real_E_code_acc, imag_E_code_acc, real_L_code_acc, imag_L_code_acc, real_P_code_acc, imag_P_code_acc; __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output; const lv_8sc_t* input_ptr = input; const lv_8sc_t* carrier_ptr = carrier; - + const lv_8sc_t* E_code_ptr = E_code; lv_8sc_t* E_out_ptr = E_out; const lv_8sc_t* L_code_ptr = L_code; lv_8sc_t* L_out_ptr = L_out; const lv_8sc_t* P_code_ptr = P_code; lv_8sc_t* P_out_ptr = P_out; - + *E_out_ptr = 0; *P_out_ptr = 0; *L_out_ptr = 0; - + mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); - + real_E_code_acc = _mm_setzero_si128(); imag_E_code_acc = _mm_setzero_si128(); real_L_code_acc = _mm_setzero_si128(); @@ -97,124 +97,124 @@ static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_u_sse4_1(lv_8sc_t* E_o imag_P_code_acc = _mm_setzero_si128(); if (sse_iters>0) - { - for(int number = 0;number < sse_iters; number++){ - - //Perform the carrier wipe-off - x = _mm_lddqu_si128((__m128i*)input_ptr); - y = _mm_lddqu_si128((__m128i*)carrier_ptr); - - imagx = _mm_srli_si128 (x, 1); - imagx = _mm_and_si128 (imagx, mult1); - realx = _mm_and_si128 (x, mult1); - - imagy = _mm_srli_si128 (y, 1); - imagy = _mm_and_si128 (imagy, mult1); - realy = _mm_and_si128 (y, mult1); - - realx_mult_realy = _mm_mullo_epi16 (realx, realy); - imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); - realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); - imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); - - real_bb_signal_sample = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); - imag_bb_signal_sample = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); - - //Get early values - y = _mm_lddqu_si128((__m128i*)E_code_ptr); - - imagy = _mm_srli_si128 (y, 1); - imagy = _mm_and_si128 (imagy, mult1); - realy = _mm_and_si128 (y, mult1); - - realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); - imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); - realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); - imagx_mult_realy = _mm_mullo_epi16 
(imag_bb_signal_sample, realy); - - real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); - imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); - - real_E_code_acc = _mm_add_epi16 (real_E_code_acc, real_output); - imag_E_code_acc = _mm_add_epi16 (imag_E_code_acc, imag_output); - - //Get late values - y = _mm_lddqu_si128((__m128i*)L_code_ptr); - - imagy = _mm_srli_si128 (y, 1); - imagy = _mm_and_si128 (imagy, mult1); - realy = _mm_and_si128 (y, mult1); - - realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); - imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); - realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); - imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); - - real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); - imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); - - real_L_code_acc = _mm_add_epi16 (real_L_code_acc, real_output); - imag_L_code_acc = _mm_add_epi16 (imag_L_code_acc, imag_output); - - //Get prompt values - y = _mm_lddqu_si128((__m128i*)P_code_ptr); - - imagy = _mm_srli_si128 (y, 1); - imagy = _mm_and_si128 (imagy, mult1); - realy = _mm_and_si128 (y, mult1); - - realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); - imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); - realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); - imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); - - real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); - imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); - - real_P_code_acc = _mm_add_epi16 (real_P_code_acc, real_output); - imag_P_code_acc = _mm_add_epi16 (imag_P_code_acc, imag_output); - - input_ptr += 8; - carrier_ptr += 8; - E_code_ptr += 8; - L_code_ptr += 8; - P_code_ptr += 8; - } - - __VOLK_ATTR_ALIGNED(16) lv_8sc_t E_dotProductVector[8]; - __VOLK_ATTR_ALIGNED(16) lv_8sc_t L_dotProductVector[8]; - __VOLK_ATTR_ALIGNED(16) lv_8sc_t P_dotProductVector[8]; - - imag_E_code_acc = _mm_slli_si128 (imag_E_code_acc, 1); - output = _mm_blendv_epi8 (imag_E_code_acc, real_E_code_acc, mult1); - _mm_storeu_si128((__m128i*)E_dotProductVector, output); - - imag_L_code_acc = _mm_slli_si128 (imag_L_code_acc, 1); - output = _mm_blendv_epi8 (imag_L_code_acc, real_L_code_acc, mult1); - _mm_storeu_si128((__m128i*)L_dotProductVector, output); - - imag_P_code_acc = _mm_slli_si128 (imag_P_code_acc, 1); - output = _mm_blendv_epi8 (imag_P_code_acc, real_P_code_acc, mult1); - _mm_storeu_si128((__m128i*)P_dotProductVector, output); - - for (int i = 0; i<8; ++i) { - *E_out_ptr += E_dotProductVector[i]; - *L_out_ptr += L_dotProductVector[i]; - *P_out_ptr += P_dotProductVector[i]; + for(int number = 0;number < sse_iters; number++) + { + //Perform the carrier wipe-off + x = _mm_lddqu_si128((__m128i*)input_ptr); + y = _mm_lddqu_si128((__m128i*)carrier_ptr); + + imagx = _mm_srli_si128 (x, 1); + imagx = _mm_and_si128 (imagx, mult1); + realx = _mm_and_si128 (x, mult1); + + imagy = _mm_srli_si128 (y, 1); + imagy = _mm_and_si128 (imagy, mult1); + realy = _mm_and_si128 (y, mult1); + + realx_mult_realy = _mm_mullo_epi16 (realx, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); + realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); + + real_bb_signal_sample = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_bb_signal_sample = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + //Get early values + y = 
_mm_lddqu_si128((__m128i*)E_code_ptr); + + imagy = _mm_srli_si128 (y, 1); + imagy = _mm_and_si128 (imagy, mult1); + realy = _mm_and_si128 (y, mult1); + + realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); + realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); + + real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + real_E_code_acc = _mm_add_epi16 (real_E_code_acc, real_output); + imag_E_code_acc = _mm_add_epi16 (imag_E_code_acc, imag_output); + + //Get late values + y = _mm_lddqu_si128((__m128i*)L_code_ptr); + + imagy = _mm_srli_si128 (y, 1); + imagy = _mm_and_si128 (imagy, mult1); + realy = _mm_and_si128 (y, mult1); + + realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); + realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); + + real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + real_L_code_acc = _mm_add_epi16 (real_L_code_acc, real_output); + imag_L_code_acc = _mm_add_epi16 (imag_L_code_acc, imag_output); + + //Get prompt values + y = _mm_lddqu_si128((__m128i*)P_code_ptr); + + imagy = _mm_srli_si128 (y, 1); + imagy = _mm_and_si128 (imagy, mult1); + realy = _mm_and_si128 (y, mult1); + + realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); + realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); + + real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + real_P_code_acc = _mm_add_epi16 (real_P_code_acc, real_output); + imag_P_code_acc = _mm_add_epi16 (imag_P_code_acc, imag_output); + + input_ptr += 8; + carrier_ptr += 8; + E_code_ptr += 8; + L_code_ptr += 8; + P_code_ptr += 8; + } + + __VOLK_ATTR_ALIGNED(16) lv_8sc_t E_dotProductVector[8]; + __VOLK_ATTR_ALIGNED(16) lv_8sc_t L_dotProductVector[8]; + __VOLK_ATTR_ALIGNED(16) lv_8sc_t P_dotProductVector[8]; + + imag_E_code_acc = _mm_slli_si128 (imag_E_code_acc, 1); + output = _mm_blendv_epi8 (imag_E_code_acc, real_E_code_acc, mult1); + _mm_storeu_si128((__m128i*)E_dotProductVector, output); + + imag_L_code_acc = _mm_slli_si128 (imag_L_code_acc, 1); + output = _mm_blendv_epi8 (imag_L_code_acc, real_L_code_acc, mult1); + _mm_storeu_si128((__m128i*)L_dotProductVector, output); + + imag_P_code_acc = _mm_slli_si128 (imag_P_code_acc, 1); + output = _mm_blendv_epi8 (imag_P_code_acc, real_P_code_acc, mult1); + _mm_storeu_si128((__m128i*)P_dotProductVector, output); + + for (int i = 0; i<8; ++i) + { + *E_out_ptr += E_dotProductVector[i]; + *L_out_ptr += L_dotProductVector[i]; + *P_out_ptr += P_dotProductVector[i]; + } } - } - + lv_8sc_t bb_signal_sample; for(int i=0; i < num_points%8; ++i) - { - //Perform the carrier wipe-off - bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); - // Now get early, late, and prompt values for each - *E_out_ptr += bb_signal_sample * (*E_code_ptr++); - *P_out_ptr += bb_signal_sample * (*P_code_ptr++); - *L_out_ptr += bb_signal_sample * (*L_code_ptr++); - } + { + //Perform the 
carrier wipe-off + bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); + // Now get early, late, and prompt values for each + *E_out_ptr += bb_signal_sample * (*E_code_ptr++); + *P_out_ptr += bb_signal_sample * (*P_code_ptr++); + *L_out_ptr += bb_signal_sample * (*L_code_ptr++); + } } #endif /* LV_HAVE_SSE4_1 */ @@ -236,158 +236,158 @@ static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_u_sse4_1(lv_8sc_t* E_o static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_u_sse2(lv_8sc_t* E_out, lv_8sc_t* P_out, lv_8sc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points) { const unsigned int sse_iters = num_points / 8; - + __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample, real_E_code_acc, imag_E_code_acc, real_L_code_acc, imag_L_code_acc, real_P_code_acc, imag_P_code_acc; __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output; - + const lv_8sc_t* input_ptr = input; const lv_8sc_t* carrier_ptr = carrier; - + const lv_8sc_t* E_code_ptr = E_code; lv_8sc_t* E_out_ptr = E_out; const lv_8sc_t* L_code_ptr = L_code; lv_8sc_t* L_out_ptr = L_out; const lv_8sc_t* P_code_ptr = P_code; lv_8sc_t* P_out_ptr = P_out; - + *E_out_ptr = 0; *P_out_ptr = 0; *L_out_ptr = 0; - + mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); - + real_E_code_acc = _mm_setzero_si128(); imag_E_code_acc = _mm_setzero_si128(); real_L_code_acc = _mm_setzero_si128(); imag_L_code_acc = _mm_setzero_si128(); real_P_code_acc = _mm_setzero_si128(); imag_P_code_acc = _mm_setzero_si128(); - + if (sse_iters>0) - { - for(unsigned int number = 0;number < sse_iters; number++){ - - //Perform the carrier wipe-off - x = _mm_loadu_si128((__m128i*)input_ptr); - y = _mm_loadu_si128((__m128i*)carrier_ptr); - - imagx = _mm_srli_si128 (x, 1); - imagx = _mm_and_si128 (imagx, mult1); - realx = _mm_and_si128 (x, mult1); - - imagy = _mm_srli_si128 (y, 1); - imagy = _mm_and_si128 (imagy, mult1); - realy = _mm_and_si128 (y, mult1); - - realx_mult_realy = _mm_mullo_epi16 (realx, realy); - imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); - realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); - imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); - - real_bb_signal_sample = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); - imag_bb_signal_sample = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); - - //Get early values - y = _mm_loadu_si128((__m128i*)E_code_ptr); - - imagy = _mm_srli_si128 (y, 1); - imagy = _mm_and_si128 (imagy, mult1); - realy = _mm_and_si128 (y, mult1); - - realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); - imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); - realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); - imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); - - real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); - imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); - - real_E_code_acc = _mm_add_epi16 (real_E_code_acc, real_output); - imag_E_code_acc = _mm_add_epi16 (imag_E_code_acc, imag_output); - - //Get late values - y = _mm_loadu_si128((__m128i*)L_code_ptr); - - imagy = _mm_srli_si128 (y, 1); - imagy = _mm_and_si128 (imagy, mult1); - realy = _mm_and_si128 (y, mult1); - - realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); - imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, 
imagy); - realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); - imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); - - real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); - imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); - - real_L_code_acc = _mm_add_epi16 (real_L_code_acc, real_output); - imag_L_code_acc = _mm_add_epi16 (imag_L_code_acc, imag_output); - - //Get prompt values - y = _mm_loadu_si128((__m128i*)P_code_ptr); - - imagy = _mm_srli_si128 (y, 1); - imagy = _mm_and_si128 (imagy, mult1); - realy = _mm_and_si128 (y, mult1); - - realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); - imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); - realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); - imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); - - real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); - imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); - - real_P_code_acc = _mm_add_epi16 (real_P_code_acc, real_output); - imag_P_code_acc = _mm_add_epi16 (imag_P_code_acc, imag_output); - - input_ptr += 8; - carrier_ptr += 8; - E_code_ptr += 8; - L_code_ptr += 8; - P_code_ptr += 8; - } - - __VOLK_ATTR_ALIGNED(16) lv_8sc_t E_dotProductVector[8]; - __VOLK_ATTR_ALIGNED(16) lv_8sc_t L_dotProductVector[8]; - __VOLK_ATTR_ALIGNED(16) lv_8sc_t P_dotProductVector[8]; - - real_E_code_acc = _mm_and_si128 (real_E_code_acc, mult1); - imag_E_code_acc = _mm_and_si128 (imag_E_code_acc, mult1); - imag_E_code_acc = _mm_slli_si128 (imag_E_code_acc, 1); - output = _mm_or_si128 (real_E_code_acc, imag_E_code_acc); - _mm_storeu_si128((__m128i*)E_dotProductVector, output); - - real_L_code_acc = _mm_and_si128 (real_L_code_acc, mult1); - imag_L_code_acc = _mm_and_si128 (imag_L_code_acc, mult1); - imag_L_code_acc = _mm_slli_si128 (imag_L_code_acc, 1); - output = _mm_or_si128 (real_L_code_acc, imag_L_code_acc); - _mm_storeu_si128((__m128i*)L_dotProductVector, output); - - real_P_code_acc = _mm_and_si128 (real_P_code_acc, mult1); - imag_P_code_acc = _mm_and_si128 (imag_P_code_acc, mult1); - imag_P_code_acc = _mm_slli_si128 (imag_P_code_acc, 1); - output = _mm_or_si128 (real_P_code_acc, imag_P_code_acc); - _mm_storeu_si128((__m128i*)P_dotProductVector, output); - - for (int i = 0; i<8; ++i) { - *E_out_ptr += E_dotProductVector[i]; - *L_out_ptr += L_dotProductVector[i]; - *P_out_ptr += P_dotProductVector[i]; + for(unsigned int number = 0;number < sse_iters; number++) + { + //Perform the carrier wipe-off + x = _mm_loadu_si128((__m128i*)input_ptr); + y = _mm_loadu_si128((__m128i*)carrier_ptr); + + imagx = _mm_srli_si128 (x, 1); + imagx = _mm_and_si128 (imagx, mult1); + realx = _mm_and_si128 (x, mult1); + + imagy = _mm_srli_si128 (y, 1); + imagy = _mm_and_si128 (imagy, mult1); + realy = _mm_and_si128 (y, mult1); + + realx_mult_realy = _mm_mullo_epi16 (realx, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); + realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); + + real_bb_signal_sample = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_bb_signal_sample = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + //Get early values + y = _mm_loadu_si128((__m128i*)E_code_ptr); + + imagy = _mm_srli_si128 (y, 1); + imagy = _mm_and_si128 (imagy, mult1); + realy = _mm_and_si128 (y, mult1); + + realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); + imagx_mult_imagy = _mm_mullo_epi16 
(imag_bb_signal_sample, imagy); + realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); + + real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + real_E_code_acc = _mm_add_epi16 (real_E_code_acc, real_output); + imag_E_code_acc = _mm_add_epi16 (imag_E_code_acc, imag_output); + + //Get late values + y = _mm_loadu_si128((__m128i*)L_code_ptr); + + imagy = _mm_srli_si128 (y, 1); + imagy = _mm_and_si128 (imagy, mult1); + realy = _mm_and_si128 (y, mult1); + + realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); + realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); + + real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + real_L_code_acc = _mm_add_epi16 (real_L_code_acc, real_output); + imag_L_code_acc = _mm_add_epi16 (imag_L_code_acc, imag_output); + + //Get prompt values + y = _mm_loadu_si128((__m128i*)P_code_ptr); + + imagy = _mm_srli_si128 (y, 1); + imagy = _mm_and_si128 (imagy, mult1); + realy = _mm_and_si128 (y, mult1); + + realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); + realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); + + real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + real_P_code_acc = _mm_add_epi16 (real_P_code_acc, real_output); + imag_P_code_acc = _mm_add_epi16 (imag_P_code_acc, imag_output); + + input_ptr += 8; + carrier_ptr += 8; + E_code_ptr += 8; + L_code_ptr += 8; + P_code_ptr += 8; + } + + __VOLK_ATTR_ALIGNED(16) lv_8sc_t E_dotProductVector[8]; + __VOLK_ATTR_ALIGNED(16) lv_8sc_t L_dotProductVector[8]; + __VOLK_ATTR_ALIGNED(16) lv_8sc_t P_dotProductVector[8]; + + real_E_code_acc = _mm_and_si128 (real_E_code_acc, mult1); + imag_E_code_acc = _mm_and_si128 (imag_E_code_acc, mult1); + imag_E_code_acc = _mm_slli_si128 (imag_E_code_acc, 1); + output = _mm_or_si128 (real_E_code_acc, imag_E_code_acc); + _mm_storeu_si128((__m128i*)E_dotProductVector, output); + + real_L_code_acc = _mm_and_si128 (real_L_code_acc, mult1); + imag_L_code_acc = _mm_and_si128 (imag_L_code_acc, mult1); + imag_L_code_acc = _mm_slli_si128 (imag_L_code_acc, 1); + output = _mm_or_si128 (real_L_code_acc, imag_L_code_acc); + _mm_storeu_si128((__m128i*)L_dotProductVector, output); + + real_P_code_acc = _mm_and_si128 (real_P_code_acc, mult1); + imag_P_code_acc = _mm_and_si128 (imag_P_code_acc, mult1); + imag_P_code_acc = _mm_slli_si128 (imag_P_code_acc, 1); + output = _mm_or_si128 (real_P_code_acc, imag_P_code_acc); + _mm_storeu_si128((__m128i*)P_dotProductVector, output); + + for (int i = 0; i<8; ++i) + { + *E_out_ptr += E_dotProductVector[i]; + *L_out_ptr += L_dotProductVector[i]; + *P_out_ptr += P_dotProductVector[i]; + } } - } - + lv_8sc_t bb_signal_sample; for(unsigned int i=0; i < num_points%8; ++i) - { - //Perform the carrier wipe-off - bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); - // Now get early, late, and prompt values for each - *E_out_ptr += bb_signal_sample * (*E_code_ptr++); - *P_out_ptr += bb_signal_sample * (*P_code_ptr++); 
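/* A minimal sketch of the horizontal step used in the kernels above, under
 * the same assumptions as this file (lv_8sc_t, lv_cmake and
 * __VOLK_ATTR_ALIGNED coming from the volk_gnsssdr complex/common headers):
 * the packed register holds eight partial complex sums, which are spilled to
 * an aligned scratch array and folded into a single lv_8sc_t accumulator. */
#include <emmintrin.h>
static inline lv_8sc_t hsum_8ic_sketch(__m128i packed)
{
    __VOLK_ATTR_ALIGNED(16) lv_8sc_t tmp[8];
    lv_8sc_t acc = lv_cmake(0, 0);
    _mm_store_si128((__m128i*)tmp, packed);     /* spill r0 i0 r1 i1 ... r7 i7 */
    for (int i = 0; i < 8; ++i)
        {
            acc += tmp[i];                      /* complex char addition, wraps mod 256 */
        }
    return acc;
}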
- *L_out_ptr += bb_signal_sample * (*L_code_ptr++); - } + { + //Perform the carrier wipe-off + bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); + // Now get early, late, and prompt values for each + *E_out_ptr += bb_signal_sample * (*E_code_ptr++); + *P_out_ptr += bb_signal_sample * (*P_code_ptr++); + *L_out_ptr += bb_signal_sample * (*L_code_ptr++); + } } #endif /* LV_HAVE_SSE2 */ @@ -408,22 +408,22 @@ static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_u_sse2(lv_8sc_t* E_out static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_generic(lv_8sc_t* E_out, lv_8sc_t* P_out, lv_8sc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points) { lv_8sc_t bb_signal_sample; - + bb_signal_sample = lv_cmake(0, 0); - + *E_out = 0; *P_out = 0; *L_out = 0; // perform Early, Prompt and Late correlation for(unsigned int i=0; i < num_points; ++i) - { - //Perform the carrier wipe-off - bb_signal_sample = input[i] * carrier[i]; - // Now get early, late, and prompt values for each - *E_out += bb_signal_sample * E_code[i]; - *P_out += bb_signal_sample * P_code[i]; - *L_out += bb_signal_sample * L_code[i]; - } + { + //Perform the carrier wipe-off + bb_signal_sample = input[i] * carrier[i]; + // Now get early, late, and prompt values for each + *E_out += bb_signal_sample * E_code[i]; + *P_out += bb_signal_sample * P_code[i]; + *L_out += bb_signal_sample * L_code[i]; + } } #endif /* LV_HAVE_GENERIC */ @@ -457,152 +457,152 @@ static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_generic(lv_8sc_t* E_ou static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_a_sse4_1(lv_8sc_t* E_out, lv_8sc_t* P_out, lv_8sc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points) { const unsigned int sse_iters = num_points / 8; - + __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample, real_E_code_acc, imag_E_code_acc, real_L_code_acc, imag_L_code_acc, real_P_code_acc, imag_P_code_acc; __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output; - + const lv_8sc_t* input_ptr = input; const lv_8sc_t* carrier_ptr = carrier; - + const lv_8sc_t* E_code_ptr = E_code; lv_8sc_t* E_out_ptr = E_out; const lv_8sc_t* L_code_ptr = L_code; lv_8sc_t* L_out_ptr = L_out; const lv_8sc_t* P_code_ptr = P_code; lv_8sc_t* P_out_ptr = P_out; - + *E_out_ptr = 0; *P_out_ptr = 0; *L_out_ptr = 0; - + mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); - + real_E_code_acc = _mm_setzero_si128(); imag_E_code_acc = _mm_setzero_si128(); real_L_code_acc = _mm_setzero_si128(); imag_L_code_acc = _mm_setzero_si128(); real_P_code_acc = _mm_setzero_si128(); imag_P_code_acc = _mm_setzero_si128(); - + if (sse_iters>0) - { - for(int number = 0;number < sse_iters; number++){ - - //Perform the carrier wipe-off - x = _mm_load_si128((__m128i*)input_ptr); - y = _mm_load_si128((__m128i*)carrier_ptr); - - imagx = _mm_srli_si128 (x, 1); - imagx = _mm_and_si128 (imagx, mult1); - realx = _mm_and_si128 (x, mult1); - - imagy = _mm_srli_si128 (y, 1); - imagy = _mm_and_si128 (imagy, mult1); - realy = _mm_and_si128 (y, mult1); - - realx_mult_realy = _mm_mullo_epi16 (realx, realy); - imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); - realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); - imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); - - 
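/* What the srli/and sequence above computes, as a standalone sketch: x holds
 * eight lv_8sc_t samples, i.e. interleaved bytes r0 i0 r1 i1 ... r7 i7, and
 * mult1 is the 0x00FF pattern in every 16-bit lane. Note that the high byte
 * of each lane is zeroed rather than sign-extended; that is exact modulo 256,
 * which is all these 8-bit kernels keep after the final pack. */
#include <emmintrin.h>
static inline void deinterleave_8ic_sketch(__m128i x, __m128i* real16, __m128i* imag16)
{
    const __m128i mask = _mm_set1_epi16(0x00FF);            /* same pattern as mult1 */
    *real16 = _mm_and_si128(x, mask);                       /* r_k in the low byte of each 16-bit lane */
    *imag16 = _mm_and_si128(_mm_srli_si128(x, 1), mask);    /* i_k shifted down one byte, then masked */
}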
real_bb_signal_sample = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); - imag_bb_signal_sample = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); - - //Get early values - y = _mm_load_si128((__m128i*)E_code_ptr); - - imagy = _mm_srli_si128 (y, 1); - imagy = _mm_and_si128 (imagy, mult1); - realy = _mm_and_si128 (y, mult1); - - realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); - imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); - realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); - imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); - - real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); - imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); - - real_E_code_acc = _mm_add_epi16 (real_E_code_acc, real_output); - imag_E_code_acc = _mm_add_epi16 (imag_E_code_acc, imag_output); - - //Get late values - y = _mm_load_si128((__m128i*)L_code_ptr); - - imagy = _mm_srli_si128 (y, 1); - imagy = _mm_and_si128 (imagy, mult1); - realy = _mm_and_si128 (y, mult1); - - realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); - imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); - realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); - imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); - - real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); - imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); - - real_L_code_acc = _mm_add_epi16 (real_L_code_acc, real_output); - imag_L_code_acc = _mm_add_epi16 (imag_L_code_acc, imag_output); - - //Get prompt values - y = _mm_load_si128((__m128i*)P_code_ptr); - - imagy = _mm_srli_si128 (y, 1); - imagy = _mm_and_si128 (imagy, mult1); - realy = _mm_and_si128 (y, mult1); - - realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); - imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); - realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); - imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); - - real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); - imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); - - real_P_code_acc = _mm_add_epi16 (real_P_code_acc, real_output); - imag_P_code_acc = _mm_add_epi16 (imag_P_code_acc, imag_output); - - input_ptr += 8; - carrier_ptr += 8; - E_code_ptr += 8; - L_code_ptr += 8; - P_code_ptr += 8; - } - - __VOLK_ATTR_ALIGNED(16) lv_8sc_t E_dotProductVector[8]; - __VOLK_ATTR_ALIGNED(16) lv_8sc_t L_dotProductVector[8]; - __VOLK_ATTR_ALIGNED(16) lv_8sc_t P_dotProductVector[8]; - - imag_E_code_acc = _mm_slli_si128 (imag_E_code_acc, 1); - output = _mm_blendv_epi8 (imag_E_code_acc, real_E_code_acc, mult1); - _mm_store_si128((__m128i*)E_dotProductVector, output); - - imag_L_code_acc = _mm_slli_si128 (imag_L_code_acc, 1); - output = _mm_blendv_epi8 (imag_L_code_acc, real_L_code_acc, mult1); - _mm_store_si128((__m128i*)L_dotProductVector, output); - - imag_P_code_acc = _mm_slli_si128 (imag_P_code_acc, 1); - output = _mm_blendv_epi8 (imag_P_code_acc, real_P_code_acc, mult1); - _mm_store_si128((__m128i*)P_dotProductVector, output); - - for (int i = 0; i<8; ++i) { - *E_out_ptr += E_dotProductVector[i]; - *L_out_ptr += L_dotProductVector[i]; - *P_out_ptr += P_dotProductVector[i]; + for(int number = 0;number < sse_iters; number++) + { + //Perform the carrier wipe-off + x = _mm_load_si128((__m128i*)input_ptr); + y = _mm_load_si128((__m128i*)carrier_ptr); + + imagx = _mm_srli_si128 (x, 1); + imagx = 
_mm_and_si128 (imagx, mult1); + realx = _mm_and_si128 (x, mult1); + + imagy = _mm_srli_si128 (y, 1); + imagy = _mm_and_si128 (imagy, mult1); + realy = _mm_and_si128 (y, mult1); + + realx_mult_realy = _mm_mullo_epi16 (realx, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); + realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); + + real_bb_signal_sample = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_bb_signal_sample = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + //Get early values + y = _mm_load_si128((__m128i*)E_code_ptr); + + imagy = _mm_srli_si128 (y, 1); + imagy = _mm_and_si128 (imagy, mult1); + realy = _mm_and_si128 (y, mult1); + + realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); + realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); + + real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + real_E_code_acc = _mm_add_epi16 (real_E_code_acc, real_output); + imag_E_code_acc = _mm_add_epi16 (imag_E_code_acc, imag_output); + + //Get late values + y = _mm_load_si128((__m128i*)L_code_ptr); + + imagy = _mm_srli_si128 (y, 1); + imagy = _mm_and_si128 (imagy, mult1); + realy = _mm_and_si128 (y, mult1); + + realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); + realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); + + real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + real_L_code_acc = _mm_add_epi16 (real_L_code_acc, real_output); + imag_L_code_acc = _mm_add_epi16 (imag_L_code_acc, imag_output); + + //Get prompt values + y = _mm_load_si128((__m128i*)P_code_ptr); + + imagy = _mm_srli_si128 (y, 1); + imagy = _mm_and_si128 (imagy, mult1); + realy = _mm_and_si128 (y, mult1); + + realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); + realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); + + real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + real_P_code_acc = _mm_add_epi16 (real_P_code_acc, real_output); + imag_P_code_acc = _mm_add_epi16 (imag_P_code_acc, imag_output); + + input_ptr += 8; + carrier_ptr += 8; + E_code_ptr += 8; + L_code_ptr += 8; + P_code_ptr += 8; + } + + __VOLK_ATTR_ALIGNED(16) lv_8sc_t E_dotProductVector[8]; + __VOLK_ATTR_ALIGNED(16) lv_8sc_t L_dotProductVector[8]; + __VOLK_ATTR_ALIGNED(16) lv_8sc_t P_dotProductVector[8]; + + imag_E_code_acc = _mm_slli_si128 (imag_E_code_acc, 1); + output = _mm_blendv_epi8 (imag_E_code_acc, real_E_code_acc, mult1); + _mm_store_si128((__m128i*)E_dotProductVector, output); + + imag_L_code_acc = _mm_slli_si128 (imag_L_code_acc, 1); + output = _mm_blendv_epi8 (imag_L_code_acc, real_L_code_acc, mult1); + _mm_store_si128((__m128i*)L_dotProductVector, output); + + imag_P_code_acc = _mm_slli_si128 (imag_P_code_acc, 1); + output = _mm_blendv_epi8 (imag_P_code_acc, real_P_code_acc, mult1); + 
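/* The SSE4.1 re-interleave above, sketched on its own: the imaginary
 * accumulator is shifted up one byte, and _mm_blendv_epi8 takes the even
 * bytes (mask byte 0xFF, high bit set) from the real accumulator and the odd
 * bytes from the shifted imaginary one, rebuilding r0 i0 r1 i1 ... r7 i7. */
#include <smmintrin.h>
static inline __m128i pack_8ic_sse4_1_sketch(__m128i re_acc, __m128i im_acc)
{
    const __m128i mask = _mm_set1_epi16(0x00FF);    /* selects the real bytes */
    im_acc = _mm_slli_si128(im_acc, 1);             /* move i_k into the odd bytes */
    return _mm_blendv_epi8(im_acc, re_acc, mask);
}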
_mm_store_si128((__m128i*)P_dotProductVector, output); + + for (int i = 0; i<8; ++i) + { + *E_out_ptr += E_dotProductVector[i]; + *L_out_ptr += L_dotProductVector[i]; + *P_out_ptr += P_dotProductVector[i]; + } } - } - + lv_8sc_t bb_signal_sample; for(int i=0; i < num_points%8; ++i) - { - //Perform the carrier wipe-off - bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); - // Now get early, late, and prompt values for each - *E_out_ptr += bb_signal_sample * (*E_code_ptr++); - *P_out_ptr += bb_signal_sample * (*P_code_ptr++); - *L_out_ptr += bb_signal_sample * (*L_code_ptr++); - } + { + //Perform the carrier wipe-off + bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); + // Now get early, late, and prompt values for each + *E_out_ptr += bb_signal_sample * (*E_code_ptr++); + *P_out_ptr += bb_signal_sample * (*P_code_ptr++); + *L_out_ptr += bb_signal_sample * (*L_code_ptr++); + } } #endif /* LV_HAVE_SSE4_1 */ @@ -624,158 +624,158 @@ static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_a_sse4_1(lv_8sc_t* E_o static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_a_sse2(lv_8sc_t* E_out, lv_8sc_t* P_out, lv_8sc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points) { const unsigned int sse_iters = num_points / 8; - + __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample, real_E_code_acc, imag_E_code_acc, real_L_code_acc, imag_L_code_acc, real_P_code_acc, imag_P_code_acc; __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output; - + const lv_8sc_t* input_ptr = input; const lv_8sc_t* carrier_ptr = carrier; - + const lv_8sc_t* E_code_ptr = E_code; lv_8sc_t* E_out_ptr = E_out; const lv_8sc_t* L_code_ptr = L_code; lv_8sc_t* L_out_ptr = L_out; const lv_8sc_t* P_code_ptr = P_code; lv_8sc_t* P_out_ptr = P_out; - + *E_out_ptr = 0; *P_out_ptr = 0; *L_out_ptr = 0; - + mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); - + real_E_code_acc = _mm_setzero_si128(); imag_E_code_acc = _mm_setzero_si128(); real_L_code_acc = _mm_setzero_si128(); imag_L_code_acc = _mm_setzero_si128(); real_P_code_acc = _mm_setzero_si128(); imag_P_code_acc = _mm_setzero_si128(); - + if (sse_iters>0) - { - for(unsigned int number = 0;number < sse_iters; number++){ - - //Perform the carrier wipe-off - x = _mm_load_si128((__m128i*)input_ptr); - y = _mm_load_si128((__m128i*)carrier_ptr); - - imagx = _mm_srli_si128 (x, 1); - imagx = _mm_and_si128 (imagx, mult1); - realx = _mm_and_si128 (x, mult1); - - imagy = _mm_srli_si128 (y, 1); - imagy = _mm_and_si128 (imagy, mult1); - realy = _mm_and_si128 (y, mult1); - - realx_mult_realy = _mm_mullo_epi16 (realx, realy); - imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); - realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); - imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); - - real_bb_signal_sample = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); - imag_bb_signal_sample = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); - - //Get early values - y = _mm_load_si128((__m128i*)E_code_ptr); - - imagy = _mm_srli_si128 (y, 1); - imagy = _mm_and_si128 (imagy, mult1); - realy = _mm_and_si128 (y, mult1); - - realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); - imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); - realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); - imagx_mult_realy = _mm_mullo_epi16 
(imag_bb_signal_sample, realy); - - real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); - imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); - - real_E_code_acc = _mm_add_epi16 (real_E_code_acc, real_output); - imag_E_code_acc = _mm_add_epi16 (imag_E_code_acc, imag_output); - - //Get late values - y = _mm_load_si128((__m128i*)L_code_ptr); - - imagy = _mm_srli_si128 (y, 1); - imagy = _mm_and_si128 (imagy, mult1); - realy = _mm_and_si128 (y, mult1); - - realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); - imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); - realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); - imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); - - real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); - imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); - - real_L_code_acc = _mm_add_epi16 (real_L_code_acc, real_output); - imag_L_code_acc = _mm_add_epi16 (imag_L_code_acc, imag_output); - - //Get prompt values - y = _mm_load_si128((__m128i*)P_code_ptr); - - imagy = _mm_srli_si128 (y, 1); - imagy = _mm_and_si128 (imagy, mult1); - realy = _mm_and_si128 (y, mult1); - - realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); - imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); - realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); - imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); - - real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); - imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); - - real_P_code_acc = _mm_add_epi16 (real_P_code_acc, real_output); - imag_P_code_acc = _mm_add_epi16 (imag_P_code_acc, imag_output); - - input_ptr += 8; - carrier_ptr += 8; - E_code_ptr += 8; - L_code_ptr += 8; - P_code_ptr += 8; - } - - __VOLK_ATTR_ALIGNED(16) lv_8sc_t E_dotProductVector[8]; - __VOLK_ATTR_ALIGNED(16) lv_8sc_t L_dotProductVector[8]; - __VOLK_ATTR_ALIGNED(16) lv_8sc_t P_dotProductVector[8]; - - real_E_code_acc = _mm_and_si128 (real_E_code_acc, mult1); - imag_E_code_acc = _mm_and_si128 (imag_E_code_acc, mult1); - imag_E_code_acc = _mm_slli_si128 (imag_E_code_acc, 1); - output = _mm_or_si128 (real_E_code_acc, imag_E_code_acc); - _mm_store_si128((__m128i*)E_dotProductVector, output); - - real_L_code_acc = _mm_and_si128 (real_L_code_acc, mult1); - imag_L_code_acc = _mm_and_si128 (imag_L_code_acc, mult1); - imag_L_code_acc = _mm_slli_si128 (imag_L_code_acc, 1); - output = _mm_or_si128 (real_L_code_acc, imag_L_code_acc); - _mm_store_si128((__m128i*)L_dotProductVector, output); - - real_P_code_acc = _mm_and_si128 (real_P_code_acc, mult1); - imag_P_code_acc = _mm_and_si128 (imag_P_code_acc, mult1); - imag_P_code_acc = _mm_slli_si128 (imag_P_code_acc, 1); - output = _mm_or_si128 (real_P_code_acc, imag_P_code_acc); - _mm_store_si128((__m128i*)P_dotProductVector, output); - - for (unsigned int i = 0; i<8; ++i) { - *E_out_ptr += E_dotProductVector[i]; - *L_out_ptr += L_dotProductVector[i]; - *P_out_ptr += P_dotProductVector[i]; + for(unsigned int number = 0;number < sse_iters; number++) + { + //Perform the carrier wipe-off + x = _mm_load_si128((__m128i*)input_ptr); + y = _mm_load_si128((__m128i*)carrier_ptr); + + imagx = _mm_srli_si128 (x, 1); + imagx = _mm_and_si128 (imagx, mult1); + realx = _mm_and_si128 (x, mult1); + + imagy = _mm_srli_si128 (y, 1); + imagy = _mm_and_si128 (imagy, mult1); + realy = _mm_and_si128 (y, mult1); + + realx_mult_realy = _mm_mullo_epi16 (realx, realy); + 
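/* The four mullo/sub/add steps above are the usual complex product, applied
 * lane by lane in 16-bit arithmetic: (a + jb)(c + jd) = (ac - bd) + j(ad + bc).
 * The same step in isolation, as a sketch: */
#include <emmintrin.h>
static inline void cmul_16lanes_sketch(__m128i ar, __m128i ai, __m128i br, __m128i bi,
        __m128i* re, __m128i* im)
{
    *re = _mm_sub_epi16(_mm_mullo_epi16(ar, br), _mm_mullo_epi16(ai, bi));  /* ac - bd */
    *im = _mm_add_epi16(_mm_mullo_epi16(ar, bi), _mm_mullo_epi16(ai, br));  /* ad + bc */
}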
imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); + realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); + + real_bb_signal_sample = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_bb_signal_sample = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + //Get early values + y = _mm_load_si128((__m128i*)E_code_ptr); + + imagy = _mm_srli_si128 (y, 1); + imagy = _mm_and_si128 (imagy, mult1); + realy = _mm_and_si128 (y, mult1); + + realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); + realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); + + real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + real_E_code_acc = _mm_add_epi16 (real_E_code_acc, real_output); + imag_E_code_acc = _mm_add_epi16 (imag_E_code_acc, imag_output); + + //Get late values + y = _mm_load_si128((__m128i*)L_code_ptr); + + imagy = _mm_srli_si128 (y, 1); + imagy = _mm_and_si128 (imagy, mult1); + realy = _mm_and_si128 (y, mult1); + + realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); + realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); + + real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + real_L_code_acc = _mm_add_epi16 (real_L_code_acc, real_output); + imag_L_code_acc = _mm_add_epi16 (imag_L_code_acc, imag_output); + + //Get prompt values + y = _mm_load_si128((__m128i*)P_code_ptr); + + imagy = _mm_srli_si128 (y, 1); + imagy = _mm_and_si128 (imagy, mult1); + realy = _mm_and_si128 (y, mult1); + + realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); + realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); + + real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + real_P_code_acc = _mm_add_epi16 (real_P_code_acc, real_output); + imag_P_code_acc = _mm_add_epi16 (imag_P_code_acc, imag_output); + + input_ptr += 8; + carrier_ptr += 8; + E_code_ptr += 8; + L_code_ptr += 8; + P_code_ptr += 8; + } + + __VOLK_ATTR_ALIGNED(16) lv_8sc_t E_dotProductVector[8]; + __VOLK_ATTR_ALIGNED(16) lv_8sc_t L_dotProductVector[8]; + __VOLK_ATTR_ALIGNED(16) lv_8sc_t P_dotProductVector[8]; + + real_E_code_acc = _mm_and_si128 (real_E_code_acc, mult1); + imag_E_code_acc = _mm_and_si128 (imag_E_code_acc, mult1); + imag_E_code_acc = _mm_slli_si128 (imag_E_code_acc, 1); + output = _mm_or_si128 (real_E_code_acc, imag_E_code_acc); + _mm_store_si128((__m128i*)E_dotProductVector, output); + + real_L_code_acc = _mm_and_si128 (real_L_code_acc, mult1); + imag_L_code_acc = _mm_and_si128 (imag_L_code_acc, mult1); + imag_L_code_acc = _mm_slli_si128 (imag_L_code_acc, 1); + output = _mm_or_si128 (real_L_code_acc, imag_L_code_acc); + _mm_store_si128((__m128i*)L_dotProductVector, output); + + real_P_code_acc = _mm_and_si128 (real_P_code_acc, mult1); + imag_P_code_acc = _mm_and_si128 (imag_P_code_acc, mult1); + imag_P_code_acc = _mm_slli_si128 (imag_P_code_acc, 1); + 
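/* Range note, as a scalar model: the products and sums above live in 16-bit
 * lanes, and the pack keeps only the low byte, so every lv_8sc_t result is
 * the true value modulo 256. The sketch below (assuming the lv_creal/lv_cimag
 * helpers from the volk_gnsssdr complex header) mirrors that lane arithmetic;
 * if wrap-around matters, the safe x7 variants elsewhere in this patch
 * accumulate into float32 instead. */
static inline lv_8sc_t cmul_8ic_mod256_sketch(lv_8sc_t a, lv_8sc_t b)
{
    short re = (short)(lv_creal(a) * lv_creal(b) - lv_cimag(a) * lv_cimag(b));
    short im = (short)(lv_creal(a) * lv_cimag(b) + lv_cimag(a) * lv_creal(b));
    return lv_cmake((char)re, (char)im);    /* two's complement truncation to 8 bits == mod 256 */
}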
output = _mm_or_si128 (real_P_code_acc, imag_P_code_acc); + _mm_store_si128((__m128i*)P_dotProductVector, output); + + for (unsigned int i = 0; i<8; ++i) + { + *E_out_ptr += E_dotProductVector[i]; + *L_out_ptr += L_dotProductVector[i]; + *P_out_ptr += P_dotProductVector[i]; + } } - } - + lv_8sc_t bb_signal_sample; for(unsigned int i=0; i < num_points%8; ++i) - { - //Perform the carrier wipe-off - bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); - // Now get early, late, and prompt values for each - *E_out_ptr += bb_signal_sample * (*E_code_ptr++); - *P_out_ptr += bb_signal_sample * (*P_code_ptr++); - *L_out_ptr += bb_signal_sample * (*L_code_ptr++); - } + { + //Perform the carrier wipe-off + bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); + // Now get early, late, and prompt values for each + *E_out_ptr += bb_signal_sample * (*E_code_ptr++); + *P_out_ptr += bb_signal_sample * (*P_code_ptr++); + *L_out_ptr += bb_signal_sample * (*L_code_ptr++); + } } #endif /* LV_HAVE_SSE2 */ @@ -796,22 +796,22 @@ static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_a_sse2(lv_8sc_t* E_out static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_a_generic(lv_8sc_t* E_out, lv_8sc_t* P_out, lv_8sc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points) { lv_8sc_t bb_signal_sample; - + bb_signal_sample = lv_cmake(0, 0); - + *E_out = 0; *P_out = 0; *L_out = 0; // perform Early, Prompt and Late correlation for(unsigned int i=0; i < num_points; ++i) - { - //Perform the carrier wipe-off - bb_signal_sample = input[i] * carrier[i]; - // Now get early, late, and prompt values for each - *E_out += bb_signal_sample * E_code[i]; - *P_out += bb_signal_sample * P_code[i]; - *L_out += bb_signal_sample * L_code[i]; - } + { + //Perform the carrier wipe-off + bb_signal_sample = input[i] * carrier[i]; + // Now get early, late, and prompt values for each + *E_out += bb_signal_sample * E_code[i]; + *P_out += bb_signal_sample * P_code[i]; + *L_out += bb_signal_sample * L_code[i]; + } } #endif /* LV_HAVE_GENERIC */ @@ -833,21 +833,21 @@ static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_a_generic(lv_8sc_t* E_ extern void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_first_a_orc_impl(short* E_out_real, short* E_out_imag, short* P_out_real, short* P_out_imag, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, unsigned int num_points); extern void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_second_a_orc_impl(short* L_out_real, short* L_out_imag, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* L_code, unsigned int num_points); static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_u_orc(lv_8sc_t* E_out, lv_8sc_t* P_out, lv_8sc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points){ - + short E_out_real = 0; short E_out_imag = 0; char* E_out_real_c = (char*)&E_out_real; E_out_real_c++; char* E_out_imag_c = (char*)&E_out_imag; E_out_imag_c++; - + short P_out_real = 0; short P_out_imag = 0; char* P_out_real_c = (char*)&P_out_real; P_out_real_c++; char* P_out_imag_c = (char*)&P_out_imag; P_out_imag_c++; - + short L_out_real = 0; short L_out_imag = 0; char* L_out_real_c = (char*)&L_out_real; @@ -857,14 +857,14 @@ static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_u_orc(lv_8sc_t* E_out, volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_first_a_orc_impl( &E_out_real, 
&E_out_imag, &P_out_real, &P_out_imag, input, carrier, E_code, P_code, num_points); volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_second_a_orc_impl( &L_out_real, &L_out_imag, input, carrier, L_code, num_points); - + //ORC implementation of 8ic_x5_cw_epl_corr_8ic_x3 is done in two different functions because it seems that //in one function the length of the code gives memory problems (bad access, segmentation fault). //Also, the maximum number of accumulators that can be used is 4 (and we need 6). //The "carrier wipe-off" step is done two times: once in the first function and once in the second. //Joining all the ORC code in one function would be quicker because the "carrier wipe-off" step would be done just //one time. - + *E_out = lv_cmake(*E_out_real_c, *E_out_imag_c); *P_out = lv_cmake(*P_out_real_c, *P_out_imag_c); *L_out = lv_cmake(*L_out_real_c, *L_out_imag_c); diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5.h index 57f8ab109..e76c67a5b 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5.h @@ -2,7 +2,7 @@ * \file volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5.h * \brief Volk protokernel: performs the carrier wipe-off mixing and the Very early, Early, Prompt, Late and very late correlation with 16 bits vectors, and accumulates the results into float32. In order to avoid overflow, if input, carrier and XX_code have the same number of bits, they must be values between -3 and 3 (2 bits). * \authors <ul>
- *          <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+ *          <li> Andres Cecilia, 2014. a.cecilia.luque(at)gmail.com
 *          </ul>
* * Volk protokernel that performs the carrier wipe-off mixing and the @@ -50,7 +50,7 @@ * GNSS-SDR is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or - * at your option) any later version. + * (at your option) any later version. * * GNSS-SDR is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of @@ -95,17 +95,17 @@ static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_u_sse4_1(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points) { const unsigned int sse_iters = num_points / 8; - + __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample; __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output; - + __m128 VE_code_acc, E_code_acc, P_code_acc, L_code_acc, VL_code_acc; __m128i input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2; __m128 output_ps; - + const lv_8sc_t* input_ptr = input; const lv_8sc_t* carrier_ptr = carrier; - + const lv_8sc_t* VE_code_ptr = VE_code; lv_32fc_t* VE_out_ptr = VE_out; const lv_8sc_t* E_code_ptr = E_code; @@ -116,112 +116,112 @@ static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_u_sse4_1(lv_32fc_t* lv_32fc_t* L_out_ptr = L_out; const lv_8sc_t* VL_code_ptr = VL_code; lv_32fc_t* VL_out_ptr = VL_out; - + *VE_out_ptr = 0; *E_out_ptr = 0; *P_out_ptr = 0; *L_out_ptr = 0; *VL_out_ptr = 0; - + mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); - + VE_code_acc = _mm_setzero_ps(); E_code_acc = _mm_setzero_ps(); P_code_acc = _mm_setzero_ps(); L_code_acc = _mm_setzero_ps(); VL_code_acc = _mm_setzero_ps(); - - if (sse_iters>0) - { - for(int number = 0;number < sse_iters; number++){ - - //Perform the carrier wipe-off - x = _mm_lddqu_si128((__m128i*)input_ptr); - y = _mm_lddqu_si128((__m128i*)carrier_ptr); - - CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx) - CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy) - - CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample) - - //Get very early values - y = _mm_lddqu_si128((__m128i*)VE_code_ptr); - - CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps) - - VE_code_acc = _mm_add_ps (VE_code_acc, output_ps); - - //Get early values - y = _mm_lddqu_si128((__m128i*)E_code_ptr); - - CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps) - - E_code_acc = _mm_add_ps (E_code_acc, output_ps); - - //Get prompt values - y = _mm_lddqu_si128((__m128i*)P_code_ptr); - - CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, 
imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps) - P_code_acc = _mm_add_ps (P_code_acc, output_ps); - - //Get late values - y = _mm_lddqu_si128((__m128i*)L_code_ptr); - - CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps) - - L_code_acc = _mm_add_ps (L_code_acc, output_ps); - - //Get very late values - y = _mm_lddqu_si128((__m128i*)VL_code_ptr); - - CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps) - - VL_code_acc = _mm_add_ps (VL_code_acc, output_ps); - - input_ptr += 8; - carrier_ptr += 8; - VE_code_ptr += 8; - E_code_ptr += 8; - P_code_ptr += 8; - L_code_ptr += 8; - VL_code_ptr += 8; - } - - __VOLK_ATTR_ALIGNED(16) lv_32fc_t VE_dotProductVector[2]; - __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2]; - __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2]; - __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2]; - __VOLK_ATTR_ALIGNED(16) lv_32fc_t VL_dotProductVector[2]; - - _mm_storeu_ps((float*)VE_dotProductVector,VE_code_acc); // Store the results back into the dot product vector - _mm_storeu_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector - _mm_storeu_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector - _mm_storeu_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector - _mm_storeu_ps((float*)VL_dotProductVector,VL_code_acc); // Store the results back into the dot product vector - - for (int i = 0; i<2; ++i) + if (sse_iters>0) { - *VE_out_ptr += VE_dotProductVector[i]; - *E_out_ptr += E_dotProductVector[i]; - *P_out_ptr += P_dotProductVector[i]; - *L_out_ptr += L_dotProductVector[i]; - *VL_out_ptr += VL_dotProductVector[i]; + for(int number = 0;number < sse_iters; number++) + { + //Perform the carrier wipe-off + x = _mm_lddqu_si128((__m128i*)input_ptr); + y = _mm_lddqu_si128((__m128i*)carrier_ptr); + + CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx) + CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy) + + CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample) + + //Get very early values + y = _mm_lddqu_si128((__m128i*)VE_code_ptr); + + CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps) + + VE_code_acc = _mm_add_ps (VE_code_acc, output_ps); + + //Get early values + y = _mm_lddqu_si128((__m128i*)E_code_ptr); + + CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, 
output_i32_1, output_i32_2, output_ps) + + E_code_acc = _mm_add_ps (E_code_acc, output_ps); + + //Get prompt values + y = _mm_lddqu_si128((__m128i*)P_code_ptr); + + CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps) + + P_code_acc = _mm_add_ps (P_code_acc, output_ps); + + //Get late values + y = _mm_lddqu_si128((__m128i*)L_code_ptr); + + CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps) + + L_code_acc = _mm_add_ps (L_code_acc, output_ps); + + //Get very late values + y = _mm_lddqu_si128((__m128i*)VL_code_ptr); + + CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps) + + VL_code_acc = _mm_add_ps (VL_code_acc, output_ps); + + input_ptr += 8; + carrier_ptr += 8; + VE_code_ptr += 8; + E_code_ptr += 8; + P_code_ptr += 8; + L_code_ptr += 8; + VL_code_ptr += 8; + } + + __VOLK_ATTR_ALIGNED(16) lv_32fc_t VE_dotProductVector[2]; + __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2]; + __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2]; + __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2]; + __VOLK_ATTR_ALIGNED(16) lv_32fc_t VL_dotProductVector[2]; + + _mm_storeu_ps((float*)VE_dotProductVector,VE_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)VL_dotProductVector,VL_code_acc); // Store the results back into the dot product vector + + for (int i = 0; i<2; ++i) + { + *VE_out_ptr += VE_dotProductVector[i]; + *E_out_ptr += E_dotProductVector[i]; + *P_out_ptr += P_dotProductVector[i]; + *L_out_ptr += L_dotProductVector[i]; + *VL_out_ptr += VL_dotProductVector[i]; + } } - } - + lv_8sc_t bb_signal_sample; for(int i=0; i < num_points%8; ++i) - { - //Perform the carrier wipe-off - bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); - // Now get very early, early, prompt, late and very late values for each - *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VE_code_ptr++)); - *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); - *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); - *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); - *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VL_code_ptr++)); - } + { + //Perform the carrier wipe-off + bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); + // Now get very early, early, prompt, late and very late values for each + *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VE_code_ptr++)); + *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); + *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); + *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); + 
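/* The CM_8IC_X2_CW_CORR_32FC_* macros used above fold the 16-bit correlator
 * lanes into the __m128 float accumulators; their bodies are defined in the
 * CommonMacros headers rather than inline here. Shown below is only a sketch
 * of what such a widening step has to do, using a standard SSE2 idiom for
 * sign-extending the four low 16-bit lanes and converting them to float32: */
#include <emmintrin.h>
static inline __m128 s16_lo_to_ps_sketch(__m128i v)
{
    /* duplicate each 16-bit lane into both halves of a 32-bit lane, then
     * arithmetic-shift right to sign-extend, then convert to float */
    __m128i lo32 = _mm_srai_epi32(_mm_unpacklo_epi16(v, v), 16);
    return _mm_cvtepi32_ps(lo32);
}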
*VL_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VL_code_ptr++)); + } } #endif /* LV_HAVE_SSE4_1 */ @@ -248,17 +248,17 @@ static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_u_sse4_1(lv_32fc_t* static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_u_sse2(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points) { const unsigned int sse_iters = num_points / 8; - + __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample; __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output; - + __m128 VE_code_acc, E_code_acc, P_code_acc, L_code_acc, VL_code_acc; __m128i input_i_1, input_i_2, output_i32; __m128 output_ps_1, output_ps_2; - + const lv_8sc_t* input_ptr = input; const lv_8sc_t* carrier_ptr = carrier; - + const lv_8sc_t* VE_code_ptr = VE_code; lv_32fc_t* VE_out_ptr = VE_out; const lv_8sc_t* E_code_ptr = E_code; @@ -269,117 +269,117 @@ static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_u_sse2(lv_32fc_t* VE lv_32fc_t* L_out_ptr = L_out; const lv_8sc_t* VL_code_ptr = VL_code; lv_32fc_t* VL_out_ptr = VL_out; - + *VE_out_ptr = 0; *E_out_ptr = 0; *P_out_ptr = 0; *L_out_ptr = 0; *VL_out_ptr = 0; - + mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); - + VE_code_acc = _mm_setzero_ps(); E_code_acc = _mm_setzero_ps(); P_code_acc = _mm_setzero_ps(); L_code_acc = _mm_setzero_ps(); VL_code_acc = _mm_setzero_ps(); - - if (sse_iters>0) - { - for(unsigned int number = 0;number < sse_iters; number++){ - - //Perform the carrier wipe-off - x = _mm_loadu_si128((__m128i*)input_ptr); - y = _mm_loadu_si128((__m128i*)carrier_ptr); - - CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx) - CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy) - - CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample) - - //Get very early values - y = _mm_loadu_si128((__m128i*)VE_code_ptr); - - CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2) - - VE_code_acc = _mm_add_ps (VE_code_acc, output_ps_1); - VE_code_acc = _mm_add_ps (VE_code_acc, output_ps_2); - - //Get early values - y = _mm_loadu_si128((__m128i*)E_code_ptr); - - CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2) - - E_code_acc = _mm_add_ps (E_code_acc, output_ps_1); - E_code_acc = _mm_add_ps (E_code_acc, output_ps_2); - - //Get prompt values - y = _mm_loadu_si128((__m128i*)P_code_ptr); - - CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2) - - P_code_acc = _mm_add_ps (P_code_acc, output_ps_1); - P_code_acc 
= _mm_add_ps (P_code_acc, output_ps_2); - - //Get late values - y = _mm_loadu_si128((__m128i*)L_code_ptr); - - CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2) - - L_code_acc = _mm_add_ps (L_code_acc, output_ps_1); - L_code_acc = _mm_add_ps (L_code_acc, output_ps_2); - - //Get very late values - y = _mm_loadu_si128((__m128i*)VL_code_ptr); - - CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2) - - VL_code_acc = _mm_add_ps (VL_code_acc, output_ps_1); - VL_code_acc = _mm_add_ps (VL_code_acc, output_ps_2); - input_ptr += 8; - carrier_ptr += 8; - VE_code_ptr += 8; - E_code_ptr += 8; - P_code_ptr += 8; - L_code_ptr += 8; - VL_code_ptr += 8; - } - - __VOLK_ATTR_ALIGNED(16) lv_32fc_t VE_dotProductVector[2]; - __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2]; - __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2]; - __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2]; - __VOLK_ATTR_ALIGNED(16) lv_32fc_t VL_dotProductVector[2]; - - _mm_storeu_ps((float*)VE_dotProductVector,VE_code_acc); // Store the results back into the dot product vector - _mm_storeu_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector - _mm_storeu_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector - _mm_storeu_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector - _mm_storeu_ps((float*)VL_dotProductVector,VL_code_acc); // Store the results back into the dot product vector - - for (unsigned int i = 0; i<2; ++i) + if (sse_iters>0) { - *VE_out_ptr += VE_dotProductVector[i]; - *E_out_ptr += E_dotProductVector[i]; - *P_out_ptr += P_dotProductVector[i]; - *L_out_ptr += L_dotProductVector[i]; - *VL_out_ptr += VL_dotProductVector[i]; + for(unsigned int number = 0;number < sse_iters; number++) + { + //Perform the carrier wipe-off + x = _mm_loadu_si128((__m128i*)input_ptr); + y = _mm_loadu_si128((__m128i*)carrier_ptr); + + CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx) + CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy) + + CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample) + + //Get very early values + y = _mm_loadu_si128((__m128i*)VE_code_ptr); + + CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2) + + VE_code_acc = _mm_add_ps (VE_code_acc, output_ps_1); + VE_code_acc = _mm_add_ps (VE_code_acc, output_ps_2); + + //Get early values + y = _mm_loadu_si128((__m128i*)E_code_ptr); + + CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2) + + E_code_acc = _mm_add_ps (E_code_acc, 
output_ps_1); + E_code_acc = _mm_add_ps (E_code_acc, output_ps_2); + + //Get prompt values + y = _mm_loadu_si128((__m128i*)P_code_ptr); + + CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2) + + P_code_acc = _mm_add_ps (P_code_acc, output_ps_1); + P_code_acc = _mm_add_ps (P_code_acc, output_ps_2); + + //Get late values + y = _mm_loadu_si128((__m128i*)L_code_ptr); + + CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2) + + L_code_acc = _mm_add_ps (L_code_acc, output_ps_1); + L_code_acc = _mm_add_ps (L_code_acc, output_ps_2); + + //Get very late values + y = _mm_loadu_si128((__m128i*)VL_code_ptr); + + CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2) + + VL_code_acc = _mm_add_ps (VL_code_acc, output_ps_1); + VL_code_acc = _mm_add_ps (VL_code_acc, output_ps_2); + + input_ptr += 8; + carrier_ptr += 8; + VE_code_ptr += 8; + E_code_ptr += 8; + P_code_ptr += 8; + L_code_ptr += 8; + VL_code_ptr += 8; + } + + __VOLK_ATTR_ALIGNED(16) lv_32fc_t VE_dotProductVector[2]; + __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2]; + __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2]; + __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2]; + __VOLK_ATTR_ALIGNED(16) lv_32fc_t VL_dotProductVector[2]; + + _mm_storeu_ps((float*)VE_dotProductVector,VE_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)VL_dotProductVector,VL_code_acc); // Store the results back into the dot product vector + + for (unsigned int i = 0; i<2; ++i) + { + *VE_out_ptr += VE_dotProductVector[i]; + *E_out_ptr += E_dotProductVector[i]; + *P_out_ptr += P_dotProductVector[i]; + *L_out_ptr += L_dotProductVector[i]; + *VL_out_ptr += VL_dotProductVector[i]; + } } - } - + lv_8sc_t bb_signal_sample; for(unsigned int i=0; i < num_points%8; ++i) - { - //Perform the carrier wipe-off - bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); - // Now get very early, early, prompt, late and very late values for each - *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VE_code_ptr++)); - *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); - *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); - *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); - *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VL_code_ptr++)); - } + { + //Perform the carrier wipe-off + bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); + // Now get very early, early, prompt, late and very late values for each + *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VE_code_ptr++)); + *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); + *P_out_ptr += 
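/* This scalar tail mirrors the generic kernel: each 128-bit vector holds
   eight 8-bit complex samples, so after the SIMD loop only
   num_points % 8 samples remain, and they are correlated one at a time. */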
(lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); + *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); + *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VL_code_ptr++)); + } } #endif /* LV_HAVE_SSE2 */ @@ -403,9 +403,9 @@ static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_u_sse2(lv_32fc_t* VE static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points) { lv_8sc_t bb_signal_sample; - + bb_signal_sample = lv_cmake(0, 0); - + *VE_out = 0; *E_out = 0; *P_out = 0; @@ -413,16 +413,16 @@ static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_generic(lv_32fc_t* V *VL_out = 0; // perform very early, Early, Prompt, Late and very late correlation for(unsigned int i=0; i < num_points; ++i) - { - //Perform the carrier wipe-off - bb_signal_sample = input[i] * carrier[i]; + { + //Perform the carrier wipe-off + bb_signal_sample = input[i] * carrier[i]; - *VE_out += (lv_32fc_t) (bb_signal_sample * VE_code[i]); - *E_out += (lv_32fc_t) (bb_signal_sample * E_code[i]); - *P_out += (lv_32fc_t) (bb_signal_sample * P_code[i]); - *L_out += (lv_32fc_t) (bb_signal_sample * L_code[i]); - *VL_out += (lv_32fc_t) (bb_signal_sample * VL_code[i]); - } + *VE_out += (lv_32fc_t) (bb_signal_sample * VE_code[i]); + *E_out += (lv_32fc_t) (bb_signal_sample * E_code[i]); + *P_out += (lv_32fc_t) (bb_signal_sample * P_code[i]); + *L_out += (lv_32fc_t) (bb_signal_sample * L_code[i]); + *VL_out += (lv_32fc_t) (bb_signal_sample * VL_code[i]); + } } #endif /* LV_HAVE_GENERIC */ @@ -462,17 +462,17 @@ static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_generic(lv_32fc_t* V static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_a_sse4_1(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points) { const unsigned int sse_iters = num_points / 8; - + __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample; __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output; - + __m128 VE_code_acc, E_code_acc, P_code_acc, L_code_acc, VL_code_acc; __m128i input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2; __m128 output_ps; - + const lv_8sc_t* input_ptr = input; const lv_8sc_t* carrier_ptr = carrier; - + const lv_8sc_t* VE_code_ptr = VE_code; lv_32fc_t* VE_out_ptr = VE_out; const lv_8sc_t* E_code_ptr = E_code; @@ -483,112 +483,112 @@ static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_a_sse4_1(lv_32fc_t* lv_32fc_t* L_out_ptr = L_out; const lv_8sc_t* VL_code_ptr = VL_code; lv_32fc_t* VL_out_ptr = VL_out; - + *VE_out_ptr = 0; *E_out_ptr = 0; *P_out_ptr = 0; *L_out_ptr = 0; *VL_out_ptr = 0; - + mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); - + VE_code_acc = _mm_setzero_ps(); E_code_acc = _mm_setzero_ps(); P_code_acc = _mm_setzero_ps(); L_code_acc = _mm_setzero_ps(); VL_code_acc = _mm_setzero_ps(); - - if (sse_iters>0) - { - for(int number = 0;number < sse_iters; number++){ - - //Perform the carrier wipe-off - x = _mm_load_si128((__m128i*)input_ptr); - y = 
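/* Unlike the _u_ (unaligned) kernels above, this _a_ variant uses
   _mm_load_si128/_mm_store_ps, which require the input, carrier and code
   buffers to be 16-byte aligned; passing unaligned pointers here is
   undefined behaviour (typically a general-protection fault). */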
_mm_load_si128((__m128i*)carrier_ptr); - - CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx) - CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy) - - CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample) - - //Get very early values - y = _mm_load_si128((__m128i*)VE_code_ptr); - - CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps) - - VE_code_acc = _mm_add_ps (VE_code_acc, output_ps); - - //Get early values - y = _mm_load_si128((__m128i*)E_code_ptr); - - CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps) - - E_code_acc = _mm_add_ps (E_code_acc, output_ps); - - //Get prompt values - y = _mm_load_si128((__m128i*)P_code_ptr); - - CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps) - - P_code_acc = _mm_add_ps (P_code_acc, output_ps); - - //Get late values - y = _mm_load_si128((__m128i*)L_code_ptr); - - CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps) - L_code_acc = _mm_add_ps (L_code_acc, output_ps); - - //Get very late values - y = _mm_load_si128((__m128i*)VL_code_ptr); - - CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps) - - VL_code_acc = _mm_add_ps (VL_code_acc, output_ps); - - input_ptr += 8; - carrier_ptr += 8; - VE_code_ptr += 8; - E_code_ptr += 8; - P_code_ptr += 8; - L_code_ptr += 8; - VL_code_ptr += 8; - } - - __VOLK_ATTR_ALIGNED(16) lv_32fc_t VE_dotProductVector[2]; - __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2]; - __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2]; - __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2]; - __VOLK_ATTR_ALIGNED(16) lv_32fc_t VL_dotProductVector[2]; - - _mm_store_ps((float*)VE_dotProductVector,VE_code_acc); // Store the results back into the dot product vector - _mm_store_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector - _mm_store_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector - _mm_store_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector - _mm_store_ps((float*)VL_dotProductVector,VL_code_acc); // Store the results back into the dot product vector - - for (int i = 0; i<2; ++i) + if (sse_iters>0) { - *VE_out_ptr += VE_dotProductVector[i]; - *E_out_ptr += E_dotProductVector[i]; - *P_out_ptr += 
P_dotProductVector[i]; - *L_out_ptr += L_dotProductVector[i]; - *VL_out_ptr += VL_dotProductVector[i]; + for(int number = 0;number < sse_iters; number++) + { + //Perform the carrier wipe-off + x = _mm_load_si128((__m128i*)input_ptr); + y = _mm_load_si128((__m128i*)carrier_ptr); + + CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx) + CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy) + + CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample) + + //Get very early values + y = _mm_load_si128((__m128i*)VE_code_ptr); + + CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps) + + VE_code_acc = _mm_add_ps (VE_code_acc, output_ps); + + //Get early values + y = _mm_load_si128((__m128i*)E_code_ptr); + + CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps) + + E_code_acc = _mm_add_ps (E_code_acc, output_ps); + + //Get prompt values + y = _mm_load_si128((__m128i*)P_code_ptr); + + CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps) + + P_code_acc = _mm_add_ps (P_code_acc, output_ps); + + //Get late values + y = _mm_load_si128((__m128i*)L_code_ptr); + + CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps) + + L_code_acc = _mm_add_ps (L_code_acc, output_ps); + + //Get very late values + y = _mm_load_si128((__m128i*)VL_code_ptr); + + CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps) + + VL_code_acc = _mm_add_ps (VL_code_acc, output_ps); + + input_ptr += 8; + carrier_ptr += 8; + VE_code_ptr += 8; + E_code_ptr += 8; + P_code_ptr += 8; + L_code_ptr += 8; + VL_code_ptr += 8; + } + + __VOLK_ATTR_ALIGNED(16) lv_32fc_t VE_dotProductVector[2]; + __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2]; + __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2]; + __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2]; + __VOLK_ATTR_ALIGNED(16) lv_32fc_t VL_dotProductVector[2]; + + _mm_store_ps((float*)VE_dotProductVector,VE_code_acc); // Store the results back into the dot product vector + _mm_store_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector + _mm_store_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector + _mm_store_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector + 
_mm_store_ps((float*)VL_dotProductVector,VL_code_acc); // Store the results back into the dot product vector + + for (int i = 0; i<2; ++i) + { + *VE_out_ptr += VE_dotProductVector[i]; + *E_out_ptr += E_dotProductVector[i]; + *P_out_ptr += P_dotProductVector[i]; + *L_out_ptr += L_dotProductVector[i]; + *VL_out_ptr += VL_dotProductVector[i]; + } } - } - + lv_8sc_t bb_signal_sample; for(int i=0; i < num_points%8; ++i) - { - //Perform the carrier wipe-off - bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); - // Now get very early, early, prompt, late and very late values for each - *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VE_code_ptr++)); - *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); - *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); - *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); - *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VL_code_ptr++)); - } + { + //Perform the carrier wipe-off + bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); + // Now get very early, early, prompt, late and very late values for each + *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VE_code_ptr++)); + *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); + *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); + *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); + *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VL_code_ptr++)); + } } #endif /* LV_HAVE_SSE4_1 */ @@ -615,17 +615,17 @@ static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_a_sse4_1(lv_32fc_t* static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_a_sse2(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points) { const unsigned int sse_iters = num_points / 8; - + __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample; __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output; - + __m128 VE_code_acc, E_code_acc, P_code_acc, L_code_acc, VL_code_acc; __m128i input_i_1, input_i_2, output_i32; __m128 output_ps_1, output_ps_2; - + const lv_8sc_t* input_ptr = input; const lv_8sc_t* carrier_ptr = carrier; - + const lv_8sc_t* VE_code_ptr = VE_code; lv_32fc_t* VE_out_ptr = VE_out; const lv_8sc_t* E_code_ptr = E_code; @@ -636,117 +636,117 @@ static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_a_sse2(lv_32fc_t* VE lv_32fc_t* L_out_ptr = L_out; const lv_8sc_t* VL_code_ptr = VL_code; lv_32fc_t* VL_out_ptr = VL_out; - + *VE_out_ptr = 0; *E_out_ptr = 0; *P_out_ptr = 0; *L_out_ptr = 0; *VL_out_ptr = 0; - + mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); - + VE_code_acc = _mm_setzero_ps(); E_code_acc = _mm_setzero_ps(); P_code_acc = _mm_setzero_ps(); L_code_acc = _mm_setzero_ps(); VL_code_acc = _mm_setzero_ps(); - + if (sse_iters>0) - { - for(unsigned int number = 0;number < sse_iters; number++){ - - //Perform the carrier wipe-off - x = _mm_load_si128((__m128i*)input_ptr); - y = _mm_load_si128((__m128i*)carrier_ptr); - - CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx) - CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy) - - CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, 
realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample) - - //Get very early values - y = _mm_load_si128((__m128i*)VE_code_ptr); - - CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2) - - VE_code_acc = _mm_add_ps (VE_code_acc, output_ps_1); - VE_code_acc = _mm_add_ps (VE_code_acc, output_ps_2); - - //Get early values - y = _mm_load_si128((__m128i*)E_code_ptr); - - CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2) - - E_code_acc = _mm_add_ps (E_code_acc, output_ps_1); - E_code_acc = _mm_add_ps (E_code_acc, output_ps_2); - - //Get prompt values - y = _mm_load_si128((__m128i*)P_code_ptr); - - CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2) - - P_code_acc = _mm_add_ps (P_code_acc, output_ps_1); - P_code_acc = _mm_add_ps (P_code_acc, output_ps_2); - - //Get late values - y = _mm_load_si128((__m128i*)L_code_ptr); - - CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2) - - L_code_acc = _mm_add_ps (L_code_acc, output_ps_1); - L_code_acc = _mm_add_ps (L_code_acc, output_ps_2); - - //Get very late values - y = _mm_load_si128((__m128i*)VL_code_ptr); - - CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2) - - VL_code_acc = _mm_add_ps (VL_code_acc, output_ps_1); - VL_code_acc = _mm_add_ps (VL_code_acc, output_ps_2); - - input_ptr += 8; - carrier_ptr += 8; - VE_code_ptr += 8; - E_code_ptr += 8; - P_code_ptr += 8; - L_code_ptr += 8; - VL_code_ptr += 8; - } - - __VOLK_ATTR_ALIGNED(16) lv_32fc_t VE_dotProductVector[2]; - __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2]; - __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2]; - __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2]; - __VOLK_ATTR_ALIGNED(16) lv_32fc_t VL_dotProductVector[2]; - - _mm_store_ps((float*)VE_dotProductVector,VE_code_acc); // Store the results back into the dot product vector - _mm_store_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector - _mm_store_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector - _mm_store_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector - _mm_store_ps((float*)VL_dotProductVector,VL_code_acc); // Store the results back into the dot product vector - - for (unsigned int i = 0; i<2; ++i) { - *VE_out_ptr += VE_dotProductVector[i]; - *E_out_ptr += E_dotProductVector[i]; - *P_out_ptr += P_dotProductVector[i]; - *L_out_ptr += L_dotProductVector[i]; - *VL_out_ptr += VL_dotProductVector[i]; + for(unsigned int number = 
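/* Same correlator as the unaligned SSE2 kernel above, only with aligned
   loads. Note that the SSE2 correlation macro emits two partial float
   vectors (output_ps_1/output_ps_2) per code replica, while the SSE4.1
   macro reduces to a single output_ps; both paths feed the same float
   accumulators. */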
0;number < sse_iters; number++) + { + //Perform the carrier wipe-off + x = _mm_load_si128((__m128i*)input_ptr); + y = _mm_load_si128((__m128i*)carrier_ptr); + + CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx) + CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy) + + CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample) + + //Get very early values + y = _mm_load_si128((__m128i*)VE_code_ptr); + + CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2) + + VE_code_acc = _mm_add_ps (VE_code_acc, output_ps_1); + VE_code_acc = _mm_add_ps (VE_code_acc, output_ps_2); + + //Get early values + y = _mm_load_si128((__m128i*)E_code_ptr); + + CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2) + + E_code_acc = _mm_add_ps (E_code_acc, output_ps_1); + E_code_acc = _mm_add_ps (E_code_acc, output_ps_2); + + //Get prompt values + y = _mm_load_si128((__m128i*)P_code_ptr); + + CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2) + + P_code_acc = _mm_add_ps (P_code_acc, output_ps_1); + P_code_acc = _mm_add_ps (P_code_acc, output_ps_2); + + //Get late values + y = _mm_load_si128((__m128i*)L_code_ptr); + + CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2) + + L_code_acc = _mm_add_ps (L_code_acc, output_ps_1); + L_code_acc = _mm_add_ps (L_code_acc, output_ps_2); + + //Get very late values + y = _mm_load_si128((__m128i*)VL_code_ptr); + + CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2) + + VL_code_acc = _mm_add_ps (VL_code_acc, output_ps_1); + VL_code_acc = _mm_add_ps (VL_code_acc, output_ps_2); + + input_ptr += 8; + carrier_ptr += 8; + VE_code_ptr += 8; + E_code_ptr += 8; + P_code_ptr += 8; + L_code_ptr += 8; + VL_code_ptr += 8; + } + + __VOLK_ATTR_ALIGNED(16) lv_32fc_t VE_dotProductVector[2]; + __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2]; + __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2]; + __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2]; + __VOLK_ATTR_ALIGNED(16) lv_32fc_t VL_dotProductVector[2]; + + _mm_store_ps((float*)VE_dotProductVector,VE_code_acc); // Store the results back into the dot product vector + _mm_store_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector + _mm_store_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector + _mm_store_ps((float*)L_dotProductVector,L_code_acc); // 
Store the results back into the dot product vector + _mm_store_ps((float*)VL_dotProductVector,VL_code_acc); // Store the results back into the dot product vector + + for (unsigned int i = 0; i<2; ++i) + { + *VE_out_ptr += VE_dotProductVector[i]; + *E_out_ptr += E_dotProductVector[i]; + *P_out_ptr += P_dotProductVector[i]; + *L_out_ptr += L_dotProductVector[i]; + *VL_out_ptr += VL_dotProductVector[i]; + } } - } - + lv_8sc_t bb_signal_sample; for(unsigned int i=0; i < num_points%8; ++i) - { - //Perform the carrier wipe-off - bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); - // Now get very early, early, prompt, late and very late values for each - *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VE_code_ptr++)); - *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); - *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); - *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); - *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VL_code_ptr++)); - } + { + //Perform the carrier wipe-off + bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); + // Now get very early, early, prompt, late and very late values for each + *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VE_code_ptr++)); + *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); + *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); + *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); + *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VL_code_ptr++)); + } } #endif /* LV_HAVE_SSE2 */ @@ -770,9 +770,9 @@ static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_a_sse2(lv_32fc_t* VE static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_a_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points) { lv_8sc_t bb_signal_sample; - + bb_signal_sample = lv_cmake(0, 0); - + *VE_out = 0; *E_out = 0; *P_out = 0; @@ -780,16 +780,16 @@ static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_a_generic(lv_32fc_t* *VL_out = 0; // perform very early, Early, Prompt, Late and very late correlation for(unsigned int i=0; i < num_points; ++i) - { - //Perform the carrier wipe-off - bb_signal_sample = input[i] * carrier[i]; - - *VE_out += (lv_32fc_t) (bb_signal_sample * VE_code[i]); - *E_out += (lv_32fc_t) (bb_signal_sample * E_code[i]); - *P_out += (lv_32fc_t) (bb_signal_sample * P_code[i]); - *L_out += (lv_32fc_t) (bb_signal_sample * L_code[i]); - *VL_out += (lv_32fc_t) (bb_signal_sample * VL_code[i]); - } + { + //Perform the carrier wipe-off + bb_signal_sample = input[i] * carrier[i]; + + *VE_out += (lv_32fc_t) (bb_signal_sample * VE_code[i]); + *E_out += (lv_32fc_t) (bb_signal_sample * E_code[i]); + *P_out += (lv_32fc_t) (bb_signal_sample * P_code[i]); + *L_out += (lv_32fc_t) (bb_signal_sample * L_code[i]); + *VL_out += (lv_32fc_t) (bb_signal_sample * VL_code[i]); + } } #endif /* LV_HAVE_GENERIC */ diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5.h index c6110b97c..8b4b91cb0 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5.h +++ 
b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5.h @@ -2,7 +2,7 @@ * \file volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5.h * \brief Volk protokernel: performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation with 16-bit vectors, using different methods: u_sse4_1_first implements one method, u_sse4_1_second implements another, and so on. This protokernel was created to compare the performance of the different methods. * \authors
- *          Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+ *          Andres Cecilia, 2014. a.cecilia.luque(at)gmail.com
 *
* * Volk protokernel that performs the carrier wipe-off mixing and the @@ -35,7 +35,7 @@ * GNSS-SDR is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or - * at your option) any later version. + * (at your option) any later version. * * GNSS-SDR is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of @@ -80,17 +80,17 @@ static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5_u_sse4_1_first(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points) { const unsigned int sse_iters = num_points / 8; - + __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample; __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output; - + __m128 VE_code_acc, E_code_acc, P_code_acc, L_code_acc, VL_code_acc; __m128i input_i_1, input_i_2, output_i32; __m128 output_ps_1, output_ps_2; - + const lv_8sc_t* input_ptr = input; const lv_8sc_t* carrier_ptr = carrier; - + const lv_8sc_t* VE_code_ptr = VE_code; lv_32fc_t* VE_out_ptr = VE_out; const lv_8sc_t* E_code_ptr = E_code; @@ -101,263 +101,263 @@ static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5_u_sse4_1_first( lv_32fc_t* L_out_ptr = L_out; const lv_8sc_t* VL_code_ptr = VL_code; lv_32fc_t* VL_out_ptr = VL_out; - + *VE_out_ptr = 0; *E_out_ptr = 0; *P_out_ptr = 0; *L_out_ptr = 0; *VL_out_ptr = 0; - + mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); - + VE_code_acc = _mm_setzero_ps(); E_code_acc = _mm_setzero_ps(); P_code_acc = _mm_setzero_ps(); L_code_acc = _mm_setzero_ps(); VL_code_acc = _mm_setzero_ps(); - - if (sse_iters>0) - { - for(int number = 0;number < sse_iters; number++){ - - //Perform the carrier wipe-off - x = _mm_lddqu_si128((__m128i*)input_ptr); - y = _mm_lddqu_si128((__m128i*)carrier_ptr); - - imagx = _mm_srli_si128 (x, 1); - imagx = _mm_and_si128 (imagx, mult1); - realx = _mm_and_si128 (x, mult1); - - imagy = _mm_srli_si128 (y, 1); - imagy = _mm_and_si128 (imagy, mult1); - realy = _mm_and_si128 (y, mult1); - - realx_mult_realy = _mm_mullo_epi16 (realx, realy); - imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); - realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); - imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); - - real_bb_signal_sample = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); - imag_bb_signal_sample = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); - - //Get very early values - y = _mm_lddqu_si128((__m128i*)VE_code_ptr); - - imagy = _mm_srli_si128 (y, 1); - imagy = _mm_and_si128 (imagy, mult1); - realy = _mm_and_si128 (y, mult1); - - realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); - imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); - realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); - imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); - - real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); - imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); - - imag_output = _mm_slli_si128 (imag_output, 1); - output = _mm_blendv_epi8 (imag_output, real_output, mult1); - - 
input_i_1 = _mm_cvtepi8_epi32(output); - output = _mm_srli_si128 (output, 4); - input_i_2 = _mm_cvtepi8_epi32(output); - output = _mm_srli_si128 (output, 4); - output_i32 = _mm_add_epi32 (input_i_1, input_i_2); - output_ps_1 = _mm_cvtepi32_ps(output_i32); - - input_i_1 = _mm_cvtepi8_epi32(output); - output = _mm_srli_si128 (output, 4); - input_i_2 = _mm_cvtepi8_epi32(output); - output = _mm_srli_si128 (output, 4); - output_i32 = _mm_add_epi32 (input_i_1, input_i_2); - output_ps_2 = _mm_cvtepi32_ps(output_i32); - VE_code_acc = _mm_add_ps (VE_code_acc, output_ps_1); - VE_code_acc = _mm_add_ps (VE_code_acc, output_ps_2); - - //Get early values - y = _mm_lddqu_si128((__m128i*)E_code_ptr); - - imagy = _mm_srli_si128 (y, 1); - imagy = _mm_and_si128 (imagy, mult1); - realy = _mm_and_si128 (y, mult1); - - realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); - imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); - realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); - imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); - - real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); - imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); - - imag_output = _mm_slli_si128 (imag_output, 1); - output = _mm_blendv_epi8 (imag_output, real_output, mult1); - - input_i_1 = _mm_cvtepi8_epi32(output); - output = _mm_srli_si128 (output, 4); - input_i_2 = _mm_cvtepi8_epi32(output); - output = _mm_srli_si128 (output, 4); - output_i32 = _mm_add_epi32 (input_i_1, input_i_2); - output_ps_1 = _mm_cvtepi32_ps(output_i32); - - input_i_1 = _mm_cvtepi8_epi32(output); - output = _mm_srli_si128 (output, 4); - input_i_2 = _mm_cvtepi8_epi32(output); - output = _mm_srli_si128 (output, 4); - output_i32 = _mm_add_epi32 (input_i_1, input_i_2); - output_ps_2 = _mm_cvtepi32_ps(output_i32); - - E_code_acc = _mm_add_ps (E_code_acc, output_ps_1); - E_code_acc = _mm_add_ps (E_code_acc, output_ps_2); - - //Get prompt values - y = _mm_lddqu_si128((__m128i*)P_code_ptr); - - imagy = _mm_srli_si128 (y, 1); - imagy = _mm_and_si128 (imagy, mult1); - realy = _mm_and_si128 (y, mult1); - - realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); - imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); - realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); - imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); - - real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); - imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); - - imag_output = _mm_slli_si128 (imag_output, 1); - output = _mm_blendv_epi8 (imag_output, real_output, mult1); - - input_i_1 = _mm_cvtepi8_epi32(output); - output = _mm_srli_si128 (output, 4); - input_i_2 = _mm_cvtepi8_epi32(output); - output = _mm_srli_si128 (output, 4); - output_i32 = _mm_add_epi32 (input_i_1, input_i_2); - output_ps_1 = _mm_cvtepi32_ps(output_i32); - - input_i_1 = _mm_cvtepi8_epi32(output); - output = _mm_srli_si128 (output, 4); - input_i_2 = _mm_cvtepi8_epi32(output); - output = _mm_srli_si128 (output, 4); - output_i32 = _mm_add_epi32 (input_i_1, input_i_2); - output_ps_2 = _mm_cvtepi32_ps(output_i32); - - P_code_acc = _mm_add_ps (P_code_acc, output_ps_1); - P_code_acc = _mm_add_ps (P_code_acc, output_ps_2); - - //Get late values - y = _mm_lddqu_si128((__m128i*)L_code_ptr); - - imagy = _mm_srli_si128 (y, 1); - imagy = _mm_and_si128 (imagy, mult1); - realy = _mm_and_si128 (y, mult1); - - realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); - 
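/* The _mm_cvtepi8_epi32/_mm_srli_si128 chains above sign-extend the
   byte-packed correlation products to 32-bit integers four lanes at a
   time, add them pairwise and convert to single-precision floats, so the
   running accumulators hold floats and cannot wrap around over long
   integration intervals. */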
imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); - realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); - imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); - - real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); - imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); - - imag_output = _mm_slli_si128 (imag_output, 1); - output = _mm_blendv_epi8 (imag_output, real_output, mult1); - - input_i_1 = _mm_cvtepi8_epi32(output); - output = _mm_srli_si128 (output, 4); - input_i_2 = _mm_cvtepi8_epi32(output); - output = _mm_srli_si128 (output, 4); - output_i32 = _mm_add_epi32 (input_i_1, input_i_2); - output_ps_1 = _mm_cvtepi32_ps(output_i32); - - input_i_1 = _mm_cvtepi8_epi32(output); - output = _mm_srli_si128 (output, 4); - input_i_2 = _mm_cvtepi8_epi32(output); - output = _mm_srli_si128 (output, 4); - output_i32 = _mm_add_epi32 (input_i_1, input_i_2); - output_ps_2 = _mm_cvtepi32_ps(output_i32); - - L_code_acc = _mm_add_ps (L_code_acc, output_ps_1); - L_code_acc = _mm_add_ps (L_code_acc, output_ps_2); - - //Get very late values - y = _mm_lddqu_si128((__m128i*)VL_code_ptr); - - imagy = _mm_srli_si128 (y, 1); - imagy = _mm_and_si128 (imagy, mult1); - realy = _mm_and_si128 (y, mult1); - - realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); - imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); - realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); - imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); - - real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); - imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); - - imag_output = _mm_slli_si128 (imag_output, 1); - output = _mm_blendv_epi8 (imag_output, real_output, mult1); - - input_i_1 = _mm_cvtepi8_epi32(output); - output = _mm_srli_si128 (output, 4); - input_i_2 = _mm_cvtepi8_epi32(output); - output = _mm_srli_si128 (output, 4); - output_i32 = _mm_add_epi32 (input_i_1, input_i_2); - output_ps_1 = _mm_cvtepi32_ps(output_i32); - - input_i_1 = _mm_cvtepi8_epi32(output); - output = _mm_srli_si128 (output, 4); - input_i_2 = _mm_cvtepi8_epi32(output); - output = _mm_srli_si128 (output, 4); - output_i32 = _mm_add_epi32 (input_i_1, input_i_2); - output_ps_2 = _mm_cvtepi32_ps(output_i32); - - VL_code_acc = _mm_add_ps (VL_code_acc, output_ps_1); - VL_code_acc = _mm_add_ps (VL_code_acc, output_ps_2); - - input_ptr += 8; - carrier_ptr += 8; - VE_code_ptr += 8; - E_code_ptr += 8; - P_code_ptr += 8; - L_code_ptr += 8; - VL_code_ptr += 8; - } - - __VOLK_ATTR_ALIGNED(16) lv_32fc_t VE_dotProductVector[2]; - __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2]; - __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2]; - __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2]; - __VOLK_ATTR_ALIGNED(16) lv_32fc_t VL_dotProductVector[2]; - - _mm_storeu_ps((float*)VE_dotProductVector,VE_code_acc); // Store the results back into the dot product vector - _mm_storeu_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector - _mm_storeu_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector - _mm_storeu_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector - _mm_storeu_ps((float*)VL_dotProductVector,VL_code_acc); // Store the results back into the dot product vector - - for (int i = 0; i<2; ++i) + if (sse_iters>0) { - *VE_out_ptr += VE_dotProductVector[i]; - *E_out_ptr += 
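/* Each __m128 accumulator packs two complex partial sums
   (re0, im0, re1, im1). Storing it to a 16-byte aligned lv_32fc_t[2]
   and adding the two elements is the final horizontal reduction of the
   SIMD partial sums into a single complex output. */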
E_dotProductVector[i]; - *P_out_ptr += P_dotProductVector[i]; - *L_out_ptr += L_dotProductVector[i]; - *VL_out_ptr += VL_dotProductVector[i]; + for(int number = 0;number < sse_iters; number++) + { + //Perform the carrier wipe-off + x = _mm_lddqu_si128((__m128i*)input_ptr); + y = _mm_lddqu_si128((__m128i*)carrier_ptr); + + imagx = _mm_srli_si128 (x, 1); + imagx = _mm_and_si128 (imagx, mult1); + realx = _mm_and_si128 (x, mult1); + + imagy = _mm_srli_si128 (y, 1); + imagy = _mm_and_si128 (imagy, mult1); + realy = _mm_and_si128 (y, mult1); + + realx_mult_realy = _mm_mullo_epi16 (realx, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); + realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); + + real_bb_signal_sample = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_bb_signal_sample = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + //Get very early values + y = _mm_lddqu_si128((__m128i*)VE_code_ptr); + + imagy = _mm_srli_si128 (y, 1); + imagy = _mm_and_si128 (imagy, mult1); + realy = _mm_and_si128 (y, mult1); + + realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); + realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); + + real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + imag_output = _mm_slli_si128 (imag_output, 1); + output = _mm_blendv_epi8 (imag_output, real_output, mult1); + + input_i_1 = _mm_cvtepi8_epi32(output); + output = _mm_srli_si128 (output, 4); + input_i_2 = _mm_cvtepi8_epi32(output); + output = _mm_srli_si128 (output, 4); + output_i32 = _mm_add_epi32 (input_i_1, input_i_2); + output_ps_1 = _mm_cvtepi32_ps(output_i32); + + input_i_1 = _mm_cvtepi8_epi32(output); + output = _mm_srli_si128 (output, 4); + input_i_2 = _mm_cvtepi8_epi32(output); + output = _mm_srli_si128 (output, 4); + output_i32 = _mm_add_epi32 (input_i_1, input_i_2); + output_ps_2 = _mm_cvtepi32_ps(output_i32); + + VE_code_acc = _mm_add_ps (VE_code_acc, output_ps_1); + VE_code_acc = _mm_add_ps (VE_code_acc, output_ps_2); + + //Get early values + y = _mm_lddqu_si128((__m128i*)E_code_ptr); + + imagy = _mm_srli_si128 (y, 1); + imagy = _mm_and_si128 (imagy, mult1); + realy = _mm_and_si128 (y, mult1); + + realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); + realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); + + real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + imag_output = _mm_slli_si128 (imag_output, 1); + output = _mm_blendv_epi8 (imag_output, real_output, mult1); + + input_i_1 = _mm_cvtepi8_epi32(output); + output = _mm_srli_si128 (output, 4); + input_i_2 = _mm_cvtepi8_epi32(output); + output = _mm_srli_si128 (output, 4); + output_i32 = _mm_add_epi32 (input_i_1, input_i_2); + output_ps_1 = _mm_cvtepi32_ps(output_i32); + + input_i_1 = _mm_cvtepi8_epi32(output); + output = _mm_srli_si128 (output, 4); + input_i_2 = _mm_cvtepi8_epi32(output); + output = _mm_srli_si128 (output, 4); + output_i32 = _mm_add_epi32 (input_i_1, input_i_2); + output_ps_2 = _mm_cvtepi32_ps(output_i32); + + E_code_acc = _mm_add_ps (E_code_acc, output_ps_1); + 
E_code_acc = _mm_add_ps (E_code_acc, output_ps_2); + + //Get prompt values + y = _mm_lddqu_si128((__m128i*)P_code_ptr); + + imagy = _mm_srli_si128 (y, 1); + imagy = _mm_and_si128 (imagy, mult1); + realy = _mm_and_si128 (y, mult1); + + realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); + realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); + + real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + imag_output = _mm_slli_si128 (imag_output, 1); + output = _mm_blendv_epi8 (imag_output, real_output, mult1); + + input_i_1 = _mm_cvtepi8_epi32(output); + output = _mm_srli_si128 (output, 4); + input_i_2 = _mm_cvtepi8_epi32(output); + output = _mm_srli_si128 (output, 4); + output_i32 = _mm_add_epi32 (input_i_1, input_i_2); + output_ps_1 = _mm_cvtepi32_ps(output_i32); + + input_i_1 = _mm_cvtepi8_epi32(output); + output = _mm_srli_si128 (output, 4); + input_i_2 = _mm_cvtepi8_epi32(output); + output = _mm_srli_si128 (output, 4); + output_i32 = _mm_add_epi32 (input_i_1, input_i_2); + output_ps_2 = _mm_cvtepi32_ps(output_i32); + + P_code_acc = _mm_add_ps (P_code_acc, output_ps_1); + P_code_acc = _mm_add_ps (P_code_acc, output_ps_2); + + //Get late values + y = _mm_lddqu_si128((__m128i*)L_code_ptr); + + imagy = _mm_srli_si128 (y, 1); + imagy = _mm_and_si128 (imagy, mult1); + realy = _mm_and_si128 (y, mult1); + + realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); + realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); + + real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + imag_output = _mm_slli_si128 (imag_output, 1); + output = _mm_blendv_epi8 (imag_output, real_output, mult1); + + input_i_1 = _mm_cvtepi8_epi32(output); + output = _mm_srli_si128 (output, 4); + input_i_2 = _mm_cvtepi8_epi32(output); + output = _mm_srli_si128 (output, 4); + output_i32 = _mm_add_epi32 (input_i_1, input_i_2); + output_ps_1 = _mm_cvtepi32_ps(output_i32); + + input_i_1 = _mm_cvtepi8_epi32(output); + output = _mm_srli_si128 (output, 4); + input_i_2 = _mm_cvtepi8_epi32(output); + output = _mm_srli_si128 (output, 4); + output_i32 = _mm_add_epi32 (input_i_1, input_i_2); + output_ps_2 = _mm_cvtepi32_ps(output_i32); + + L_code_acc = _mm_add_ps (L_code_acc, output_ps_1); + L_code_acc = _mm_add_ps (L_code_acc, output_ps_2); + + //Get very late values + y = _mm_lddqu_si128((__m128i*)VL_code_ptr); + + imagy = _mm_srli_si128 (y, 1); + imagy = _mm_and_si128 (imagy, mult1); + realy = _mm_and_si128 (y, mult1); + + realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); + realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); + + real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + imag_output = _mm_slli_si128 (imag_output, 1); + output = _mm_blendv_epi8 (imag_output, real_output, mult1); + + input_i_1 = _mm_cvtepi8_epi32(output); + output = _mm_srli_si128 (output, 4); + input_i_2 = 
_mm_cvtepi8_epi32(output); + output = _mm_srli_si128 (output, 4); + output_i32 = _mm_add_epi32 (input_i_1, input_i_2); + output_ps_1 = _mm_cvtepi32_ps(output_i32); + + input_i_1 = _mm_cvtepi8_epi32(output); + output = _mm_srli_si128 (output, 4); + input_i_2 = _mm_cvtepi8_epi32(output); + output = _mm_srli_si128 (output, 4); + output_i32 = _mm_add_epi32 (input_i_1, input_i_2); + output_ps_2 = _mm_cvtepi32_ps(output_i32); + + VL_code_acc = _mm_add_ps (VL_code_acc, output_ps_1); + VL_code_acc = _mm_add_ps (VL_code_acc, output_ps_2); + + input_ptr += 8; + carrier_ptr += 8; + VE_code_ptr += 8; + E_code_ptr += 8; + P_code_ptr += 8; + L_code_ptr += 8; + VL_code_ptr += 8; + } + + __VOLK_ATTR_ALIGNED(16) lv_32fc_t VE_dotProductVector[2]; + __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2]; + __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2]; + __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2]; + __VOLK_ATTR_ALIGNED(16) lv_32fc_t VL_dotProductVector[2]; + + _mm_storeu_ps((float*)VE_dotProductVector,VE_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)VL_dotProductVector,VL_code_acc); // Store the results back into the dot product vector + + for (int i = 0; i<2; ++i) + { + *VE_out_ptr += VE_dotProductVector[i]; + *E_out_ptr += E_dotProductVector[i]; + *P_out_ptr += P_dotProductVector[i]; + *L_out_ptr += L_dotProductVector[i]; + *VL_out_ptr += VL_dotProductVector[i]; + } } - } - + lv_8sc_t bb_signal_sample; for(int i=0; i < num_points%8; ++i) - { - //Perform the carrier wipe-off - bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); - // Now get very early, early, prompt, late and very late values for each - *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VE_code_ptr++)); - *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); - *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); - *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); - *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VL_code_ptr++)); - } + { + //Perform the carrier wipe-off + bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); + // Now get very early, early, prompt, late and very late values for each + *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VE_code_ptr++)); + *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); + *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); + *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); + *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VL_code_ptr++)); + } } #endif /* LV_HAVE_SSE4_1 */ @@ -384,19 +384,19 @@ static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5_u_sse4_1_first( static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5_u_sse4_1_second(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points) { const unsigned int sse_iters = num_points / 8; - + __m128i x, x_abs, y, y_aux, bb_signal_sample_aux, bb_signal_sample_aux_abs;; __m128i mult1, output, real_output, imag_output; - + __m128 
VE_code_acc, E_code_acc, P_code_acc, L_code_acc, VL_code_acc; __m128i input_i_1, input_i_2, output_i32; __m128 output_ps_1, output_ps_2; - + __m128i check_sign_sequence = _mm_set_epi8 (255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1); - + const lv_8sc_t* input_ptr = input; const lv_8sc_t* carrier_ptr = carrier; - + const lv_8sc_t* VE_code_ptr = VE_code; lv_32fc_t* VE_out_ptr = VE_out; const lv_8sc_t* E_code_ptr = E_code; @@ -407,242 +407,242 @@ static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5_u_sse4_1_second lv_32fc_t* L_out_ptr = L_out; const lv_8sc_t* VL_code_ptr = VL_code; lv_32fc_t* VL_out_ptr = VL_out; - + *VE_out_ptr = 0; *E_out_ptr = 0; *P_out_ptr = 0; *L_out_ptr = 0; *VL_out_ptr = 0; - + mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); - + VE_code_acc = _mm_setzero_ps(); E_code_acc = _mm_setzero_ps(); P_code_acc = _mm_setzero_ps(); L_code_acc = _mm_setzero_ps(); VL_code_acc = _mm_setzero_ps(); - - if (sse_iters>0) - { - for(int number = 0;number < sse_iters; number++){ - - //Perform the carrier wipe-off - x = _mm_lddqu_si128((__m128i*)input_ptr); - y = _mm_lddqu_si128((__m128i*)carrier_ptr); - - x_abs = _mm_abs_epi8 (x); - - y_aux = _mm_sign_epi8 (y, x); - y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); - real_output = _mm_maddubs_epi16 (x_abs, y_aux); - - y_aux = _mm_shuffle_epi8 (y, _mm_set_epi8 (14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1)); - y_aux = _mm_sign_epi8 (y_aux, x); - imag_output = _mm_maddubs_epi16 (x_abs, y_aux); - - imag_output = _mm_slli_si128 (imag_output, 1); - bb_signal_sample_aux = _mm_blendv_epi8 (imag_output, real_output, mult1); - - bb_signal_sample_aux_abs = _mm_abs_epi8 (bb_signal_sample_aux); - - //Get very early values - y = _mm_lddqu_si128((__m128i*)VE_code_ptr); - - y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux); - y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); - real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); - - y_aux = _mm_shuffle_epi8 (y, _mm_set_epi8 (14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1)); - y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux); - imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); - imag_output = _mm_slli_si128 (imag_output, 1); - output = _mm_blendv_epi8 (imag_output, real_output, mult1); - - input_i_1 = _mm_cvtepi8_epi32(output); - output = _mm_srli_si128 (output, 4); - input_i_2 = _mm_cvtepi8_epi32(output); - output = _mm_srli_si128 (output, 4); - output_i32 = _mm_add_epi32 (input_i_1, input_i_2); - output_ps_1 = _mm_cvtepi32_ps(output_i32); - - input_i_1 = _mm_cvtepi8_epi32(output); - output = _mm_srli_si128 (output, 4); - input_i_2 = _mm_cvtepi8_epi32(output); - output = _mm_srli_si128 (output, 4); - output_i32 = _mm_add_epi32 (input_i_1, input_i_2); - output_ps_2 = _mm_cvtepi32_ps(output_i32); - - VE_code_acc = _mm_add_ps (VE_code_acc, output_ps_1); - VE_code_acc = _mm_add_ps (VE_code_acc, output_ps_2); - - //Get early values - y = _mm_lddqu_si128((__m128i*)E_code_ptr); - - y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux); - y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); - real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); - - y_aux = _mm_shuffle_epi8 (y, _mm_set_epi8 (14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1)); - y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux); - imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); - - imag_output = _mm_slli_si128 (imag_output, 1); - output = _mm_blendv_epi8 (imag_output, real_output, mult1); - - input_i_1 
= _mm_cvtepi8_epi32(output); - output = _mm_srli_si128 (output, 4); - input_i_2 = _mm_cvtepi8_epi32(output); - output = _mm_srli_si128 (output, 4); - output_i32 = _mm_add_epi32 (input_i_1, input_i_2); - output_ps_1 = _mm_cvtepi32_ps(output_i32); - - input_i_1 = _mm_cvtepi8_epi32(output); - output = _mm_srli_si128 (output, 4); - input_i_2 = _mm_cvtepi8_epi32(output); - output = _mm_srli_si128 (output, 4); - output_i32 = _mm_add_epi32 (input_i_1, input_i_2); - output_ps_2 = _mm_cvtepi32_ps(output_i32); - - E_code_acc = _mm_add_ps (E_code_acc, output_ps_1); - E_code_acc = _mm_add_ps (E_code_acc, output_ps_2); - - //Get prompt values - y = _mm_lddqu_si128((__m128i*)P_code_ptr); - - y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux); - y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); - real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); - - y_aux = _mm_shuffle_epi8 (y, _mm_set_epi8 (14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1)); - y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux); - imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); - - imag_output = _mm_slli_si128 (imag_output, 1); - output = _mm_blendv_epi8 (imag_output, real_output, mult1); - - input_i_1 = _mm_cvtepi8_epi32(output); - output = _mm_srli_si128 (output, 4); - input_i_2 = _mm_cvtepi8_epi32(output); - output = _mm_srli_si128 (output, 4); - output_i32 = _mm_add_epi32 (input_i_1, input_i_2); - output_ps_1 = _mm_cvtepi32_ps(output_i32); - - input_i_1 = _mm_cvtepi8_epi32(output); - output = _mm_srli_si128 (output, 4); - input_i_2 = _mm_cvtepi8_epi32(output); - output = _mm_srli_si128 (output, 4); - output_i32 = _mm_add_epi32 (input_i_1, input_i_2); - output_ps_2 = _mm_cvtepi32_ps(output_i32); - - P_code_acc = _mm_add_ps (P_code_acc, output_ps_1); - P_code_acc = _mm_add_ps (P_code_acc, output_ps_2); - - //Get late values - y = _mm_lddqu_si128((__m128i*)L_code_ptr); - - y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux); - y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); - real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); - - y_aux = _mm_shuffle_epi8 (y, _mm_set_epi8 (14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1)); - y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux); - imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); - - imag_output = _mm_slli_si128 (imag_output, 1); - output = _mm_blendv_epi8 (imag_output, real_output, mult1); - - input_i_1 = _mm_cvtepi8_epi32(output); - output = _mm_srli_si128 (output, 4); - input_i_2 = _mm_cvtepi8_epi32(output); - output = _mm_srli_si128 (output, 4); - output_i32 = _mm_add_epi32 (input_i_1, input_i_2); - output_ps_1 = _mm_cvtepi32_ps(output_i32); - - input_i_1 = _mm_cvtepi8_epi32(output); - output = _mm_srli_si128 (output, 4); - input_i_2 = _mm_cvtepi8_epi32(output); - output = _mm_srli_si128 (output, 4); - output_i32 = _mm_add_epi32 (input_i_1, input_i_2); - output_ps_2 = _mm_cvtepi32_ps(output_i32); - - L_code_acc = _mm_add_ps (L_code_acc, output_ps_1); - L_code_acc = _mm_add_ps (L_code_acc, output_ps_2); - - //Get very late values - y = _mm_lddqu_si128((__m128i*)VL_code_ptr); - - y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux); - y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); - real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); - - y_aux = _mm_shuffle_epi8 (y, _mm_set_epi8 (14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1)); - y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux); - imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); - - imag_output = _mm_slli_si128 
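/* Sketch of the SSSE3 trick used throughout this "second" method:
   _mm_maddubs_epi16 multiplies unsigned bytes by signed bytes, so the
   kernel feeds it abs(x) and transfers the sign of x onto y with
   _mm_sign_epi8. The additional _mm_sign_epi8 against
   check_sign_sequence (alternating +1/-1 bytes) negates the imag*imag
   terms, so the pairwise add yields re(a)*re(b) - im(a)*im(b), the real
   part; the byte shuffle that swaps re/im of y produces the imaginary
   part the same way. Beware that _mm_maddubs_epi16 saturates, which
   bounds the usable input range. */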
(imag_output, 1); - output = _mm_blendv_epi8 (imag_output, real_output, mult1); - - input_i_1 = _mm_cvtepi8_epi32(output); - output = _mm_srli_si128 (output, 4); - input_i_2 = _mm_cvtepi8_epi32(output); - output = _mm_srli_si128 (output, 4); - output_i32 = _mm_add_epi32 (input_i_1, input_i_2); - output_ps_1 = _mm_cvtepi32_ps(output_i32); - - input_i_1 = _mm_cvtepi8_epi32(output); - output = _mm_srli_si128 (output, 4); - input_i_2 = _mm_cvtepi8_epi32(output); - output = _mm_srli_si128 (output, 4); - output_i32 = _mm_add_epi32 (input_i_1, input_i_2); - output_ps_2 = _mm_cvtepi32_ps(output_i32); - - VL_code_acc = _mm_add_ps (VL_code_acc, output_ps_1); - VL_code_acc = _mm_add_ps (VL_code_acc, output_ps_2); - - input_ptr += 8; - carrier_ptr += 8; - VE_code_ptr += 8; - E_code_ptr += 8; - P_code_ptr += 8; - L_code_ptr += 8; - VL_code_ptr += 8; - } - - __VOLK_ATTR_ALIGNED(16) lv_32fc_t VE_dotProductVector[2]; - __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2]; - __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2]; - __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2]; - __VOLK_ATTR_ALIGNED(16) lv_32fc_t VL_dotProductVector[2]; - - _mm_storeu_ps((float*)VE_dotProductVector,VE_code_acc); // Store the results back into the dot product vector - _mm_storeu_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector - _mm_storeu_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector - _mm_storeu_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector - _mm_storeu_ps((float*)VL_dotProductVector,VL_code_acc); // Store the results back into the dot product vector - - for (int i = 0; i<2; ++i) + if (sse_iters>0) { - *VE_out_ptr += VE_dotProductVector[i]; - *E_out_ptr += E_dotProductVector[i]; - *P_out_ptr += P_dotProductVector[i]; - *L_out_ptr += L_dotProductVector[i]; - *VL_out_ptr += VL_dotProductVector[i]; + for(int number = 0;number < sse_iters; number++) + { + //Perform the carrier wipe-off + x = _mm_lddqu_si128((__m128i*)input_ptr); + y = _mm_lddqu_si128((__m128i*)carrier_ptr); + + x_abs = _mm_abs_epi8 (x); + + y_aux = _mm_sign_epi8 (y, x); + y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); + real_output = _mm_maddubs_epi16 (x_abs, y_aux); + + y_aux = _mm_shuffle_epi8 (y, _mm_set_epi8 (14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1)); + y_aux = _mm_sign_epi8 (y_aux, x); + imag_output = _mm_maddubs_epi16 (x_abs, y_aux); + + imag_output = _mm_slli_si128 (imag_output, 1); + bb_signal_sample_aux = _mm_blendv_epi8 (imag_output, real_output, mult1); + + bb_signal_sample_aux_abs = _mm_abs_epi8 (bb_signal_sample_aux); + + //Get very early values + y = _mm_lddqu_si128((__m128i*)VE_code_ptr); + + y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux); + y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); + real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); + + y_aux = _mm_shuffle_epi8 (y, _mm_set_epi8 (14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1)); + y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux); + imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); + + imag_output = _mm_slli_si128 (imag_output, 1); + output = _mm_blendv_epi8 (imag_output, real_output, mult1); + + input_i_1 = _mm_cvtepi8_epi32(output); + output = _mm_srli_si128 (output, 4); + input_i_2 = _mm_cvtepi8_epi32(output); + output = _mm_srli_si128 (output, 4); + output_i32 = _mm_add_epi32 (input_i_1, input_i_2); + output_ps_1 = _mm_cvtepi32_ps(output_i32); + + input_i_1 = 
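The epilogue above spills each __m128 accumulator, which holds two interleaved complex floats, to an aligned buffer and finishes the sum in scalar code. In isolation, reusing the kernel's own names:

    __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProduct[2];
    _mm_storeu_ps((float*)dotProduct, VE_code_acc);  /* spill one accumulator */
    for (int i = 0; i < 2; ++i)
        *VE_out_ptr += dotProduct[i];                /* horizontal complex sum */

Since the buffer is declared 16-byte aligned, the aligned _mm_store_ps would also be valid here; the unaligned store is merely conservative.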
_mm_cvtepi8_epi32(output); + output = _mm_srli_si128 (output, 4); + input_i_2 = _mm_cvtepi8_epi32(output); + output = _mm_srli_si128 (output, 4); + output_i32 = _mm_add_epi32 (input_i_1, input_i_2); + output_ps_2 = _mm_cvtepi32_ps(output_i32); + + VE_code_acc = _mm_add_ps (VE_code_acc, output_ps_1); + VE_code_acc = _mm_add_ps (VE_code_acc, output_ps_2); + + //Get early values + y = _mm_lddqu_si128((__m128i*)E_code_ptr); + + y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux); + y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); + real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); + + y_aux = _mm_shuffle_epi8 (y, _mm_set_epi8 (14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1)); + y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux); + imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); + + imag_output = _mm_slli_si128 (imag_output, 1); + output = _mm_blendv_epi8 (imag_output, real_output, mult1); + + input_i_1 = _mm_cvtepi8_epi32(output); + output = _mm_srli_si128 (output, 4); + input_i_2 = _mm_cvtepi8_epi32(output); + output = _mm_srli_si128 (output, 4); + output_i32 = _mm_add_epi32 (input_i_1, input_i_2); + output_ps_1 = _mm_cvtepi32_ps(output_i32); + + input_i_1 = _mm_cvtepi8_epi32(output); + output = _mm_srli_si128 (output, 4); + input_i_2 = _mm_cvtepi8_epi32(output); + output = _mm_srli_si128 (output, 4); + output_i32 = _mm_add_epi32 (input_i_1, input_i_2); + output_ps_2 = _mm_cvtepi32_ps(output_i32); + + E_code_acc = _mm_add_ps (E_code_acc, output_ps_1); + E_code_acc = _mm_add_ps (E_code_acc, output_ps_2); + + //Get prompt values + y = _mm_lddqu_si128((__m128i*)P_code_ptr); + + y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux); + y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); + real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); + + y_aux = _mm_shuffle_epi8 (y, _mm_set_epi8 (14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1)); + y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux); + imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); + + imag_output = _mm_slli_si128 (imag_output, 1); + output = _mm_blendv_epi8 (imag_output, real_output, mult1); + + input_i_1 = _mm_cvtepi8_epi32(output); + output = _mm_srli_si128 (output, 4); + input_i_2 = _mm_cvtepi8_epi32(output); + output = _mm_srli_si128 (output, 4); + output_i32 = _mm_add_epi32 (input_i_1, input_i_2); + output_ps_1 = _mm_cvtepi32_ps(output_i32); + + input_i_1 = _mm_cvtepi8_epi32(output); + output = _mm_srli_si128 (output, 4); + input_i_2 = _mm_cvtepi8_epi32(output); + output = _mm_srli_si128 (output, 4); + output_i32 = _mm_add_epi32 (input_i_1, input_i_2); + output_ps_2 = _mm_cvtepi32_ps(output_i32); + + P_code_acc = _mm_add_ps (P_code_acc, output_ps_1); + P_code_acc = _mm_add_ps (P_code_acc, output_ps_2); + + //Get late values + y = _mm_lddqu_si128((__m128i*)L_code_ptr); + + y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux); + y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); + real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); + + y_aux = _mm_shuffle_epi8 (y, _mm_set_epi8 (14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1)); + y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux); + imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); + + imag_output = _mm_slli_si128 (imag_output, 1); + output = _mm_blendv_epi8 (imag_output, real_output, mult1); + + input_i_1 = _mm_cvtepi8_epi32(output); + output = _mm_srli_si128 (output, 4); + input_i_2 = _mm_cvtepi8_epi32(output); + output = _mm_srli_si128 (output, 4); + output_i32 = _mm_add_epi32 
(input_i_1, input_i_2); + output_ps_1 = _mm_cvtepi32_ps(output_i32); + + input_i_1 = _mm_cvtepi8_epi32(output); + output = _mm_srli_si128 (output, 4); + input_i_2 = _mm_cvtepi8_epi32(output); + output = _mm_srli_si128 (output, 4); + output_i32 = _mm_add_epi32 (input_i_1, input_i_2); + output_ps_2 = _mm_cvtepi32_ps(output_i32); + + L_code_acc = _mm_add_ps (L_code_acc, output_ps_1); + L_code_acc = _mm_add_ps (L_code_acc, output_ps_2); + + //Get very late values + y = _mm_lddqu_si128((__m128i*)VL_code_ptr); + + y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux); + y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); + real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); + + y_aux = _mm_shuffle_epi8 (y, _mm_set_epi8 (14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1)); + y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux); + imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); + + imag_output = _mm_slli_si128 (imag_output, 1); + output = _mm_blendv_epi8 (imag_output, real_output, mult1); + + input_i_1 = _mm_cvtepi8_epi32(output); + output = _mm_srli_si128 (output, 4); + input_i_2 = _mm_cvtepi8_epi32(output); + output = _mm_srli_si128 (output, 4); + output_i32 = _mm_add_epi32 (input_i_1, input_i_2); + output_ps_1 = _mm_cvtepi32_ps(output_i32); + + input_i_1 = _mm_cvtepi8_epi32(output); + output = _mm_srli_si128 (output, 4); + input_i_2 = _mm_cvtepi8_epi32(output); + output = _mm_srli_si128 (output, 4); + output_i32 = _mm_add_epi32 (input_i_1, input_i_2); + output_ps_2 = _mm_cvtepi32_ps(output_i32); + + VL_code_acc = _mm_add_ps (VL_code_acc, output_ps_1); + VL_code_acc = _mm_add_ps (VL_code_acc, output_ps_2); + + input_ptr += 8; + carrier_ptr += 8; + VE_code_ptr += 8; + E_code_ptr += 8; + P_code_ptr += 8; + L_code_ptr += 8; + VL_code_ptr += 8; + } + + __VOLK_ATTR_ALIGNED(16) lv_32fc_t VE_dotProductVector[2]; + __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2]; + __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2]; + __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2]; + __VOLK_ATTR_ALIGNED(16) lv_32fc_t VL_dotProductVector[2]; + + _mm_storeu_ps((float*)VE_dotProductVector,VE_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)VL_dotProductVector,VL_code_acc); // Store the results back into the dot product vector + + for (int i = 0; i<2; ++i) + { + *VE_out_ptr += VE_dotProductVector[i]; + *E_out_ptr += E_dotProductVector[i]; + *P_out_ptr += P_dotProductVector[i]; + *L_out_ptr += L_dotProductVector[i]; + *VL_out_ptr += VL_dotProductVector[i]; + } } - } - + lv_8sc_t bb_signal_sample; for(int i=0; i < num_points%8; ++i) - { - //Perform the carrier wipe-off - bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); - // Now get very early, early, prompt, late and very late values for each - *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VE_code_ptr++)); - *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); - *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); - *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); - *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VL_code_ptr++)); - } + { + //Perform the carrier wipe-off + bb_signal_sample = (*input_ptr++) * 
(*carrier_ptr++); + // Now get very early, early, prompt, late and very late values for each + *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VE_code_ptr++)); + *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); + *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); + *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); + *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VL_code_ptr++)); + } } #endif /* LV_HAVE_SSE4_1 */ @@ -669,20 +669,20 @@ static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5_u_sse4_1_second static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5_u_sse4_1_third(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points) { const unsigned int sse_iters = num_points / 8; - + __m128i x, x_abs, y, y_aux, bb_signal_sample_aux, bb_signal_sample_aux_abs;; __m128i mult1, real_output, imag_output; - + __m128 real_VE_code_acc, imag_VE_code_acc, real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc, real_VL_code_acc, imag_VL_code_acc; __m128i real_output_i_1, real_output_i_2, imag_output_i_1, imag_output_i_2, real_output_i32, imag_output_i32; __m128 real_output_ps, imag_output_ps; - + __m128i check_sign_sequence = _mm_set_epi8 (255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1); __m128i rearrange_sequence = _mm_set_epi8 (14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1); - + const lv_8sc_t* input_ptr = input; const lv_8sc_t* carrier_ptr = carrier; - + const lv_8sc_t* VE_code_ptr = VE_code; lv_32fc_t* VE_out_ptr = VE_out; const lv_8sc_t* E_code_ptr = E_code; @@ -693,7 +693,7 @@ static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5_u_sse4_1_third( lv_32fc_t* L_out_ptr = L_out; const lv_8sc_t* VL_code_ptr = VL_code; lv_32fc_t* VL_out_ptr = VL_out; - + float VE_out_real = 0; float VE_out_imag = 0; float E_out_real = 0; @@ -704,9 +704,9 @@ static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5_u_sse4_1_third( float L_out_imag = 0; float VL_out_real = 0; float VL_out_imag = 0; - + mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); - + real_VE_code_acc = _mm_setzero_ps(); imag_VE_code_acc = _mm_setzero_ps(); real_E_code_acc = _mm_setzero_ps(); @@ -717,222 +717,222 @@ static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5_u_sse4_1_third( imag_L_code_acc = _mm_setzero_ps(); real_VL_code_acc = _mm_setzero_ps(); imag_VL_code_acc = _mm_setzero_ps(); - + if (sse_iters>0) - { - for(int number = 0;number < sse_iters; number++){ - - //Perform the carrier wipe-off - x = _mm_lddqu_si128((__m128i*)input_ptr); - y = _mm_lddqu_si128((__m128i*)carrier_ptr); - - x_abs = _mm_abs_epi8 (x); - - y_aux = _mm_sign_epi8 (y, x); - y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); - real_output = _mm_maddubs_epi16 (x_abs, y_aux); - - y_aux = _mm_shuffle_epi8 (y, rearrange_sequence); - y_aux = _mm_sign_epi8 (y_aux, x); - imag_output = _mm_maddubs_epi16 (x_abs, y_aux); - - imag_output = _mm_slli_si128 (imag_output, 1); - bb_signal_sample_aux = _mm_blendv_epi8 (imag_output, real_output, mult1); - bb_signal_sample_aux_abs = _mm_abs_epi8 (bb_signal_sample_aux); - - //Get very early values - y = _mm_lddqu_si128((__m128i*)VE_code_ptr); - - y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux); - y_aux = _mm_sign_epi8 
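Every SSE4.1 variant finishes with the same scalar tail: the num_points % 8 samples the vector loop cannot cover are correlated in plain C. One tap shown below; the other four are identical. Note that the tails here declare the counter as int while num_points is unsigned; an unsigned counter avoids the signed/unsigned comparison.

    for (unsigned int i = 0; i < num_points % 8; ++i)
    {
        lv_8sc_t bb = (*input_ptr++) * (*carrier_ptr++);  /* carrier wipe-off */
        *P_out_ptr += (lv_32fc_t)(bb * (*P_code_ptr++));  /* accumulate in float */
    }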
(y_aux, check_sign_sequence); - real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); - - y_aux = _mm_shuffle_epi8 (y, rearrange_sequence); - y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux); - imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); - - real_output_i_1 = _mm_cvtepi16_epi32(real_output); - real_output = _mm_srli_si128 (real_output, 8); - real_output_i_2 = _mm_cvtepi16_epi32(real_output); - real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); - real_output_ps = _mm_cvtepi32_ps(real_output_i32); - - imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); - imag_output = _mm_srli_si128 (imag_output, 8); - imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); - imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2); - imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); - - real_VE_code_acc = _mm_add_ps (real_VE_code_acc, real_output_ps); - imag_VE_code_acc = _mm_add_ps (imag_VE_code_acc, imag_output_ps); - - //Get early values - y = _mm_lddqu_si128((__m128i*)E_code_ptr); - - y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux); - y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); - real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); - - y_aux = _mm_shuffle_epi8 (y, rearrange_sequence); - y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux); - imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); - - real_output_i_1 = _mm_cvtepi16_epi32(real_output); - real_output = _mm_srli_si128 (real_output, 8); - real_output_i_2 = _mm_cvtepi16_epi32(real_output); - real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); - real_output_ps = _mm_cvtepi32_ps(real_output_i32); - - imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); - imag_output = _mm_srli_si128 (imag_output, 8); - imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); - imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2); - imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); - - real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps); - imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps); - - //Get prompt values - y = _mm_lddqu_si128((__m128i*)P_code_ptr); - - y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux); - y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); - real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); - - y_aux = _mm_shuffle_epi8 (y, rearrange_sequence); - y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux); - imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); - - real_output_i_1 = _mm_cvtepi16_epi32(real_output); - real_output = _mm_srli_si128 (real_output, 8); - real_output_i_2 = _mm_cvtepi16_epi32(real_output); - real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); - real_output_ps = _mm_cvtepi32_ps(real_output_i32); - - imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); - imag_output = _mm_srli_si128 (imag_output, 8); - imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); - imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2); - imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); - - real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps); - imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps); - - //Get late values - y = _mm_lddqu_si128((__m128i*)L_code_ptr); - - y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux); - y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); - real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); - - y_aux = _mm_shuffle_epi8 (y, rearrange_sequence); - y_aux = _mm_sign_epi8 (y_aux, 
bb_signal_sample_aux); - imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); - - real_output_i_1 = _mm_cvtepi16_epi32(real_output); - real_output = _mm_srli_si128 (real_output, 8); - real_output_i_2 = _mm_cvtepi16_epi32(real_output); - real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); - real_output_ps = _mm_cvtepi32_ps(real_output_i32); - - imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); - imag_output = _mm_srli_si128 (imag_output, 8); - imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); - imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2); - imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); - - real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps); - imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps); - - //Get very late values - y = _mm_lddqu_si128((__m128i*)VL_code_ptr); - - y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux); - y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); - real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); - - y_aux = _mm_shuffle_epi8 (y, _mm_set_epi8 (14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1)); - y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux); - imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); - - real_output_i_1 = _mm_cvtepi16_epi32(real_output); - real_output = _mm_srli_si128 (real_output, 8); - real_output_i_2 = _mm_cvtepi16_epi32(real_output); - real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); - real_output_ps = _mm_cvtepi32_ps(real_output_i32); - - imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); - imag_output = _mm_srli_si128 (imag_output, 8); - imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); - imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2); - imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); - - real_VL_code_acc = _mm_add_ps (real_VL_code_acc, real_output_ps); - imag_VL_code_acc = _mm_add_ps (imag_VL_code_acc, imag_output_ps); - - input_ptr += 8; - carrier_ptr += 8; - VE_code_ptr += 8; - E_code_ptr += 8; - P_code_ptr += 8; - L_code_ptr += 8; - VL_code_ptr += 8; - } - - __VOLK_ATTR_ALIGNED(16) float real_VE_dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) float imag_VE_dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) float real_VL_dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) float imag_VL_dotProductVector[4]; - - _mm_storeu_ps((float*)real_VE_dotProductVector,real_VE_code_acc); // Store the results back into the dot product vector - _mm_storeu_ps((float*)imag_VE_dotProductVector,imag_VE_code_acc); // Store the results back into the dot product vector - _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector - _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector - _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector - _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector - _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector - 
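Unlike the second variant, this third variant never re-packs to 8 bits after correlation: it keeps the int16 _mm_maddubs_epi16 results for the real and imaginary parts in separate registers and widens them with _mm_cvtepi16_epi32. The scalar view of one widening round (illustrative helper, not kernel API):

    #include <stdint.h>

    /* one cvtepi16_epi32 + srli_si128 round adds lanes j and j+4 */
    static inline void reduce_8xi16_to_4xf_sketch(const int16_t v[8], float acc[4])
    {
        for (int j = 0; j < 4; ++j)
            acc[j] += (float)((int)v[j] + (int)v[j + 4]);
    }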
_mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector - _mm_storeu_ps((float*)real_VL_dotProductVector,real_VL_code_acc); // Store the results back into the dot product vector - _mm_storeu_ps((float*)imag_VL_dotProductVector,imag_VL_code_acc); // Store the results back into the dot product vector - - for (int i = 0; i<4; ++i) { - VE_out_real += real_VE_dotProductVector[i]; - VE_out_imag += imag_VE_dotProductVector[i]; - E_out_real += real_E_dotProductVector[i]; - E_out_imag += imag_E_dotProductVector[i]; - P_out_real += real_P_dotProductVector[i]; - P_out_imag += imag_P_dotProductVector[i]; - L_out_real += real_L_dotProductVector[i]; - L_out_imag += imag_L_dotProductVector[i]; - VL_out_real += real_VL_dotProductVector[i]; - VL_out_imag += imag_VL_dotProductVector[i]; + for(int number = 0;number < sse_iters; number++) + { + //Perform the carrier wipe-off + x = _mm_lddqu_si128((__m128i*)input_ptr); + y = _mm_lddqu_si128((__m128i*)carrier_ptr); + + x_abs = _mm_abs_epi8 (x); + + y_aux = _mm_sign_epi8 (y, x); + y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); + real_output = _mm_maddubs_epi16 (x_abs, y_aux); + + y_aux = _mm_shuffle_epi8 (y, rearrange_sequence); + y_aux = _mm_sign_epi8 (y_aux, x); + imag_output = _mm_maddubs_epi16 (x_abs, y_aux); + + imag_output = _mm_slli_si128 (imag_output, 1); + bb_signal_sample_aux = _mm_blendv_epi8 (imag_output, real_output, mult1); + bb_signal_sample_aux_abs = _mm_abs_epi8 (bb_signal_sample_aux); + + //Get very early values + y = _mm_lddqu_si128((__m128i*)VE_code_ptr); + + y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux); + y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); + real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); + + y_aux = _mm_shuffle_epi8 (y, rearrange_sequence); + y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux); + imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); + + real_output_i_1 = _mm_cvtepi16_epi32(real_output); + real_output = _mm_srli_si128 (real_output, 8); + real_output_i_2 = _mm_cvtepi16_epi32(real_output); + real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); + real_output_ps = _mm_cvtepi32_ps(real_output_i32); + + imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); + imag_output = _mm_srli_si128 (imag_output, 8); + imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); + imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2); + imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); + + real_VE_code_acc = _mm_add_ps (real_VE_code_acc, real_output_ps); + imag_VE_code_acc = _mm_add_ps (imag_VE_code_acc, imag_output_ps); + + //Get early values + y = _mm_lddqu_si128((__m128i*)E_code_ptr); + + y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux); + y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); + real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); + + y_aux = _mm_shuffle_epi8 (y, rearrange_sequence); + y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux); + imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); + + real_output_i_1 = _mm_cvtepi16_epi32(real_output); + real_output = _mm_srli_si128 (real_output, 8); + real_output_i_2 = _mm_cvtepi16_epi32(real_output); + real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); + real_output_ps = _mm_cvtepi32_ps(real_output_i32); + + imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); + imag_output = _mm_srli_si128 (imag_output, 8); + imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); + imag_output_i32 = _mm_add_epi32 
(imag_output_i_1, imag_output_i_2); + imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); + + real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps); + imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps); + + //Get prompt values + y = _mm_lddqu_si128((__m128i*)P_code_ptr); + + y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux); + y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); + real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); + + y_aux = _mm_shuffle_epi8 (y, rearrange_sequence); + y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux); + imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); + + real_output_i_1 = _mm_cvtepi16_epi32(real_output); + real_output = _mm_srli_si128 (real_output, 8); + real_output_i_2 = _mm_cvtepi16_epi32(real_output); + real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); + real_output_ps = _mm_cvtepi32_ps(real_output_i32); + + imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); + imag_output = _mm_srli_si128 (imag_output, 8); + imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); + imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2); + imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); + + real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps); + imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps); + + //Get late values + y = _mm_lddqu_si128((__m128i*)L_code_ptr); + + y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux); + y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); + real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); + + y_aux = _mm_shuffle_epi8 (y, rearrange_sequence); + y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux); + imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); + + real_output_i_1 = _mm_cvtepi16_epi32(real_output); + real_output = _mm_srli_si128 (real_output, 8); + real_output_i_2 = _mm_cvtepi16_epi32(real_output); + real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); + real_output_ps = _mm_cvtepi32_ps(real_output_i32); + + imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); + imag_output = _mm_srli_si128 (imag_output, 8); + imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); + imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2); + imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); + + real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps); + imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps); + + //Get very late values + y = _mm_lddqu_si128((__m128i*)VL_code_ptr); + + y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux); + y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); + real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); + + y_aux = _mm_shuffle_epi8 (y, _mm_set_epi8 (14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1)); + y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux); + imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); + + real_output_i_1 = _mm_cvtepi16_epi32(real_output); + real_output = _mm_srli_si128 (real_output, 8); + real_output_i_2 = _mm_cvtepi16_epi32(real_output); + real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); + real_output_ps = _mm_cvtepi32_ps(real_output_i32); + + imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); + imag_output = _mm_srli_si128 (imag_output, 8); + imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); + imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2); + imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); + + real_VL_code_acc = _mm_add_ps (real_VL_code_acc, 
real_output_ps); + imag_VL_code_acc = _mm_add_ps (imag_VL_code_acc, imag_output_ps); + + input_ptr += 8; + carrier_ptr += 8; + VE_code_ptr += 8; + E_code_ptr += 8; + P_code_ptr += 8; + L_code_ptr += 8; + VL_code_ptr += 8; + } + + __VOLK_ATTR_ALIGNED(16) float real_VE_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_VE_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_VL_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_VL_dotProductVector[4]; + + _mm_storeu_ps((float*)real_VE_dotProductVector,real_VE_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_VE_dotProductVector,imag_VE_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)real_VL_dotProductVector,real_VL_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_VL_dotProductVector,imag_VL_code_acc); // Store the results back into the dot product vector + + for (int i = 0; i<4; ++i) + { + VE_out_real += real_VE_dotProductVector[i]; + VE_out_imag += imag_VE_dotProductVector[i]; + E_out_real += real_E_dotProductVector[i]; + E_out_imag += imag_E_dotProductVector[i]; + P_out_real += real_P_dotProductVector[i]; + P_out_imag += imag_P_dotProductVector[i]; + L_out_real += real_L_dotProductVector[i]; + L_out_imag += imag_L_dotProductVector[i]; + VL_out_real += real_VL_dotProductVector[i]; + VL_out_imag += imag_VL_dotProductVector[i]; + } + *VE_out_ptr = lv_cmake(VE_out_real, VE_out_imag); + *E_out_ptr = lv_cmake(E_out_real, E_out_imag); + *P_out_ptr = lv_cmake(P_out_real, P_out_imag); + *L_out_ptr = lv_cmake(L_out_real, L_out_imag); + *VL_out_ptr = lv_cmake(VL_out_real, VL_out_imag); } - *VE_out_ptr = lv_cmake(VE_out_real, VE_out_imag); - *E_out_ptr = lv_cmake(E_out_real, E_out_imag); - *P_out_ptr = lv_cmake(P_out_real, P_out_imag); - *L_out_ptr = lv_cmake(L_out_real, L_out_imag); - *VL_out_ptr = lv_cmake(VL_out_real, VL_out_imag); - } lv_16sc_t bb_signal_sample; for(int i=0; i < num_points%8; ++i) - { - //Perform the carrier wipe-off - bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); - // Now get very early, early, prompt, late and very late values for each - *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VE_code_ptr++)); - *E_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*E_code_ptr++)); - *P_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*P_code_ptr++)); - *L_out_ptr += (lv_32fc_t) (bb_signal_sample * 
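Because real and imaginary parts travel in separate accumulators in this variant, each complex output is assembled only once per call, after the four-lane horizontal sums, e.g. for the prompt tap:

    float re = 0.0f, im = 0.0f;
    for (int i = 0; i < 4; ++i)
    {
        re += real_P_dotProductVector[i];  /* aligned spill buffers, as above */
        im += imag_P_dotProductVector[i];
    }
    *P_out_ptr = lv_cmake(re, im);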
((lv_16sc_t)*L_code_ptr++)); - *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VL_code_ptr++)); - } + { + //Perform the carrier wipe-off + bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); + // Now get very early, early, prompt, late and very late values for each + *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VE_code_ptr++)); + *E_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*E_code_ptr++)); + *P_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*P_code_ptr++)); + *L_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*L_code_ptr++)); + *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VL_code_ptr++)); + } } #endif /* LV_HAVE_SSE4_1 */ @@ -959,22 +959,22 @@ static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5_u_sse4_1_third( static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5_u_sse4_1_fourth(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points) { const unsigned int sse_iters = num_points / 8; - + __m128i x, x_abs, y, y_aux, bb_signal_sample_aux, bb_signal_sample_aux_abs;; __m128i real_output, imag_output; __m128 real_VE_code_acc, imag_VE_code_acc, real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc, real_VL_code_acc, imag_VL_code_acc; __m128i real_output_i_1, real_output_i_2, imag_output_i_1, imag_output_i_2, real_output_i32, imag_output_i32; __m128 real_output_ps, imag_output_ps; __m128i minus128control; - + __m128i minus128 = _mm_set1_epi8 (-128); __m128i check_sign_sequence = _mm_set_epi8 (255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1); __m128i rearrange_sequence = _mm_set_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1); __m128i mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); - + const lv_8sc_t* input_ptr = input; const lv_8sc_t* carrier_ptr = carrier; - + const lv_8sc_t* VE_code_ptr = VE_code; lv_32fc_t* VE_out_ptr = VE_out; const lv_8sc_t* E_code_ptr = E_code; @@ -985,7 +985,7 @@ static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5_u_sse4_1_fourth lv_32fc_t* L_out_ptr = L_out; const lv_8sc_t* VL_code_ptr = VL_code; lv_32fc_t* VL_out_ptr = VL_out; - + float VE_out_real = 0; float VE_out_imag = 0; float E_out_real = 0; @@ -996,7 +996,7 @@ static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5_u_sse4_1_fourth float L_out_imag = 0; float VL_out_real = 0; float VL_out_imag = 0; - + real_VE_code_acc = _mm_setzero_ps(); imag_VE_code_acc = _mm_setzero_ps(); real_E_code_acc = _mm_setzero_ps(); @@ -1007,233 +1007,233 @@ static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5_u_sse4_1_fourth imag_L_code_acc = _mm_setzero_ps(); real_VL_code_acc = _mm_setzero_ps(); imag_VL_code_acc = _mm_setzero_ps(); - - if (sse_iters>0) - { - for(int number = 0;number < sse_iters; number++){ - - //Perform the carrier wipe-off - x = _mm_lddqu_si128((__m128i*)input_ptr); - y = _mm_lddqu_si128((__m128i*)carrier_ptr); - - x_abs = _mm_abs_epi8 (x); - - y_aux = _mm_sign_epi8 (y, x); - y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); - real_output = _mm_maddubs_epi16 (x_abs, y_aux); - - y_aux = _mm_shuffle_epi8 (y, rearrange_sequence); - y_aux = _mm_sign_epi8 (y_aux, x); - imag_output = _mm_maddubs_epi16 (x_abs, y_aux); - - imag_output = _mm_slli_si128 
(imag_output, 1); - bb_signal_sample_aux = _mm_blendv_epi8 (imag_output, real_output, mult1); - bb_signal_sample_aux_abs = _mm_abs_epi8 (bb_signal_sample_aux); - - //Get very early values - y = _mm_lddqu_si128((__m128i*)VE_code_ptr); - minus128control = _mm_cmpeq_epi8 (y, minus128); - y = _mm_sub_epi8 (y, minus128control); - - y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux); - y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); - real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); - - y_aux = _mm_shuffle_epi8 (y, rearrange_sequence); - y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux); - imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); - - real_output_i_1 = _mm_cvtepi16_epi32(real_output); - real_output = _mm_srli_si128 (real_output, 8); - real_output_i_2 = _mm_cvtepi16_epi32(real_output); - real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); - real_output_ps = _mm_cvtepi32_ps(real_output_i32); - - imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); - imag_output = _mm_srli_si128 (imag_output, 8); - imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); - imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2); - imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); - - real_VE_code_acc = _mm_add_ps (real_VE_code_acc, real_output_ps); - imag_VE_code_acc = _mm_add_ps (imag_VE_code_acc, imag_output_ps); - - //Get early values - y = _mm_lddqu_si128((__m128i*)E_code_ptr); - minus128control = _mm_cmpeq_epi8 (y, minus128); - y = _mm_sub_epi8 (y, minus128control); - - y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux); - y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); - real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); - - y_aux = _mm_shuffle_epi8 (y, rearrange_sequence); - y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux); - imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); - - real_output_i_1 = _mm_cvtepi16_epi32(real_output); - real_output = _mm_srli_si128 (real_output, 8); - real_output_i_2 = _mm_cvtepi16_epi32(real_output); - real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); - real_output_ps = _mm_cvtepi32_ps(real_output_i32); - - imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); - imag_output = _mm_srli_si128 (imag_output, 8); - imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); - imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2); - imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); - - real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps); - imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps); - - //Get prompt values - y = _mm_lddqu_si128((__m128i*)P_code_ptr); - minus128control = _mm_cmpeq_epi8 (y, minus128); - y = _mm_sub_epi8 (y, minus128control); - - y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux); - y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); - real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); - - y_aux = _mm_shuffle_epi8 (y, rearrange_sequence); - y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux); - imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); - - real_output_i_1 = _mm_cvtepi16_epi32(real_output); - real_output = _mm_srli_si128 (real_output, 8); - real_output_i_2 = _mm_cvtepi16_epi32(real_output); - real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); - real_output_ps = _mm_cvtepi32_ps(real_output_i32); - - imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); - imag_output = _mm_srli_si128 (imag_output, 8); - imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); - imag_output_i32 = 
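The minus128/minus128control pair introduced above is the safety guard that gives the "safe" and fourth variants their name: _mm_abs_epi8 and _mm_sign_epi8 leave -128 unchanged, since its absolute value does not fit in int8, so every -128 code byte is bumped to -127 before use. Per lane this amounts to:

    #include <stdint.h>

    static inline int8_t clip_minus128_sketch(int8_t v)
    {
        /* vector form: _mm_cmpeq_epi8(v, -128) yields -1 in matching lanes,
           and _mm_sub_epi8(v, mask) then adds 1 exactly there */
        return (v == -128) ? (int8_t)(-127) : v;
    }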
_mm_add_epi32 (imag_output_i_1, imag_output_i_2); - imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); - - real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps); - imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps); - - //Get late values - y = _mm_lddqu_si128((__m128i*)L_code_ptr); - minus128control = _mm_cmpeq_epi8 (y, minus128); - y = _mm_sub_epi8 (y, minus128control); - - y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux); - y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); - real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); - - y_aux = _mm_shuffle_epi8 (y, rearrange_sequence); - y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux); - imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); - - real_output_i_1 = _mm_cvtepi16_epi32(real_output); - real_output = _mm_srli_si128 (real_output, 8); - real_output_i_2 = _mm_cvtepi16_epi32(real_output); - real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); - real_output_ps = _mm_cvtepi32_ps(real_output_i32); - - imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); - imag_output = _mm_srli_si128 (imag_output, 8); - imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); - imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2); - imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); - - real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps); - imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps); - - //Get very late values - y = _mm_lddqu_si128((__m128i*)VL_code_ptr); - minus128control = _mm_cmpeq_epi8 (y, minus128); - y = _mm_sub_epi8 (y, minus128control); - - y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux); - y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); - real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); - - y_aux = _mm_shuffle_epi8 (y, _mm_set_epi8 (14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1)); - y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux); - imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); - - real_output_i_1 = _mm_cvtepi16_epi32(real_output); - real_output = _mm_srli_si128 (real_output, 8); - real_output_i_2 = _mm_cvtepi16_epi32(real_output); - real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); - real_output_ps = _mm_cvtepi32_ps(real_output_i32); - - imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); - imag_output = _mm_srli_si128 (imag_output, 8); - imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); - imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2); - imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); - - real_VL_code_acc = _mm_add_ps (real_VL_code_acc, real_output_ps); - imag_VL_code_acc = _mm_add_ps (imag_VL_code_acc, imag_output_ps); - - input_ptr += 8; - carrier_ptr += 8; - VE_code_ptr += 8; - E_code_ptr += 8; - P_code_ptr += 8; - L_code_ptr += 8; - VL_code_ptr += 8; - } - - __VOLK_ATTR_ALIGNED(16) float real_VE_dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) float imag_VE_dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) float real_VL_dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) float imag_VL_dotProductVector[4]; - - _mm_storeu_ps((float*)real_VE_dotProductVector,real_VE_code_acc); // Store the results back 
into the dot product vector - _mm_storeu_ps((float*)imag_VE_dotProductVector,imag_VE_code_acc); // Store the results back into the dot product vector - _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector - _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector - _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector - _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector - _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector - _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector - _mm_storeu_ps((float*)real_VL_dotProductVector,real_VL_code_acc); // Store the results back into the dot product vector - _mm_storeu_ps((float*)imag_VL_dotProductVector,imag_VL_code_acc); // Store the results back into the dot product vector - - for (int i = 0; i<4; ++i) + if (sse_iters>0) { - VE_out_real += real_VE_dotProductVector[i]; - VE_out_imag += imag_VE_dotProductVector[i]; - E_out_real += real_E_dotProductVector[i]; - E_out_imag += imag_E_dotProductVector[i]; - P_out_real += real_P_dotProductVector[i]; - P_out_imag += imag_P_dotProductVector[i]; - L_out_real += real_L_dotProductVector[i]; - L_out_imag += imag_L_dotProductVector[i]; - VL_out_real += real_VL_dotProductVector[i]; - VL_out_imag += imag_VL_dotProductVector[i]; + for(int number = 0;number < sse_iters; number++) + { + //Perform the carrier wipe-off + x = _mm_lddqu_si128((__m128i*)input_ptr); + y = _mm_lddqu_si128((__m128i*)carrier_ptr); + + x_abs = _mm_abs_epi8 (x); + + y_aux = _mm_sign_epi8 (y, x); + y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); + real_output = _mm_maddubs_epi16 (x_abs, y_aux); + + y_aux = _mm_shuffle_epi8 (y, rearrange_sequence); + y_aux = _mm_sign_epi8 (y_aux, x); + imag_output = _mm_maddubs_epi16 (x_abs, y_aux); + + imag_output = _mm_slli_si128 (imag_output, 1); + bb_signal_sample_aux = _mm_blendv_epi8 (imag_output, real_output, mult1); + bb_signal_sample_aux_abs = _mm_abs_epi8 (bb_signal_sample_aux); + + //Get very early values + y = _mm_lddqu_si128((__m128i*)VE_code_ptr); + minus128control = _mm_cmpeq_epi8 (y, minus128); + y = _mm_sub_epi8 (y, minus128control); + + y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux); + y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); + real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); + + y_aux = _mm_shuffle_epi8 (y, rearrange_sequence); + y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux); + imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); + + real_output_i_1 = _mm_cvtepi16_epi32(real_output); + real_output = _mm_srli_si128 (real_output, 8); + real_output_i_2 = _mm_cvtepi16_epi32(real_output); + real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); + real_output_ps = _mm_cvtepi32_ps(real_output_i32); + + imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); + imag_output = _mm_srli_si128 (imag_output, 8); + imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); + imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2); + imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); + + real_VE_code_acc = _mm_add_ps (real_VE_code_acc, real_output_ps); + imag_VE_code_acc = _mm_add_ps (imag_VE_code_acc, imag_output_ps); + + //Get early values + y = 
_mm_lddqu_si128((__m128i*)E_code_ptr); + minus128control = _mm_cmpeq_epi8 (y, minus128); + y = _mm_sub_epi8 (y, minus128control); + + y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux); + y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); + real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); + + y_aux = _mm_shuffle_epi8 (y, rearrange_sequence); + y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux); + imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); + + real_output_i_1 = _mm_cvtepi16_epi32(real_output); + real_output = _mm_srli_si128 (real_output, 8); + real_output_i_2 = _mm_cvtepi16_epi32(real_output); + real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); + real_output_ps = _mm_cvtepi32_ps(real_output_i32); + + imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); + imag_output = _mm_srli_si128 (imag_output, 8); + imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); + imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2); + imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); + + real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps); + imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps); + + //Get prompt values + y = _mm_lddqu_si128((__m128i*)P_code_ptr); + minus128control = _mm_cmpeq_epi8 (y, minus128); + y = _mm_sub_epi8 (y, minus128control); + + y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux); + y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); + real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); + + y_aux = _mm_shuffle_epi8 (y, rearrange_sequence); + y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux); + imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); + + real_output_i_1 = _mm_cvtepi16_epi32(real_output); + real_output = _mm_srli_si128 (real_output, 8); + real_output_i_2 = _mm_cvtepi16_epi32(real_output); + real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); + real_output_ps = _mm_cvtepi32_ps(real_output_i32); + + imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); + imag_output = _mm_srli_si128 (imag_output, 8); + imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); + imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2); + imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); + + real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps); + imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps); + + //Get late values + y = _mm_lddqu_si128((__m128i*)L_code_ptr); + minus128control = _mm_cmpeq_epi8 (y, minus128); + y = _mm_sub_epi8 (y, minus128control); + + y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux); + y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); + real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); + + y_aux = _mm_shuffle_epi8 (y, rearrange_sequence); + y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux); + imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); + + real_output_i_1 = _mm_cvtepi16_epi32(real_output); + real_output = _mm_srli_si128 (real_output, 8); + real_output_i_2 = _mm_cvtepi16_epi32(real_output); + real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); + real_output_ps = _mm_cvtepi32_ps(real_output_i32); + + imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); + imag_output = _mm_srli_si128 (imag_output, 8); + imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); + imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2); + imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); + + real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps); + imag_L_code_acc = 
_mm_add_ps (imag_L_code_acc, imag_output_ps); + + //Get very late values + y = _mm_lddqu_si128((__m128i*)VL_code_ptr); + minus128control = _mm_cmpeq_epi8 (y, minus128); + y = _mm_sub_epi8 (y, minus128control); + + + y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux); + y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); + real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); + + y_aux = _mm_shuffle_epi8 (y, _mm_set_epi8 (14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1)); + y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux); + imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); + + real_output_i_1 = _mm_cvtepi16_epi32(real_output); + real_output = _mm_srli_si128 (real_output, 8); + real_output_i_2 = _mm_cvtepi16_epi32(real_output); + real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); + real_output_ps = _mm_cvtepi32_ps(real_output_i32); + + imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); + imag_output = _mm_srli_si128 (imag_output, 8); + imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); + imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2); + imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); + + real_VL_code_acc = _mm_add_ps (real_VL_code_acc, real_output_ps); + imag_VL_code_acc = _mm_add_ps (imag_VL_code_acc, imag_output_ps); + + input_ptr += 8; + carrier_ptr += 8; + VE_code_ptr += 8; + E_code_ptr += 8; + P_code_ptr += 8; + L_code_ptr += 8; + VL_code_ptr += 8; + } + + __VOLK_ATTR_ALIGNED(16) float real_VE_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_VE_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_VL_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_VL_dotProductVector[4]; + + _mm_storeu_ps((float*)real_VE_dotProductVector,real_VE_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_VE_dotProductVector,imag_VE_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)real_VL_dotProductVector,real_VL_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_VL_dotProductVector,imag_VL_code_acc); // Store the results back into the dot product vector + + for (int i = 0; i<4; ++i) + { + VE_out_real += real_VE_dotProductVector[i]; + VE_out_imag += imag_VE_dotProductVector[i]; + E_out_real += real_E_dotProductVector[i]; + E_out_imag += imag_E_dotProductVector[i]; + P_out_real += real_P_dotProductVector[i]; + P_out_imag += 
imag_P_dotProductVector[i]; + L_out_real += real_L_dotProductVector[i]; + L_out_imag += imag_L_dotProductVector[i]; + VL_out_real += real_VL_dotProductVector[i]; + VL_out_imag += imag_VL_dotProductVector[i]; + } + *VE_out_ptr = lv_cmake(VE_out_real, VE_out_imag); + *E_out_ptr = lv_cmake(E_out_real, E_out_imag); + *P_out_ptr = lv_cmake(P_out_real, P_out_imag); + *L_out_ptr = lv_cmake(L_out_real, L_out_imag); + *VL_out_ptr = lv_cmake(VL_out_real, VL_out_imag); } - *VE_out_ptr = lv_cmake(VE_out_real, VE_out_imag); - *E_out_ptr = lv_cmake(E_out_real, E_out_imag); - *P_out_ptr = lv_cmake(P_out_real, P_out_imag); - *L_out_ptr = lv_cmake(L_out_real, L_out_imag); - *VL_out_ptr = lv_cmake(VL_out_real, VL_out_imag); - } - + lv_16sc_t bb_signal_sample; for(int i=0; i < num_points%8; ++i) - { - //Perform the carrier wipe-off - bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); - // Now get very early, early, prompt, late and very late values for each - *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VE_code_ptr++)); - *E_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*E_code_ptr++)); - *P_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*P_code_ptr++)); - *L_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*L_code_ptr++)); - *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VL_code_ptr++)); - } + { + //Perform the carrier wipe-off + bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); + // Now get very early, early, prompt, late and very late values for each + *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VE_code_ptr++)); + *E_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*E_code_ptr++)); + *P_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*P_code_ptr++)); + *L_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*L_code_ptr++)); + *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VL_code_ptr++)); + } } #endif /* LV_HAVE_SSE4_1 */ @@ -1264,76 +1264,76 @@ static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5_generic(lv_32fc *L_out = 0; *VL_out = 0; - + lv_16sc_t VE_code_value; lv_16sc_t E_code_value; lv_16sc_t P_code_value; lv_16sc_t L_code_value; lv_16sc_t VL_code_value; lv_16sc_t bb_signal_sample; - + for(unsigned int i=0; i < num_points; ++i) - { - VE_code_value = VE_code[i]; - E_code_value = E_code[i]; - P_code_value = P_code[i]; - L_code_value = L_code[i]; - VL_code_value = VL_code[i]; - - if(lv_creal(VE_code_value) == -128) { - VE_code_value = lv_cmake(-127, lv_cimag(VE_code_value)); + VE_code_value = VE_code[i]; + E_code_value = E_code[i]; + P_code_value = P_code[i]; + L_code_value = L_code[i]; + VL_code_value = VL_code[i]; + + if(lv_creal(VE_code_value) == -128) + { + VE_code_value = lv_cmake(-127, lv_cimag(VE_code_value)); + } + if(lv_cimag(VE_code_value) == -128) + { + VE_code_value = lv_cmake(lv_creal(VE_code_value), -127); + } + + if(lv_creal(E_code_value) == -128) + { + E_code_value = lv_cmake(-127, lv_cimag(E_code_value)); + } + if(lv_cimag(E_code_value) == -128) + { + E_code_value = lv_cmake(lv_creal(E_code_value), -127); + } + + if(lv_creal(P_code_value) == -128) + { + P_code_value = lv_cmake(-127, lv_cimag(P_code_value)); + } + if(lv_cimag(P_code_value) == -128) + { + P_code_value = lv_cmake(lv_creal(P_code_value), -127); + } + + if(lv_creal(L_code_value) == -128) + { + L_code_value = lv_cmake(-127, lv_cimag(L_code_value)); + } + if(lv_cimag(L_code_value) == -128) + { + L_code_value = lv_cmake(lv_creal(L_code_value), -127); + } + + if(lv_creal(VL_code_value) == -128) + { + VL_code_value 
= lv_cmake(-127, lv_cimag(VL_code_value)); + } + if(lv_cimag(VL_code_value) == -128) + { + VL_code_value = lv_cmake(lv_creal(VL_code_value), -127); + } + + //Perform the carrier wipe-off + bb_signal_sample = input[i] * carrier[i]; + // Now get very early, early, prompt, late and very late values for each + *VE_out += (lv_32fc_t) (bb_signal_sample * VE_code_value); + *E_out += (lv_32fc_t) (bb_signal_sample * E_code_value); + *P_out += (lv_32fc_t) (bb_signal_sample * P_code_value); + *L_out += (lv_32fc_t) (bb_signal_sample * L_code_value); + *VL_out += (lv_32fc_t) (bb_signal_sample * VL_code_value); } - if(lv_cimag(VE_code_value) == -128) - { - VE_code_value = lv_cmake(lv_creal(VE_code_value), -127); - } - - if(lv_creal(E_code_value) == -128) - { - E_code_value = lv_cmake(-127, lv_cimag(E_code_value)); - } - if(lv_cimag(E_code_value) == -128) - { - E_code_value = lv_cmake(lv_creal(E_code_value), -127); - } - - if(lv_creal(P_code_value) == -128) - { - P_code_value = lv_cmake(-127, lv_cimag(P_code_value)); - } - if(lv_cimag(P_code_value) == -128) - { - P_code_value = lv_cmake(lv_creal(P_code_value), -127); - } - - if(lv_creal(L_code_value) == -128) - { - L_code_value = lv_cmake(-127, lv_cimag(L_code_value)); - } - if(lv_cimag(L_code_value) == -128) - { - L_code_value = lv_cmake(lv_creal(L_code_value), -127); - } - - if(lv_creal(VL_code_value) == -128) - { - VL_code_value = lv_cmake(-127, lv_cimag(VL_code_value)); - } - if(lv_cimag(VL_code_value) == -128) - { - VL_code_value = lv_cmake(lv_creal(VL_code_value), -127); - } - - //Perform the carrier wipe-off - bb_signal_sample = input[i] * carrier[i]; - // Now get very early, early, prompt, late and very late values for each - *VE_out += (lv_32fc_t) (bb_signal_sample * VE_code_value); - *E_out += (lv_32fc_t) (bb_signal_sample * E_code_value); - *P_out += (lv_32fc_t) (bb_signal_sample * P_code_value); - *L_out += (lv_32fc_t) (bb_signal_sample * L_code_value); - *VL_out += (lv_32fc_t) (bb_signal_sample * VL_code_value); - } } #endif /* LV_HAVE_GENERIC */ diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5.h index ada8cb735..6097b874f 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5.h @@ -2,7 +2,7 @@ * \file volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5.h * \brief Volk protokernel: performs the carrier wipe-off mixing and the Very early, Early, Prompt, Late and very late correlation with 16 bits vectors, and accumulates the results into float32. This protokernel is called "safe" because it checks when the inputs have a -128 value, and replaces it with a -127 value. By doing this it avoids malfunctioning, but it takes more time than the "unsafe" implementation. In order to avoid overflow, "input" and "carrier" must be values between -7 and 7 and "XX_code inputs" must be values between -127 and 127. * \authors
- *   • Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+ *   • Andres Cecilia, 2014. a.cecilia.luque(at)gmail.com
 *
* * Volk protokernel that performs the carrier wipe-off mixing and the @@ -47,7 +47,7 @@ * GNSS-SDR is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or - * at your option) any later version. + * (at your option) any later version. * * GNSS-SDR is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of @@ -92,22 +92,22 @@ static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_u_sse4_1(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points) { const unsigned int sse_iters = num_points / 8; - + __m128i x, x_abs, y, y_aux, bb_signal_sample_aux, bb_signal_sample_aux_abs;; __m128i real_output, imag_output; __m128 real_VE_code_acc, imag_VE_code_acc, real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc, real_VL_code_acc, imag_VL_code_acc; __m128i input_i_1, input_i_2, output_i32; __m128 real_output_ps, imag_output_ps; __m128i minus128control; - + __m128i minus128 = _mm_set1_epi8 (-128); __m128i check_sign_sequence = _mm_set_epi8 (255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1); __m128i rearrange_sequence = _mm_set_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1); __m128i mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); - + const lv_8sc_t* input_ptr = input; const lv_8sc_t* carrier_ptr = carrier; - + const lv_8sc_t* VE_code_ptr = VE_code; lv_32fc_t* VE_out_ptr = VE_out; const lv_8sc_t* E_code_ptr = E_code; @@ -118,7 +118,7 @@ static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_u_sse4_1(lv_32f lv_32fc_t* L_out_ptr = L_out; const lv_8sc_t* VL_code_ptr = VL_code; lv_32fc_t* VL_out_ptr = VL_out; - + float VE_out_real = 0; float VE_out_imag = 0; float E_out_real = 0; @@ -129,7 +129,7 @@ static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_u_sse4_1(lv_32f float L_out_imag = 0; float VL_out_real = 0; float VL_out_imag = 0; - + real_VE_code_acc = _mm_setzero_ps(); imag_VE_code_acc = _mm_setzero_ps(); real_E_code_acc = _mm_setzero_ps(); @@ -140,177 +140,177 @@ static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_u_sse4_1(lv_32f imag_L_code_acc = _mm_setzero_ps(); real_VL_code_acc = _mm_setzero_ps(); imag_VL_code_acc = _mm_setzero_ps(); - + if (sse_iters>0) - { - for(int number = 0;number < sse_iters; number++){ - - //Perform the carrier wipe-off - x = _mm_lddqu_si128((__m128i*)input_ptr); - y = _mm_lddqu_si128((__m128i*)carrier_ptr); - - x_abs = _mm_abs_epi8 (x); - - CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3(y, x, check_sign_sequence, rearrange_sequence, y_aux, x_abs, real_output, imag_output) - - imag_output = _mm_slli_si128 (imag_output, 1); - bb_signal_sample_aux = _mm_blendv_epi8 (imag_output, real_output, mult1); - bb_signal_sample_aux_abs = _mm_abs_epi8 (bb_signal_sample_aux); - - //Get very early values - y = _mm_lddqu_si128((__m128i*)VE_code_ptr); - - CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) - - 
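/* Illustrative sketch, not part of the patch: each CM_*_CW_CORR_* macro
 * above leaves two __m128 registers (real_output_ps, imag_output_ps)
 * holding four float partial sums apiece; they are added into the
 * *_code_acc accumulators on every iteration and reduced to scalars after
 * the loop with a store-then-add, conceptually equivalent to: */
#include <smmintrin.h>  /* SSE4.1 intrinsics, as assumed by this code path */

static inline float hsum_ps_sse(__m128 acc)
{
    float v[4] __attribute__((aligned(16)));  /* stand-in for __VOLK_ATTR_ALIGNED(16) */
    _mm_store_ps(v, acc);                     /* spill the four float lanes */
    return v[0] + v[1] + v[2] + v[3];         /* horizontal add to one scalar */
}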
real_VE_code_acc = _mm_add_ps (real_VE_code_acc, real_output_ps); - imag_VE_code_acc = _mm_add_ps (imag_VE_code_acc, imag_output_ps); - - //Get early values - y = _mm_lddqu_si128((__m128i*)E_code_ptr); - - CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) - - real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps); - imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps); - - //Get prompt values - y = _mm_lddqu_si128((__m128i*)P_code_ptr); - - CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) - - real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps); - imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps); - - //Get late values - y = _mm_lddqu_si128((__m128i*)L_code_ptr); - - CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) - - real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps); - imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps); - - //Get very late values - y = _mm_lddqu_si128((__m128i*)VL_code_ptr); - - CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) - - real_VL_code_acc = _mm_add_ps (real_VL_code_acc, real_output_ps); - imag_VL_code_acc = _mm_add_ps (imag_VL_code_acc, imag_output_ps); - - input_ptr += 8; - carrier_ptr += 8; - VE_code_ptr += 8; - E_code_ptr += 8; - P_code_ptr += 8; - L_code_ptr += 8; - VL_code_ptr += 8; - } - - __VOLK_ATTR_ALIGNED(16) float real_VE_dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) float imag_VE_dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) float real_VL_dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) float imag_VL_dotProductVector[4]; - - _mm_storeu_ps((float*)real_VE_dotProductVector,real_VE_code_acc); // Store the results back into the dot product vector - _mm_storeu_ps((float*)imag_VE_dotProductVector,imag_VE_code_acc); // Store the results back into the dot product vector - _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector - _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector - _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector - _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector - _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the 
results back into the dot product vector - _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector - _mm_storeu_ps((float*)real_VL_dotProductVector,real_VL_code_acc); // Store the results back into the dot product vector - _mm_storeu_ps((float*)imag_VL_dotProductVector,imag_VL_code_acc); // Store the results back into the dot product vector - - for (int i = 0; i<4; ++i) { - VE_out_real += real_VE_dotProductVector[i]; - VE_out_imag += imag_VE_dotProductVector[i]; - E_out_real += real_E_dotProductVector[i]; - E_out_imag += imag_E_dotProductVector[i]; - P_out_real += real_P_dotProductVector[i]; - P_out_imag += imag_P_dotProductVector[i]; - L_out_real += real_L_dotProductVector[i]; - L_out_imag += imag_L_dotProductVector[i]; - VL_out_real += real_VL_dotProductVector[i]; - VL_out_imag += imag_VL_dotProductVector[i]; + for(int number = 0;number < sse_iters; number++) + { + //Perform the carrier wipe-off + x = _mm_lddqu_si128((__m128i*)input_ptr); + y = _mm_lddqu_si128((__m128i*)carrier_ptr); + + x_abs = _mm_abs_epi8 (x); + + CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3(y, x, check_sign_sequence, rearrange_sequence, y_aux, x_abs, real_output, imag_output) + + imag_output = _mm_slli_si128 (imag_output, 1); + bb_signal_sample_aux = _mm_blendv_epi8 (imag_output, real_output, mult1); + bb_signal_sample_aux_abs = _mm_abs_epi8 (bb_signal_sample_aux); + + //Get very early values + y = _mm_lddqu_si128((__m128i*)VE_code_ptr); + + CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) + + real_VE_code_acc = _mm_add_ps (real_VE_code_acc, real_output_ps); + imag_VE_code_acc = _mm_add_ps (imag_VE_code_acc, imag_output_ps); + + //Get early values + y = _mm_lddqu_si128((__m128i*)E_code_ptr); + + CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) + + real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps); + imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps); + + //Get prompt values + y = _mm_lddqu_si128((__m128i*)P_code_ptr); + + CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) + + real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps); + imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps); + + //Get late values + y = _mm_lddqu_si128((__m128i*)L_code_ptr); + + CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) + + real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps); + imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps); + + //Get very late values + y = _mm_lddqu_si128((__m128i*)VL_code_ptr); + + CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, 
input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) + + real_VL_code_acc = _mm_add_ps (real_VL_code_acc, real_output_ps); + imag_VL_code_acc = _mm_add_ps (imag_VL_code_acc, imag_output_ps); + + input_ptr += 8; + carrier_ptr += 8; + VE_code_ptr += 8; + E_code_ptr += 8; + P_code_ptr += 8; + L_code_ptr += 8; + VL_code_ptr += 8; + } + + __VOLK_ATTR_ALIGNED(16) float real_VE_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_VE_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_VL_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_VL_dotProductVector[4]; + + _mm_storeu_ps((float*)real_VE_dotProductVector,real_VE_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_VE_dotProductVector,imag_VE_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)real_VL_dotProductVector,real_VL_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_VL_dotProductVector,imag_VL_code_acc); // Store the results back into the dot product vector + + for (int i = 0; i<4; ++i) + { + VE_out_real += real_VE_dotProductVector[i]; + VE_out_imag += imag_VE_dotProductVector[i]; + E_out_real += real_E_dotProductVector[i]; + E_out_imag += imag_E_dotProductVector[i]; + P_out_real += real_P_dotProductVector[i]; + P_out_imag += imag_P_dotProductVector[i]; + L_out_real += real_L_dotProductVector[i]; + L_out_imag += imag_L_dotProductVector[i]; + VL_out_real += real_VL_dotProductVector[i]; + VL_out_imag += imag_VL_dotProductVector[i]; + } + *VE_out_ptr = lv_cmake(VE_out_real, VE_out_imag); + *E_out_ptr = lv_cmake(E_out_real, E_out_imag); + *P_out_ptr = lv_cmake(P_out_real, P_out_imag); + *L_out_ptr = lv_cmake(L_out_real, L_out_imag); + *VL_out_ptr = lv_cmake(VL_out_real, VL_out_imag); } - *VE_out_ptr = lv_cmake(VE_out_real, VE_out_imag); - *E_out_ptr = lv_cmake(E_out_real, E_out_imag); - *P_out_ptr = lv_cmake(P_out_real, P_out_imag); - *L_out_ptr = lv_cmake(L_out_real, L_out_imag); - *VL_out_ptr = lv_cmake(VL_out_real, VL_out_imag); - } - + if(num_points%8!=0) - { - lv_16sc_t bb_signal_sample; - lv_16sc_t VE_code_value; - lv_16sc_t E_code_value; - lv_16sc_t P_code_value; - lv_16sc_t L_code_value; - lv_16sc_t VL_code_value; - - for(int i=0; i < num_points%8; ++i) { - VE_code_value = *VE_code_ptr++; - E_code_value = *E_code_ptr++; - P_code_value = *P_code_ptr++; - L_code_value = *L_code_ptr++; - VL_code_value = *VL_code_ptr++; - - 
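/* Illustrative sketch, not part of the patch: the clamping below exists
 * because, in two's-complement int8, -128 has no positive counterpart;
 * negation and abs() wrap back to -128, so the _mm_abs_epi8-based sign
 * tricks used in the SSE path break on that single value. A minimal
 * scalar form of the guard, assuming int8_t components: */
#include <stdint.h>

static inline int8_t clamp_int8_min(int8_t v)
{
    return (v == INT8_MIN) ? -127 : v;  /* map -128, the lone unpaired value, to -127 */
}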
if(lv_creal(VE_code_value) == -128) - { - VE_code_value = lv_cmake(-127, lv_cimag(VE_code_value)); - } - if(lv_cimag(VE_code_value) == -128) - { - VE_code_value = lv_cmake(lv_creal(VE_code_value), -127); - } - - if(lv_creal(E_code_value) == -128) - { - E_code_value = lv_cmake(-127, lv_cimag(E_code_value)); - } - if(lv_cimag(E_code_value) == -128) - { - E_code_value = lv_cmake(lv_creal(E_code_value), -127); - } - - if(lv_creal(P_code_value) == -128) - { - P_code_value = lv_cmake(-127, lv_cimag(P_code_value)); - } - if(lv_cimag(P_code_value) == -128) - { - P_code_value = lv_cmake(lv_creal(P_code_value), -127); - } - - if(lv_creal(L_code_value) == -128) - { - L_code_value = lv_cmake(-127, lv_cimag(L_code_value)); - } - if(lv_cimag(L_code_value) == -128) - { - L_code_value = lv_cmake(lv_creal(L_code_value), -127); - } - - //Perform the carrier wipe-off - bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); - // Now get very early, early, prompt, late and very late values for each - *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * VE_code_value); - *E_out_ptr += (lv_32fc_t) (bb_signal_sample * E_code_value); - *P_out_ptr += (lv_32fc_t) (bb_signal_sample * P_code_value); - *L_out_ptr += (lv_32fc_t) (bb_signal_sample * L_code_value); - *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * VL_code_value); + lv_16sc_t bb_signal_sample; + lv_16sc_t VE_code_value; + lv_16sc_t E_code_value; + lv_16sc_t P_code_value; + lv_16sc_t L_code_value; + lv_16sc_t VL_code_value; + + for(int i=0; i < num_points%8; ++i) + { + VE_code_value = *VE_code_ptr++; + E_code_value = *E_code_ptr++; + P_code_value = *P_code_ptr++; + L_code_value = *L_code_ptr++; + VL_code_value = *VL_code_ptr++; + + if(lv_creal(VE_code_value) == -128) + { + VE_code_value = lv_cmake(-127, lv_cimag(VE_code_value)); + } + if(lv_cimag(VE_code_value) == -128) + { + VE_code_value = lv_cmake(lv_creal(VE_code_value), -127); + } + + if(lv_creal(E_code_value) == -128) + { + E_code_value = lv_cmake(-127, lv_cimag(E_code_value)); + } + if(lv_cimag(E_code_value) == -128) + { + E_code_value = lv_cmake(lv_creal(E_code_value), -127); + } + + if(lv_creal(P_code_value) == -128) + { + P_code_value = lv_cmake(-127, lv_cimag(P_code_value)); + } + if(lv_cimag(P_code_value) == -128) + { + P_code_value = lv_cmake(lv_creal(P_code_value), -127); + } + + if(lv_creal(L_code_value) == -128) + { + L_code_value = lv_cmake(-127, lv_cimag(L_code_value)); + } + if(lv_cimag(L_code_value) == -128) + { + L_code_value = lv_cmake(lv_creal(L_code_value), -127); + } + + //Perform the carrier wipe-off + bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); + // Now get very early, early, prompt, late and very late values for each + *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * VE_code_value); + *E_out_ptr += (lv_32fc_t) (bb_signal_sample * E_code_value); + *P_out_ptr += (lv_32fc_t) (bb_signal_sample * P_code_value); + *L_out_ptr += (lv_32fc_t) (bb_signal_sample * L_code_value); + *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * VL_code_value); + } } - } } #endif /* LV_HAVE_SSE4_1 */ @@ -339,76 +339,76 @@ static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_generic(lv_32fc *P_out = 0; *L_out = 0; *VL_out = 0; - + lv_16sc_t VE_code_value; lv_16sc_t E_code_value; lv_16sc_t P_code_value; lv_16sc_t L_code_value; lv_16sc_t VL_code_value; lv_16sc_t bb_signal_sample; - + for(unsigned int i=0; i < num_points; ++i) - { - VE_code_value = VE_code[i]; - E_code_value = E_code[i]; - P_code_value = P_code[i]; - L_code_value = L_code[i]; - VL_code_value = VL_code[i]; - - 
if(lv_creal(VE_code_value) == -128) { - VE_code_value = lv_cmake(-127, lv_cimag(VE_code_value)); + VE_code_value = VE_code[i]; + E_code_value = E_code[i]; + P_code_value = P_code[i]; + L_code_value = L_code[i]; + VL_code_value = VL_code[i]; + + if(lv_creal(VE_code_value) == -128) + { + VE_code_value = lv_cmake(-127, lv_cimag(VE_code_value)); + } + if(lv_cimag(VE_code_value) == -128) + { + VE_code_value = lv_cmake(lv_creal(VE_code_value), -127); + } + + if(lv_creal(E_code_value) == -128) + { + E_code_value = lv_cmake(-127, lv_cimag(E_code_value)); + } + if(lv_cimag(E_code_value) == -128) + { + E_code_value = lv_cmake(lv_creal(E_code_value), -127); + } + + if(lv_creal(P_code_value) == -128) + { + P_code_value = lv_cmake(-127, lv_cimag(P_code_value)); + } + if(lv_cimag(P_code_value) == -128) + { + P_code_value = lv_cmake(lv_creal(P_code_value), -127); + } + + if(lv_creal(L_code_value) == -128) + { + L_code_value = lv_cmake(-127, lv_cimag(L_code_value)); + } + if(lv_cimag(L_code_value) == -128) + { + L_code_value = lv_cmake(lv_creal(L_code_value), -127); + } + + if(lv_creal(VL_code_value) == -128) + { + VL_code_value = lv_cmake(-127, lv_cimag(VL_code_value)); + } + if(lv_cimag(VL_code_value) == -128) + { + VL_code_value = lv_cmake(lv_creal(VL_code_value), -127); + } + + //Perform the carrier wipe-off + bb_signal_sample = input[i] * carrier[i]; + // Now get very early, early, prompt, late and very late values for each + *VE_out += (lv_32fc_t) (bb_signal_sample * VE_code_value); + *E_out += (lv_32fc_t) (bb_signal_sample * E_code_value); + *P_out += (lv_32fc_t) (bb_signal_sample * P_code_value); + *L_out += (lv_32fc_t) (bb_signal_sample * L_code_value); + *VL_out += (lv_32fc_t) (bb_signal_sample * VL_code_value); } - if(lv_cimag(VE_code_value) == -128) - { - VE_code_value = lv_cmake(lv_creal(VE_code_value), -127); - } - - if(lv_creal(E_code_value) == -128) - { - E_code_value = lv_cmake(-127, lv_cimag(E_code_value)); - } - if(lv_cimag(E_code_value) == -128) - { - E_code_value = lv_cmake(lv_creal(E_code_value), -127); - } - - if(lv_creal(P_code_value) == -128) - { - P_code_value = lv_cmake(-127, lv_cimag(P_code_value)); - } - if(lv_cimag(P_code_value) == -128) - { - P_code_value = lv_cmake(lv_creal(P_code_value), -127); - } - - if(lv_creal(L_code_value) == -128) - { - L_code_value = lv_cmake(-127, lv_cimag(L_code_value)); - } - if(lv_cimag(L_code_value) == -128) - { - L_code_value = lv_cmake(lv_creal(L_code_value), -127); - } - - if(lv_creal(VL_code_value) == -128) - { - VL_code_value = lv_cmake(-127, lv_cimag(VL_code_value)); - } - if(lv_cimag(VL_code_value) == -128) - { - VL_code_value = lv_cmake(lv_creal(VL_code_value), -127); - } - - //Perform the carrier wipe-off - bb_signal_sample = input[i] * carrier[i]; - // Now get very early, early, prompt, late and very late values for each - *VE_out += (lv_32fc_t) (bb_signal_sample * VE_code_value); - *E_out += (lv_32fc_t) (bb_signal_sample * E_code_value); - *P_out += (lv_32fc_t) (bb_signal_sample * P_code_value); - *L_out += (lv_32fc_t) (bb_signal_sample * L_code_value); - *VL_out += (lv_32fc_t) (bb_signal_sample * VL_code_value); - } } #endif /* LV_HAVE_GENERIC */ #endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_u_H */ @@ -446,22 +446,22 @@ static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_generic(lv_32fc static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_a_sse4_1(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* 
carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points) { const unsigned int sse_iters = num_points / 8; - + __m128i x, x_abs, y, y_aux, bb_signal_sample_aux, bb_signal_sample_aux_abs;; __m128i real_output, imag_output; __m128 real_VE_code_acc, imag_VE_code_acc, real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc, real_VL_code_acc, imag_VL_code_acc; __m128i input_i_1, input_i_2, output_i32; __m128 real_output_ps, imag_output_ps; __m128i minus128control; - + __m128i minus128 = _mm_set1_epi8 (-128); __m128i check_sign_sequence = _mm_set_epi8 (255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1); __m128i rearrange_sequence = _mm_set_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1); __m128i mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); - + const lv_8sc_t* input_ptr = input; const lv_8sc_t* carrier_ptr = carrier; - + const lv_8sc_t* VE_code_ptr = VE_code; lv_32fc_t* VE_out_ptr = VE_out; const lv_8sc_t* E_code_ptr = E_code; @@ -472,7 +472,7 @@ static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_a_sse4_1(lv_32f lv_32fc_t* L_out_ptr = L_out; const lv_8sc_t* VL_code_ptr = VL_code; lv_32fc_t* VL_out_ptr = VL_out; - + float VE_out_real = 0; float VE_out_imag = 0; float E_out_real = 0; @@ -483,7 +483,7 @@ static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_a_sse4_1(lv_32f float L_out_imag = 0; float VL_out_real = 0; float VL_out_imag = 0; - + real_VE_code_acc = _mm_setzero_ps(); imag_VE_code_acc = _mm_setzero_ps(); real_E_code_acc = _mm_setzero_ps(); @@ -494,177 +494,177 @@ static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_a_sse4_1(lv_32f imag_L_code_acc = _mm_setzero_ps(); real_VL_code_acc = _mm_setzero_ps(); imag_VL_code_acc = _mm_setzero_ps(); - + if (sse_iters>0) - { - for(int number = 0;number < sse_iters; number++){ - - //Perform the carrier wipe-off - x = _mm_load_si128((__m128i*)input_ptr); - y = _mm_load_si128((__m128i*)carrier_ptr); - - x_abs = _mm_abs_epi8 (x); - - CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3(y, x, check_sign_sequence, rearrange_sequence, y_aux, x_abs, real_output, imag_output) - - imag_output = _mm_slli_si128 (imag_output, 1); - bb_signal_sample_aux = _mm_blendv_epi8 (imag_output, real_output, mult1); - bb_signal_sample_aux_abs = _mm_abs_epi8 (bb_signal_sample_aux); - - //Get very early values - y = _mm_load_si128((__m128i*)VE_code_ptr); - - CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) - - real_VE_code_acc = _mm_add_ps (real_VE_code_acc, real_output_ps); - imag_VE_code_acc = _mm_add_ps (imag_VE_code_acc, imag_output_ps); - - //Get early values - y = _mm_load_si128((__m128i*)E_code_ptr); - - CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) - - real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps); - imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps); - - //Get prompt values - y = _mm_load_si128((__m128i*)P_code_ptr); - - CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, 
minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) - - real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps); - imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps); - - //Get late values - y = _mm_load_si128((__m128i*)L_code_ptr); - - CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) - - real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps); - imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps); - - //Get very late values - y = _mm_load_si128((__m128i*)VL_code_ptr); - - CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) - - real_VL_code_acc = _mm_add_ps (real_VL_code_acc, real_output_ps); - imag_VL_code_acc = _mm_add_ps (imag_VL_code_acc, imag_output_ps); - - input_ptr += 8; - carrier_ptr += 8; - VE_code_ptr += 8; - E_code_ptr += 8; - P_code_ptr += 8; - L_code_ptr += 8; - VL_code_ptr += 8; - } - - __VOLK_ATTR_ALIGNED(16) float real_VE_dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) float imag_VE_dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) float real_VL_dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) float imag_VL_dotProductVector[4]; - - _mm_store_ps((float*)real_VE_dotProductVector,real_VE_code_acc); // Store the results back into the dot product vector - _mm_store_ps((float*)imag_VE_dotProductVector,imag_VE_code_acc); // Store the results back into the dot product vector - _mm_store_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector - _mm_store_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector - _mm_store_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector - _mm_store_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector - _mm_store_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector - _mm_store_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector - _mm_store_ps((float*)real_VL_dotProductVector,real_VL_code_acc); // Store the results back into the dot product vector - _mm_store_ps((float*)imag_VL_dotProductVector,imag_VL_code_acc); // Store the results back into the dot product vector - - for (int i = 0; i<4; ++i) { - VE_out_real += real_VE_dotProductVector[i]; - VE_out_imag += imag_VE_dotProductVector[i]; - E_out_real += real_E_dotProductVector[i]; - E_out_imag += imag_E_dotProductVector[i]; - P_out_real += real_P_dotProductVector[i]; - P_out_imag += imag_P_dotProductVector[i]; - L_out_real += real_L_dotProductVector[i]; - L_out_imag 
+= imag_L_dotProductVector[i]; - VL_out_real += real_VL_dotProductVector[i]; - VL_out_imag += imag_VL_dotProductVector[i]; + for(int number = 0;number < sse_iters; number++) + { + //Perform the carrier wipe-off + x = _mm_load_si128((__m128i*)input_ptr); + y = _mm_load_si128((__m128i*)carrier_ptr); + + x_abs = _mm_abs_epi8 (x); + + CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3(y, x, check_sign_sequence, rearrange_sequence, y_aux, x_abs, real_output, imag_output) + + imag_output = _mm_slli_si128 (imag_output, 1); + bb_signal_sample_aux = _mm_blendv_epi8 (imag_output, real_output, mult1); + bb_signal_sample_aux_abs = _mm_abs_epi8 (bb_signal_sample_aux); + + //Get very early values + y = _mm_load_si128((__m128i*)VE_code_ptr); + + CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) + + real_VE_code_acc = _mm_add_ps (real_VE_code_acc, real_output_ps); + imag_VE_code_acc = _mm_add_ps (imag_VE_code_acc, imag_output_ps); + + //Get early values + y = _mm_load_si128((__m128i*)E_code_ptr); + + CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) + + real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps); + imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps); + + //Get prompt values + y = _mm_load_si128((__m128i*)P_code_ptr); + + CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) + + real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps); + imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps); + + //Get late values + y = _mm_load_si128((__m128i*)L_code_ptr); + + CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) + + real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps); + imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps); + + //Get very late values + y = _mm_load_si128((__m128i*)VL_code_ptr); + + CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) + + real_VL_code_acc = _mm_add_ps (real_VL_code_acc, real_output_ps); + imag_VL_code_acc = _mm_add_ps (imag_VL_code_acc, imag_output_ps); + + input_ptr += 8; + carrier_ptr += 8; + VE_code_ptr += 8; + E_code_ptr += 8; + P_code_ptr += 8; + L_code_ptr += 8; + VL_code_ptr += 8; + } + + __VOLK_ATTR_ALIGNED(16) float real_VE_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_VE_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4]; + 
__VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_VL_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_VL_dotProductVector[4]; + + _mm_store_ps((float*)real_VE_dotProductVector,real_VE_code_acc); // Store the results back into the dot product vector + _mm_store_ps((float*)imag_VE_dotProductVector,imag_VE_code_acc); // Store the results back into the dot product vector + _mm_store_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector + _mm_store_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector + _mm_store_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector + _mm_store_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector + _mm_store_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector + _mm_store_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector + _mm_store_ps((float*)real_VL_dotProductVector,real_VL_code_acc); // Store the results back into the dot product vector + _mm_store_ps((float*)imag_VL_dotProductVector,imag_VL_code_acc); // Store the results back into the dot product vector + + for (int i = 0; i<4; ++i) + { + VE_out_real += real_VE_dotProductVector[i]; + VE_out_imag += imag_VE_dotProductVector[i]; + E_out_real += real_E_dotProductVector[i]; + E_out_imag += imag_E_dotProductVector[i]; + P_out_real += real_P_dotProductVector[i]; + P_out_imag += imag_P_dotProductVector[i]; + L_out_real += real_L_dotProductVector[i]; + L_out_imag += imag_L_dotProductVector[i]; + VL_out_real += real_VL_dotProductVector[i]; + VL_out_imag += imag_VL_dotProductVector[i]; + } + *VE_out_ptr = lv_cmake(VE_out_real, VE_out_imag); + *E_out_ptr = lv_cmake(E_out_real, E_out_imag); + *P_out_ptr = lv_cmake(P_out_real, P_out_imag); + *L_out_ptr = lv_cmake(L_out_real, L_out_imag); + *VL_out_ptr = lv_cmake(VL_out_real, VL_out_imag); } - *VE_out_ptr = lv_cmake(VE_out_real, VE_out_imag); - *E_out_ptr = lv_cmake(E_out_real, E_out_imag); - *P_out_ptr = lv_cmake(P_out_real, P_out_imag); - *L_out_ptr = lv_cmake(L_out_real, L_out_imag); - *VL_out_ptr = lv_cmake(VL_out_real, VL_out_imag); - } - + if(num_points%8!=0) - { - lv_16sc_t bb_signal_sample; - lv_16sc_t VE_code_value; - lv_16sc_t E_code_value; - lv_16sc_t P_code_value; - lv_16sc_t L_code_value; - lv_16sc_t VL_code_value; - - for(int i=0; i < num_points%8; ++i) { - VE_code_value = *VE_code_ptr++; - E_code_value = *E_code_ptr++; - P_code_value = *P_code_ptr++; - L_code_value = *L_code_ptr++; - VL_code_value = *VL_code_ptr++; - - if(lv_creal(VE_code_value) == -128) - { - VE_code_value = lv_cmake(-127, lv_cimag(VE_code_value)); - } - if(lv_cimag(VE_code_value) == -128) - { - VE_code_value = lv_cmake(lv_creal(VE_code_value), -127); - } - - if(lv_creal(E_code_value) == -128) - { - E_code_value = lv_cmake(-127, lv_cimag(E_code_value)); - } - if(lv_cimag(E_code_value) == -128) - { - E_code_value = lv_cmake(lv_creal(E_code_value), -127); - } - - if(lv_creal(P_code_value) == -128) - { - P_code_value = lv_cmake(-127, lv_cimag(P_code_value)); - } - if(lv_cimag(P_code_value) == -128) - { - P_code_value = lv_cmake(lv_creal(P_code_value), -127); - } - - if(lv_creal(L_code_value) == -128) - { - L_code_value = lv_cmake(-127, lv_cimag(L_code_value)); - } - if(lv_cimag(L_code_value) == -128) - { - L_code_value 
= lv_cmake(lv_creal(L_code_value), -127); - } - - //Perform the carrier wipe-off - bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); - // Now get very early, early, prompt, late and very late values for each - *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * VE_code_value); - *E_out_ptr += (lv_32fc_t) (bb_signal_sample * E_code_value); - *P_out_ptr += (lv_32fc_t) (bb_signal_sample * P_code_value); - *L_out_ptr += (lv_32fc_t) (bb_signal_sample * L_code_value); - *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * VL_code_value); + lv_16sc_t bb_signal_sample; + lv_16sc_t VE_code_value; + lv_16sc_t E_code_value; + lv_16sc_t P_code_value; + lv_16sc_t L_code_value; + lv_16sc_t VL_code_value; + + for(int i=0; i < num_points%8; ++i) + { + VE_code_value = *VE_code_ptr++; + E_code_value = *E_code_ptr++; + P_code_value = *P_code_ptr++; + L_code_value = *L_code_ptr++; + VL_code_value = *VL_code_ptr++; + + if(lv_creal(VE_code_value) == -128) + { + VE_code_value = lv_cmake(-127, lv_cimag(VE_code_value)); + } + if(lv_cimag(VE_code_value) == -128) + { + VE_code_value = lv_cmake(lv_creal(VE_code_value), -127); + } + + if(lv_creal(E_code_value) == -128) + { + E_code_value = lv_cmake(-127, lv_cimag(E_code_value)); + } + if(lv_cimag(E_code_value) == -128) + { + E_code_value = lv_cmake(lv_creal(E_code_value), -127); + } + + if(lv_creal(P_code_value) == -128) + { + P_code_value = lv_cmake(-127, lv_cimag(P_code_value)); + } + if(lv_cimag(P_code_value) == -128) + { + P_code_value = lv_cmake(lv_creal(P_code_value), -127); + } + + if(lv_creal(L_code_value) == -128) + { + L_code_value = lv_cmake(-127, lv_cimag(L_code_value)); + } + if(lv_cimag(L_code_value) == -128) + { + L_code_value = lv_cmake(lv_creal(L_code_value), -127); + } + + //Perform the carrier wipe-off + bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); + // Now get very early, early, prompt, late and very late values for each + *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * VE_code_value); + *E_out_ptr += (lv_32fc_t) (bb_signal_sample * E_code_value); + *P_out_ptr += (lv_32fc_t) (bb_signal_sample * P_code_value); + *L_out_ptr += (lv_32fc_t) (bb_signal_sample * L_code_value); + *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * VL_code_value); + } } - } } #endif /* LV_HAVE_SSE4_1 */ @@ -693,76 +693,76 @@ static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_a_generic(lv_32 *P_out = 0; *L_out = 0; *VL_out = 0; - + lv_16sc_t VE_code_value; lv_16sc_t E_code_value; lv_16sc_t P_code_value; lv_16sc_t L_code_value; lv_16sc_t VL_code_value; lv_16sc_t bb_signal_sample; - + for(unsigned int i=0; i < num_points; ++i) - { - VE_code_value = VE_code[i]; - E_code_value = E_code[i]; - P_code_value = P_code[i]; - L_code_value = L_code[i]; - VL_code_value = VL_code[i]; - - if(lv_creal(VE_code_value) == -128) { - VE_code_value = lv_cmake(-127, lv_cimag(VE_code_value)); + VE_code_value = VE_code[i]; + E_code_value = E_code[i]; + P_code_value = P_code[i]; + L_code_value = L_code[i]; + VL_code_value = VL_code[i]; + + if(lv_creal(VE_code_value) == -128) + { + VE_code_value = lv_cmake(-127, lv_cimag(VE_code_value)); + } + if(lv_cimag(VE_code_value) == -128) + { + VE_code_value = lv_cmake(lv_creal(VE_code_value), -127); + } + + if(lv_creal(E_code_value) == -128) + { + E_code_value = lv_cmake(-127, lv_cimag(E_code_value)); + } + if(lv_cimag(E_code_value) == -128) + { + E_code_value = lv_cmake(lv_creal(E_code_value), -127); + } + + if(lv_creal(P_code_value) == -128) + { + P_code_value = lv_cmake(-127, lv_cimag(P_code_value)); + } + if(lv_cimag(P_code_value) == -128) + { + 
P_code_value = lv_cmake(lv_creal(P_code_value), -127); + } + + if(lv_creal(L_code_value) == -128) + { + L_code_value = lv_cmake(-127, lv_cimag(L_code_value)); + } + if(lv_cimag(L_code_value) == -128) + { + L_code_value = lv_cmake(lv_creal(L_code_value), -127); + } + + if(lv_creal(VL_code_value) == -128) + { + VL_code_value = lv_cmake(-127, lv_cimag(VL_code_value)); + } + if(lv_cimag(VL_code_value) == -128) + { + VL_code_value = lv_cmake(lv_creal(VL_code_value), -127); + } + + //Perform the carrier wipe-off + bb_signal_sample = input[i] * carrier[i]; + // Now get very early, early, prompt, late and very late values for each + *VE_out += (lv_32fc_t) (bb_signal_sample * VE_code_value); + *E_out += (lv_32fc_t) (bb_signal_sample * E_code_value); + *P_out += (lv_32fc_t) (bb_signal_sample * P_code_value); + *L_out += (lv_32fc_t) (bb_signal_sample * L_code_value); + *VL_out += (lv_32fc_t) (bb_signal_sample * VL_code_value); } - if(lv_cimag(VE_code_value) == -128) - { - VE_code_value = lv_cmake(lv_creal(VE_code_value), -127); - } - - if(lv_creal(E_code_value) == -128) - { - E_code_value = lv_cmake(-127, lv_cimag(E_code_value)); - } - if(lv_cimag(E_code_value) == -128) - { - E_code_value = lv_cmake(lv_creal(E_code_value), -127); - } - - if(lv_creal(P_code_value) == -128) - { - P_code_value = lv_cmake(-127, lv_cimag(P_code_value)); - } - if(lv_cimag(P_code_value) == -128) - { - P_code_value = lv_cmake(lv_creal(P_code_value), -127); - } - - if(lv_creal(L_code_value) == -128) - { - L_code_value = lv_cmake(-127, lv_cimag(L_code_value)); - } - if(lv_cimag(L_code_value) == -128) - { - L_code_value = lv_cmake(lv_creal(L_code_value), -127); - } - - if(lv_creal(VL_code_value) == -128) - { - VL_code_value = lv_cmake(-127, lv_cimag(VL_code_value)); - } - if(lv_cimag(VL_code_value) == -128) - { - VL_code_value = lv_cmake(lv_creal(VL_code_value), -127); - } - - //Perform the carrier wipe-off - bb_signal_sample = input[i] * carrier[i]; - // Now get very early, early, prompt, late and very late values for each - *VE_out += (lv_32fc_t) (bb_signal_sample * VE_code_value); - *E_out += (lv_32fc_t) (bb_signal_sample * E_code_value); - *P_out += (lv_32fc_t) (bb_signal_sample * P_code_value); - *L_out += (lv_32fc_t) (bb_signal_sample * L_code_value); - *VL_out += (lv_32fc_t) (bb_signal_sample * VL_code_value); - } } #endif /* LV_HAVE_GENERIC */ #endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_a_H */ diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5.h index 33bd81f89..0f755c078 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5.h @@ -2,7 +2,7 @@ * \file volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5.h * \brief Volk protokernel: performs the carrier wipe-off mixing and the Very early, Early, Prompt, Late and very late correlation with 16 bits vectors, and accumulates the results into float32. This protokernel is called "unsafe" because it does NOT check when the inputs have a -128 value. If you introduce a -128 value the protokernel will NOT operate properly (generic implementation will have different results than volk implementation). 
In order to avoid overflow, "input" and "carrier" must be values between -7 and 7 and "XX_code" inputs must be values between -127 and 127. * \authors
- *   • Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+ *   • Andres Cecilia, 2014. a.cecilia.luque(at)gmail.com
 *
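 *
 * Illustrative note, not part of the original header: without the -128
 * guard of the "safe" variant, the int8 sign/abs tricks wrap around,
 * because negating -128 in two's complement yields -128 again. On common
 * targets:
 * \code
 *   signed char v = -128;
 *   signed char a = (v < 0) ? -v : v;   // a is still -128, not +128
 * \endcode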
* * Volk protokernel that performs the carrier wipe-off mixing and the @@ -47,7 +47,7 @@ * GNSS-SDR is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or - * at your option) any later version. + * (at your option) any later version. * * GNSS-SDR is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of @@ -92,7 +92,7 @@ static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_u_sse4_1(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points) { const unsigned int sse_iters = num_points / 8; - + __m128i x, x_abs, y, y_aux, bb_signal_sample_aux, bb_signal_sample_aux_abs;; __m128i real_output, imag_output; __m128 real_VE_code_acc, imag_VE_code_acc, real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc, real_VL_code_acc, imag_VL_code_acc; @@ -102,10 +102,10 @@ static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_u_sse4_1(lv_3 __m128i check_sign_sequence = _mm_set_epi8 (255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1); __m128i rearrange_sequence = _mm_set_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1); __m128i mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); - + const lv_8sc_t* input_ptr = input; const lv_8sc_t* carrier_ptr = carrier; - + const lv_8sc_t* VE_code_ptr = VE_code; lv_32fc_t* VE_out_ptr = VE_out; const lv_8sc_t* E_code_ptr = E_code; @@ -116,7 +116,7 @@ static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_u_sse4_1(lv_3 lv_32fc_t* L_out_ptr = L_out; const lv_8sc_t* VL_code_ptr = VL_code; lv_32fc_t* VL_out_ptr = VL_out; - + float VE_out_real = 0; float VE_out_imag = 0; float E_out_real = 0; @@ -127,7 +127,7 @@ static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_u_sse4_1(lv_3 float L_out_imag = 0; float VL_out_real = 0; float VL_out_imag = 0; - + real_VE_code_acc = _mm_setzero_ps(); imag_VE_code_acc = _mm_setzero_ps(); real_E_code_acc = _mm_setzero_ps(); @@ -138,126 +138,126 @@ static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_u_sse4_1(lv_3 imag_L_code_acc = _mm_setzero_ps(); real_VL_code_acc = _mm_setzero_ps(); imag_VL_code_acc = _mm_setzero_ps(); - - if (sse_iters>0) - { - for(int number = 0;number < sse_iters; number++){ - - //Perform the carrier wipe-off - x = _mm_lddqu_si128((__m128i*)input_ptr); - y = _mm_lddqu_si128((__m128i*)carrier_ptr); - - x_abs = _mm_abs_epi8 (x); - - CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3(y, x, check_sign_sequence, rearrange_sequence, y_aux, x_abs, real_output, imag_output) - - imag_output = _mm_slli_si128 (imag_output, 1); - bb_signal_sample_aux = _mm_blendv_epi8 (imag_output, real_output, mult1); - bb_signal_sample_aux_abs = _mm_abs_epi8 (bb_signal_sample_aux); - - //Get very early values - y = _mm_lddqu_si128((__m128i*)VE_code_ptr); - - CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) - - real_VE_code_acc = _mm_add_ps (real_VE_code_acc, real_output_ps); - imag_VE_code_acc = 
_mm_add_ps (imag_VE_code_acc, imag_output_ps); - - //Get early values - y = _mm_lddqu_si128((__m128i*)E_code_ptr); - - CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) - - real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps); - imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps); - - //Get prompt values - y = _mm_lddqu_si128((__m128i*)P_code_ptr); - - CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) - - real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps); - imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps); - - //Get late values - y = _mm_lddqu_si128((__m128i*)L_code_ptr); - - CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) - - real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps); - imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps); - - //Get very late values - y = _mm_lddqu_si128((__m128i*)VL_code_ptr); - - CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) - real_VL_code_acc = _mm_add_ps (real_VL_code_acc, real_output_ps); - imag_VL_code_acc = _mm_add_ps (imag_VL_code_acc, imag_output_ps); - - input_ptr += 8; - carrier_ptr += 8; - VE_code_ptr += 8; - E_code_ptr += 8; - P_code_ptr += 8; - L_code_ptr += 8; - VL_code_ptr += 8; - } - - __VOLK_ATTR_ALIGNED(16) float real_VE_dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) float imag_VE_dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) float real_VL_dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) float imag_VL_dotProductVector[4]; - - _mm_storeu_ps((float*)real_VE_dotProductVector,real_VE_code_acc); // Store the results back into the dot product vector - _mm_storeu_ps((float*)imag_VE_dotProductVector,imag_VE_code_acc); // Store the results back into the dot product vector - _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector - _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector - _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector - _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector - _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector - _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector - 
_mm_storeu_ps((float*)real_VL_dotProductVector,real_VL_code_acc); // Store the results back into the dot product vector - _mm_storeu_ps((float*)imag_VL_dotProductVector,imag_VL_code_acc); // Store the results back into the dot product vector - - for (int i = 0; i<4; ++i) + if (sse_iters>0) { - VE_out_real += real_VE_dotProductVector[i]; - VE_out_imag += imag_VE_dotProductVector[i]; - E_out_real += real_E_dotProductVector[i]; - E_out_imag += imag_E_dotProductVector[i]; - P_out_real += real_P_dotProductVector[i]; - P_out_imag += imag_P_dotProductVector[i]; - L_out_real += real_L_dotProductVector[i]; - L_out_imag += imag_L_dotProductVector[i]; - VL_out_real += real_VL_dotProductVector[i]; - VL_out_imag += imag_VL_dotProductVector[i]; + for(int number = 0;number < sse_iters; number++) + { + //Perform the carrier wipe-off + x = _mm_lddqu_si128((__m128i*)input_ptr); + y = _mm_lddqu_si128((__m128i*)carrier_ptr); + + x_abs = _mm_abs_epi8 (x); + + CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3(y, x, check_sign_sequence, rearrange_sequence, y_aux, x_abs, real_output, imag_output) + + imag_output = _mm_slli_si128 (imag_output, 1); + bb_signal_sample_aux = _mm_blendv_epi8 (imag_output, real_output, mult1); + bb_signal_sample_aux_abs = _mm_abs_epi8 (bb_signal_sample_aux); + + //Get very early values + y = _mm_lddqu_si128((__m128i*)VE_code_ptr); + + CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) + + real_VE_code_acc = _mm_add_ps (real_VE_code_acc, real_output_ps); + imag_VE_code_acc = _mm_add_ps (imag_VE_code_acc, imag_output_ps); + + //Get early values + y = _mm_lddqu_si128((__m128i*)E_code_ptr); + + CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) + + real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps); + imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps); + + //Get prompt values + y = _mm_lddqu_si128((__m128i*)P_code_ptr); + + CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) + + real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps); + imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps); + + //Get late values + y = _mm_lddqu_si128((__m128i*)L_code_ptr); + + CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) + + real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps); + imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps); + + //Get very late values + y = _mm_lddqu_si128((__m128i*)VL_code_ptr); + + CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) + + real_VL_code_acc = _mm_add_ps (real_VL_code_acc, real_output_ps); + imag_VL_code_acc = _mm_add_ps (imag_VL_code_acc, imag_output_ps); + + input_ptr += 8; + carrier_ptr += 8; + VE_code_ptr += 8; + 
E_code_ptr += 8; + P_code_ptr += 8; + L_code_ptr += 8; + VL_code_ptr += 8; + } + + __VOLK_ATTR_ALIGNED(16) float real_VE_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_VE_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_VL_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_VL_dotProductVector[4]; + + _mm_storeu_ps((float*)real_VE_dotProductVector,real_VE_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_VE_dotProductVector,imag_VE_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)real_VL_dotProductVector,real_VL_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_VL_dotProductVector,imag_VL_code_acc); // Store the results back into the dot product vector + + for (int i = 0; i<4; ++i) + { + VE_out_real += real_VE_dotProductVector[i]; + VE_out_imag += imag_VE_dotProductVector[i]; + E_out_real += real_E_dotProductVector[i]; + E_out_imag += imag_E_dotProductVector[i]; + P_out_real += real_P_dotProductVector[i]; + P_out_imag += imag_P_dotProductVector[i]; + L_out_real += real_L_dotProductVector[i]; + L_out_imag += imag_L_dotProductVector[i]; + VL_out_real += real_VL_dotProductVector[i]; + VL_out_imag += imag_VL_dotProductVector[i]; + } + *VE_out_ptr = lv_cmake(VE_out_real, VE_out_imag); + *E_out_ptr = lv_cmake(E_out_real, E_out_imag); + *P_out_ptr = lv_cmake(P_out_real, P_out_imag); + *L_out_ptr = lv_cmake(L_out_real, L_out_imag); + *VL_out_ptr = lv_cmake(VL_out_real, VL_out_imag); } - *VE_out_ptr = lv_cmake(VE_out_real, VE_out_imag); - *E_out_ptr = lv_cmake(E_out_real, E_out_imag); - *P_out_ptr = lv_cmake(P_out_real, P_out_imag); - *L_out_ptr = lv_cmake(L_out_real, L_out_imag); - *VL_out_ptr = lv_cmake(VL_out_real, VL_out_imag); - } - + lv_16sc_t bb_signal_sample; for(int i=0; i < num_points%8; ++i) - { - //Perform the carrier wipe-off - bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); - // Now get very early, early, prompt, late and very late values for each - *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VE_code_ptr++)); - *E_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*E_code_ptr++)); - *P_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*P_code_ptr++)); - *L_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*L_code_ptr++)); - *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VL_code_ptr++)); - } + { + //Perform the carrier wipe-off 
+ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); + // Now get very early, early, prompt, late and very late values for each + *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VE_code_ptr++)); + *E_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*E_code_ptr++)); + *P_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*P_code_ptr++)); + *L_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*L_code_ptr++)); + *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VL_code_ptr++)); + } } #endif /* LV_HAVE_SSE4_1 */ @@ -288,18 +288,18 @@ static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_generic(lv_32 *VL_out = 0; lv_16sc_t bb_signal_sample; - + for(unsigned int i=0; i < num_points; ++i) - { - //Perform the carrier wipe-off - bb_signal_sample = input[i] * carrier[i]; - // Now get very early, early, prompt, late and very late values for each - *VE_out += (lv_32fc_t) (bb_signal_sample * VE_code[i]); - *E_out += (lv_32fc_t) (bb_signal_sample * E_code[i]); - *P_out += (lv_32fc_t) (bb_signal_sample * P_code[i]); - *L_out += (lv_32fc_t) (bb_signal_sample * L_code[i]); - *VL_out += (lv_32fc_t) (bb_signal_sample * VL_code[i]); - } + { + //Perform the carrier wipe-off + bb_signal_sample = input[i] * carrier[i]; + // Now get very early, early, prompt, late and very late values for each + *VE_out += (lv_32fc_t) (bb_signal_sample * VE_code[i]); + *E_out += (lv_32fc_t) (bb_signal_sample * E_code[i]); + *P_out += (lv_32fc_t) (bb_signal_sample * P_code[i]); + *L_out += (lv_32fc_t) (bb_signal_sample * L_code[i]); + *VL_out += (lv_32fc_t) (bb_signal_sample * VL_code[i]); + } } #endif /* LV_HAVE_GENERIC */ #endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_u_H */ @@ -337,20 +337,20 @@ static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_generic(lv_32 static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_a_sse4_1(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points) { const unsigned int sse_iters = num_points / 8; - + __m128i x, x_abs, y, y_aux, bb_signal_sample_aux, bb_signal_sample_aux_abs;; __m128i real_output, imag_output; __m128 real_VE_code_acc, imag_VE_code_acc, real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc, real_VL_code_acc, imag_VL_code_acc; __m128i input_i_1, input_i_2, output_i32; __m128 real_output_ps, imag_output_ps; - + __m128i check_sign_sequence = _mm_set_epi8 (255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1); __m128i rearrange_sequence = _mm_set_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1); __m128i mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); - + const lv_8sc_t* input_ptr = input; const lv_8sc_t* carrier_ptr = carrier; - + const lv_8sc_t* VE_code_ptr = VE_code; lv_32fc_t* VE_out_ptr = VE_out; const lv_8sc_t* E_code_ptr = E_code; @@ -361,7 +361,7 @@ static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_a_sse4_1(lv_3 lv_32fc_t* L_out_ptr = L_out; const lv_8sc_t* VL_code_ptr = VL_code; lv_32fc_t* VL_out_ptr = VL_out; - + float VE_out_real = 0; float VE_out_imag = 0; float E_out_real = 0; @@ -372,7 +372,7 @@ static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_a_sse4_1(lv_3 float L_out_imag = 0; float VL_out_real = 0; 
float VL_out_imag = 0; - + real_VE_code_acc = _mm_setzero_ps(); imag_VE_code_acc = _mm_setzero_ps(); real_E_code_acc = _mm_setzero_ps(); @@ -383,126 +383,126 @@ static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_a_sse4_1(lv_3 imag_L_code_acc = _mm_setzero_ps(); real_VL_code_acc = _mm_setzero_ps(); imag_VL_code_acc = _mm_setzero_ps(); - + if (sse_iters>0) - { - for(int number = 0;number < sse_iters; number++){ - - //Perform the carrier wipe-off - x = _mm_load_si128((__m128i*)input_ptr); - y = _mm_load_si128((__m128i*)carrier_ptr); - - x_abs = _mm_abs_epi8 (x); - - CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3(y, x, check_sign_sequence, rearrange_sequence, y_aux, x_abs, real_output, imag_output) - - imag_output = _mm_slli_si128 (imag_output, 1); - bb_signal_sample_aux = _mm_blendv_epi8 (imag_output, real_output, mult1); - bb_signal_sample_aux_abs = _mm_abs_epi8 (bb_signal_sample_aux); - - //Get very early values - y = _mm_load_si128((__m128i*)VE_code_ptr); - - CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) - - real_VE_code_acc = _mm_add_ps (real_VE_code_acc, real_output_ps); - imag_VE_code_acc = _mm_add_ps (imag_VE_code_acc, imag_output_ps); - - //Get early values - y = _mm_load_si128((__m128i*)E_code_ptr); - - CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) - - real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps); - imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps); - - //Get prompt values - y = _mm_load_si128((__m128i*)P_code_ptr); - - CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) - - real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps); - imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps); - - //Get late values - y = _mm_load_si128((__m128i*)L_code_ptr); - - CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) - - real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps); - imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps); - - //Get very late values - y = _mm_load_si128((__m128i*)VL_code_ptr); - - CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) - - real_VL_code_acc = _mm_add_ps (real_VL_code_acc, real_output_ps); - imag_VL_code_acc = _mm_add_ps (imag_VL_code_acc, imag_output_ps); - - input_ptr += 8; - carrier_ptr += 8; - VE_code_ptr += 8; - E_code_ptr += 8; - P_code_ptr += 8; - L_code_ptr += 8; - VL_code_ptr += 8; - } - - __VOLK_ATTR_ALIGNED(16) float real_VE_dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) float imag_VE_dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) float 
real_P_dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) float real_VL_dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) float imag_VL_dotProductVector[4]; - - _mm_store_ps((float*)real_VE_dotProductVector,real_VE_code_acc); // Store the results back into the dot product vector - _mm_store_ps((float*)imag_VE_dotProductVector,imag_VE_code_acc); // Store the results back into the dot product vector - _mm_store_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector - _mm_store_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector - _mm_store_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector - _mm_store_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector - _mm_store_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector - _mm_store_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector - _mm_store_ps((float*)real_VL_dotProductVector,real_VL_code_acc); // Store the results back into the dot product vector - _mm_store_ps((float*)imag_VL_dotProductVector,imag_VL_code_acc); // Store the results back into the dot product vector - - for (int i = 0; i<4; ++i) { - VE_out_real += real_VE_dotProductVector[i]; - VE_out_imag += imag_VE_dotProductVector[i]; - E_out_real += real_E_dotProductVector[i]; - E_out_imag += imag_E_dotProductVector[i]; - P_out_real += real_P_dotProductVector[i]; - P_out_imag += imag_P_dotProductVector[i]; - L_out_real += real_L_dotProductVector[i]; - L_out_imag += imag_L_dotProductVector[i]; - VL_out_real += real_VL_dotProductVector[i]; - VL_out_imag += imag_VL_dotProductVector[i]; + for(int number = 0;number < sse_iters; number++) + { + //Perform the carrier wipe-off + x = _mm_load_si128((__m128i*)input_ptr); + y = _mm_load_si128((__m128i*)carrier_ptr); + + x_abs = _mm_abs_epi8 (x); + + CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3(y, x, check_sign_sequence, rearrange_sequence, y_aux, x_abs, real_output, imag_output) + + imag_output = _mm_slli_si128 (imag_output, 1); + bb_signal_sample_aux = _mm_blendv_epi8 (imag_output, real_output, mult1); + bb_signal_sample_aux_abs = _mm_abs_epi8 (bb_signal_sample_aux); + + //Get very early values + y = _mm_load_si128((__m128i*)VE_code_ptr); + + CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) + + real_VE_code_acc = _mm_add_ps (real_VE_code_acc, real_output_ps); + imag_VE_code_acc = _mm_add_ps (imag_VE_code_acc, imag_output_ps); + + //Get early values + y = _mm_load_si128((__m128i*)E_code_ptr); + + CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) + + real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps); + imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps); + + //Get prompt values + y = _mm_load_si128((__m128i*)P_code_ptr); + + CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, 
bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) + + real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps); + imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps); + + //Get late values + y = _mm_load_si128((__m128i*)L_code_ptr); + + CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) + + real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps); + imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps); + + //Get very late values + y = _mm_load_si128((__m128i*)VL_code_ptr); + + CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) + + real_VL_code_acc = _mm_add_ps (real_VL_code_acc, real_output_ps); + imag_VL_code_acc = _mm_add_ps (imag_VL_code_acc, imag_output_ps); + + input_ptr += 8; + carrier_ptr += 8; + VE_code_ptr += 8; + E_code_ptr += 8; + P_code_ptr += 8; + L_code_ptr += 8; + VL_code_ptr += 8; + } + + __VOLK_ATTR_ALIGNED(16) float real_VE_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_VE_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_VL_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_VL_dotProductVector[4]; + + _mm_store_ps((float*)real_VE_dotProductVector,real_VE_code_acc); // Store the results back into the dot product vector + _mm_store_ps((float*)imag_VE_dotProductVector,imag_VE_code_acc); // Store the results back into the dot product vector + _mm_store_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector + _mm_store_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector + _mm_store_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector + _mm_store_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector + _mm_store_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector + _mm_store_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector + _mm_store_ps((float*)real_VL_dotProductVector,real_VL_code_acc); // Store the results back into the dot product vector + _mm_store_ps((float*)imag_VL_dotProductVector,imag_VL_code_acc); // Store the results back into the dot product vector + + for (int i = 0; i<4; ++i) + { + VE_out_real += real_VE_dotProductVector[i]; + VE_out_imag += imag_VE_dotProductVector[i]; + E_out_real += real_E_dotProductVector[i]; + E_out_imag += imag_E_dotProductVector[i]; + P_out_real += real_P_dotProductVector[i]; + P_out_imag += imag_P_dotProductVector[i]; + L_out_real += real_L_dotProductVector[i]; + L_out_imag += imag_L_dotProductVector[i]; + 
VL_out_real += real_VL_dotProductVector[i]; + VL_out_imag += imag_VL_dotProductVector[i]; + } + *VE_out_ptr = lv_cmake(VE_out_real, VE_out_imag); + *E_out_ptr = lv_cmake(E_out_real, E_out_imag); + *P_out_ptr = lv_cmake(P_out_real, P_out_imag); + *L_out_ptr = lv_cmake(L_out_real, L_out_imag); + *VL_out_ptr = lv_cmake(VL_out_real, VL_out_imag); } - *VE_out_ptr = lv_cmake(VE_out_real, VE_out_imag); - *E_out_ptr = lv_cmake(E_out_real, E_out_imag); - *P_out_ptr = lv_cmake(P_out_real, P_out_imag); - *L_out_ptr = lv_cmake(L_out_real, L_out_imag); - *VL_out_ptr = lv_cmake(VL_out_real, VL_out_imag); - } - + lv_16sc_t bb_signal_sample; for(int i=0; i < num_points%8; ++i) - { - //Perform the carrier wipe-off - bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); - // Now get very early, early, prompt, late and very late values for each - *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VE_code_ptr++)); - *E_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*E_code_ptr++)); - *P_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*P_code_ptr++)); - *L_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*L_code_ptr++)); - *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VL_code_ptr++)); - } + { + //Perform the carrier wipe-off + bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); + // Now get very early, early, prompt, late and very late values for each + *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VE_code_ptr++)); + *E_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*E_code_ptr++)); + *P_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*P_code_ptr++)); + *L_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*L_code_ptr++)); + *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VL_code_ptr++)); + } } #endif /* LV_HAVE_SSE4_1 */ @@ -531,20 +531,20 @@ static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_a_generic(lv_ *P_out = 0; *L_out = 0; *VL_out = 0; - + lv_16sc_t bb_signal_sample; - + for(unsigned int i=0; i < num_points; ++i) - { - //Perform the carrier wipe-off - bb_signal_sample = input[i] * carrier[i]; - // Now get very early, early, prompt, late and very late values for each - *VE_out += (lv_32fc_t) (bb_signal_sample * VE_code[i]); - *E_out += (lv_32fc_t) (bb_signal_sample * E_code[i]); - *P_out += (lv_32fc_t) (bb_signal_sample * P_code[i]); - *L_out += (lv_32fc_t) (bb_signal_sample * L_code[i]); - *VL_out += (lv_32fc_t) (bb_signal_sample * VL_code[i]); - } + { + //Perform the carrier wipe-off + bb_signal_sample = input[i] * carrier[i]; + // Now get very early, early, prompt, late and very late values for each + *VE_out += (lv_32fc_t) (bb_signal_sample * VE_code[i]); + *E_out += (lv_32fc_t) (bb_signal_sample * E_code[i]); + *P_out += (lv_32fc_t) (bb_signal_sample * P_code[i]); + *L_out += (lv_32fc_t) (bb_signal_sample * L_code[i]); + *VL_out += (lv_32fc_t) (bb_signal_sample * VL_code[i]); + } } #endif /* LV_HAVE_GENERIC */ #endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_a_H */ diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8u_x2_multiply_8u.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8u_x2_multiply_8u.h index 196a81b75..9947a7aca 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8u_x2_multiply_8u.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8u_x2_multiply_8u.h @@ -2,7 +2,7 @@ * \file 
volk_gnsssdr_8u_x2_multiply_8u.h * \brief Volk protokernel: multiplies unsigned char values * \authors
- * • Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+ * • Andres Cecilia, 2014. a.cecilia.luque(at)gmail.com
 *
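 * A minimal usage sketch (illustrative only; N and the three buffers are assumed by this
 * note, they are not part of the header):
 *
 *     unsigned char a[N], b[N], c[N];
 *     volk_gnsssdr_8u_x2_multiply_8u_u_sse3(c, a, b, N); // c[i] = a[i] * b[i], low 8 bits kept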
* * Volk protokernel that multiplies unsigned char values (8 bits data) @@ -19,7 +19,7 @@ * GNSS-SDR is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or - * at your option) any later version. + * (at your option) any later version. * * GNSS-SDR is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of @@ -41,53 +41,54 @@ #ifdef LV_HAVE_SSE3 #include /*! - \brief Multiplies the two input unsigned char values and stores their results in the third unisgned char + \brief Multiplies the two input unsigned char values and stores their results in the third unsigned char \param cChar The unsigned char where the results will be stored \param aChar One of the unsigned char to be multiplied \param bChar One of the unsigned char to be multiplied \param num_points The number of unsigned char values in aChar and bChar to be multiplied together and stored into cChar */ -static inline void volk_gnsssdr_8u_x2_multiply_8u_u_sse3(unsigned char* cChar, const unsigned char* aChar, const unsigned char* bChar, unsigned int num_points){ - +static inline void volk_gnsssdr_8u_x2_multiply_8u_u_sse3(unsigned char* cChar, const unsigned char* aChar, const unsigned char* bChar, unsigned int num_points) +{ const unsigned int sse_iters = num_points / 16; - + __m128i x, y, x1, x2, y1, y2, mult1, x1_mult_y1, x2_mult_y2, tmp, tmp1, tmp2, totalc; unsigned char* c = cChar; const unsigned char* a = aChar; const unsigned char* b = bChar; - - for(unsigned int number = 0;number < sse_iters; number++){ - x = _mm_lddqu_si128((__m128i*)a); - y = _mm_lddqu_si128((__m128i*)b); - - mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); - x1 = _mm_srli_si128 (x, 1); - x1 = _mm_and_si128 (x1, mult1); - x2 = _mm_and_si128 (x, mult1); - - y1 = _mm_srli_si128 (y, 1); - y1 = _mm_and_si128 (y1, mult1); - y2 = _mm_and_si128 (y, mult1); - - x1_mult_y1 = _mm_mullo_epi16 (x1, y1); - x2_mult_y2 = _mm_mullo_epi16 (x2, y2); - - tmp = _mm_and_si128 (x1_mult_y1, mult1); - tmp1 = _mm_slli_si128 (tmp, 1); - tmp2 = _mm_and_si128 (x2_mult_y2, mult1); - totalc = _mm_or_si128 (tmp1, tmp2); - - _mm_storeu_si128((__m128i*)c, totalc); - - a += 16; - b += 16; - c += 16; - } - + + for(unsigned int number = 0;number < sse_iters; number++) + { + x = _mm_lddqu_si128((__m128i*)a); + y = _mm_lddqu_si128((__m128i*)b); + + mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); + x1 = _mm_srli_si128 (x, 1); + x1 = _mm_and_si128 (x1, mult1); + x2 = _mm_and_si128 (x, mult1); + + y1 = _mm_srli_si128 (y, 1); + y1 = _mm_and_si128 (y1, mult1); + y2 = _mm_and_si128 (y, mult1); + + x1_mult_y1 = _mm_mullo_epi16 (x1, y1); + x2_mult_y2 = _mm_mullo_epi16 (x2, y2); + + tmp = _mm_and_si128 (x1_mult_y1, mult1); + tmp1 = _mm_slli_si128 (tmp, 1); + tmp2 = _mm_and_si128 (x2_mult_y2, mult1); + totalc = _mm_or_si128 (tmp1, tmp2); + + _mm_storeu_si128((__m128i*)c, totalc); + + a += 16; + b += 16; + c += 16; + } + for (unsigned int i = 0; i<(num_points % 16); ++i) - { - *c++ = (*a++) * (*b++); - } + { + *c++ = (*a++) * (*b++); + } } #endif /* LV_HAVE_SSE3 */ @@ -99,14 +100,16 @@ static inline void volk_gnsssdr_8u_x2_multiply_8u_u_sse3(unsigned char* cChar, c \param bChar One of the unsigned char to be multiplied \param num_points The number of unsigned char values in aChar and bChar to be multiplied together and stored into cChar */ 
-static inline void volk_gnsssdr_8u_x2_multiply_8u_generic(unsigned char* cChar, const unsigned char* aChar, const unsigned char* bChar, unsigned int num_points){ +static inline void volk_gnsssdr_8u_x2_multiply_8u_generic(unsigned char* cChar, const unsigned char* aChar, const unsigned char* bChar, unsigned int num_points) +{ unsigned char* cPtr = cChar; const unsigned char* aPtr = aChar; const unsigned char* bPtr = bChar; - - for(unsigned int number = 0; number < num_points; number++){ - *cPtr++ = (*aPtr++) * (*bPtr++); - } + + for(unsigned int number = 0; number < num_points; number++) + { + *cPtr++ = (*aPtr++) * (*bPtr++); + } } #endif /* LV_HAVE_GENERIC */ @@ -129,47 +132,48 @@ static inline void volk_gnsssdr_8u_x2_multiply_8u_generic(unsigned char* cChar, \param bChar One of the unsigned char to be multiplied \param num_points The number of unsigned char values in aChar and bChar to be multiplied together and stored into cChar */ -static inline void volk_gnsssdr_8u_x2_multiply_8u_a_sse3(unsigned char* cChar, const unsigned char* aChar, const unsigned char* bChar, unsigned int num_points){ - +static inline void volk_gnsssdr_8u_x2_multiply_8u_a_sse3(unsigned char* cChar, const unsigned char* aChar, const unsigned char* bChar, unsigned int num_points) +{ const unsigned int sse_iters = num_points / 16; - + __m128i x, y, x1, x2, y1, y2, mult1, x1_mult_y1, x2_mult_y2, tmp, tmp1, tmp2, totalc; unsigned char* c = cChar; const unsigned char* a = aChar; const unsigned char* b = bChar; - - for(unsigned int number = 0;number < sse_iters; number++){ - x = _mm_load_si128((__m128i*)a); - y = _mm_load_si128((__m128i*)b); - - mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); - x1 = _mm_srli_si128 (x, 1); - x1 = _mm_and_si128 (x1, mult1); - x2 = _mm_and_si128 (x, mult1); - - y1 = _mm_srli_si128 (y, 1); - y1 = _mm_and_si128 (y1, mult1); - y2 = _mm_and_si128 (y, mult1); - - x1_mult_y1 = _mm_mullo_epi16 (x1, y1); - x2_mult_y2 = _mm_mullo_epi16 (x2, y2); - - tmp = _mm_and_si128 (x1_mult_y1, mult1); - tmp1 = _mm_slli_si128 (tmp, 1); - tmp2 = _mm_and_si128 (x2_mult_y2, mult1); - totalc = _mm_or_si128 (tmp1, tmp2); - - _mm_store_si128((__m128i*)c, totalc); - - a += 16; - b += 16; - c += 16; - } - + + for(unsigned int number = 0;number < sse_iters; number++) + { + x = _mm_load_si128((__m128i*)a); + y = _mm_load_si128((__m128i*)b); + + mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); + x1 = _mm_srli_si128 (x, 1); + x1 = _mm_and_si128 (x1, mult1); + x2 = _mm_and_si128 (x, mult1); + + y1 = _mm_srli_si128 (y, 1); + y1 = _mm_and_si128 (y1, mult1); + y2 = _mm_and_si128 (y, mult1); + + x1_mult_y1 = _mm_mullo_epi16 (x1, y1); + x2_mult_y2 = _mm_mullo_epi16 (x2, y2); + + tmp = _mm_and_si128 (x1_mult_y1, mult1); + tmp1 = _mm_slli_si128 (tmp, 1); + tmp2 = _mm_and_si128 (x2_mult_y2, mult1); + totalc = _mm_or_si128 (tmp1, tmp2); + + _mm_store_si128((__m128i*)c, totalc); + + a += 16; + b += 16; + c += 16; + } + for (unsigned int i = 0; i<(num_points % 16); ++i) - { - *c++ = (*a++) * (*b++); - } + { + *c++ = (*a++) * (*b++); + } } #endif /* LV_HAVE_SSE */ @@ -181,14 +185,16 @@ static inline void volk_gnsssdr_8u_x2_multiply_8u_a_sse3(unsigned char* cChar, c \param bChar One of the unsigned char to be multiplied \param num_points The number of unsigned char values in aChar and bChar to be multiplied together and stored into cChar */ -static inline void volk_gnsssdr_8u_x2_multiply_8u_a_generic(unsigned char* cChar, const unsigned char* aChar, const unsigned 
char* bChar, unsigned int num_points){ +static inline void volk_gnsssdr_8u_x2_multiply_8u_a_generic(unsigned char* cChar, const unsigned char* aChar, const unsigned char* bChar, unsigned int num_points) +{ unsigned char* cPtr = cChar; const unsigned char* aPtr = aChar; const unsigned char* bPtr = bChar; - - for(unsigned int number = 0; number < num_points; number++){ - *cPtr++ = (*aPtr++) * (*bPtr++); - } + + for(unsigned int number = 0; number < num_points; number++) + { + *cPtr++ = (*aPtr++) * (*bPtr++); + } } #endif /* LV_HAVE_GENERIC */ @@ -201,7 +207,8 @@ static inline void volk_gnsssdr_8u_x2_multiply_8u_a_generic(unsigned char* cChar \param num_points The number of unsigned char values in aChar and bChar to be multiplied together and stored into cChar */ extern void volk_gnsssdr_8u_x2_multiply_8u_a_orc_impl(unsigned char* cVector, const unsigned char* aVector, const unsigned char* bVector, unsigned int num_points); -static inline void volk_gnsssdr_8u_x2_multiply_8u_u_orc(unsigned char* cVector, const unsigned char* aVector, const unsigned char* bVector, unsigned int num_points){ +static inline void volk_gnsssdr_8u_x2_multiply_8u_u_orc(unsigned char* cVector, const unsigned char* aVector, const unsigned char* bVector, unsigned int num_points) +{ volk_gnsssdr_8u_x2_multiply_8u_a_orc_impl(cVector, aVector, bVector, num_points); } #endif /* LV_HAVE_ORC */ diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_s32f_x2_update_local_carrier_32fc.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_s32f_x2_update_local_carrier_32fc.h index 4bb91f9f0..7295d4819 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_s32f_x2_update_local_carrier_32fc.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_s32f_x2_update_local_carrier_32fc.h @@ -1,11 +1,11 @@ /*! * \file volk_gnsssdr_32fc_s32f_x2_update_local_carrier_32fc - * \brief Volk protokernel: replaces the tracking function for update_local_carrier. Algorithm by Julien Pommier and Giovanni Garberoglio, modified by Andrés Cecilia. + * \brief Volk protokernel: replaces the tracking function for update_local_carrier. Algorithm by Julien Pommier and Giovanni Garberoglio, modified by Andres Cecilia. * \authors
- * • Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+ * • Andres Cecilia, 2014. a.cecilia.luque(at)gmail.com
 *
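 * A minimal usage sketch (illustrative only; N and the example phase values are assumed by
 * this note, they are not part of the header):
 *
 *     lv_32fc_t carr[N];
 *     volk_gnsssdr_s32f_x2_update_local_carrier_32fc_generic(carr, 0.0f, 0.1f, N);
 *     // carr[i] = cos(phase) - j*sin(phase), with phase = 0.0f + i*0.1f rad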
* - * Volk protokernel that replaces the tracking function for update_local_carrier. Algorithm by Julien Pommier and Giovanni Garberoglio, modified by Andrés Cecilia. + * Volk protokernel that replaces the tracking function for update_local_carrier. Algorithm by Julien Pommier and Giovanni Garberoglio, modified by Andres Cecilia. * * ------------------------------------------------------------------------- * @@ -27,7 +27,7 @@ * misrepresented as being the original software. * 3. This notice may not be removed or altered from any source distribution. * - *(this is the zlib license) + * (this is the zlib license) * * ------------------------------------------------------------------------- * @@ -49,7 +49,7 @@ * GNSS-SDR is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or - * at your option) any later version. + * (at your option) any later version. * * GNSS-SDR is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of @@ -77,15 +77,15 @@ \param inputBuffer The buffer of data to be accumulated \param num_points The number of values in inputBuffer to be accumulated */ -static inline void volk_gnsssdr_s32f_x2_update_local_carrier_32fc_u_avx(lv_32fc_t* d_carr_sign, const float phase_rad_init, const float phase_step_rad, unsigned int num_points){ - -// float* pointer1 = (float*)&phase_rad_init; -// *pointer1 = 0; -// float* pointer2 = (float*)&phase_step_rad; -// *pointer2 = 0.5; - +static inline void volk_gnsssdr_s32f_x2_update_local_carrier_32fc_u_avx(lv_32fc_t* d_carr_sign, const float phase_rad_init, const float phase_step_rad, unsigned int num_points) +{ + // float* pointer1 = (float*)&phase_rad_init; + // *pointer1 = 0; + // float* pointer2 = (float*)&phase_step_rad; + // *pointer2 = 0.5; + const unsigned int sse_iters = num_points / 8; - + __m256 _ps256_minus_cephes_DP1 = _mm256_set1_ps(-0.78515625f); __m256 _ps256_minus_cephes_DP2 = _mm256_set1_ps(-2.4187564849853515625e-4f); __m256 _ps256_minus_cephes_DP3 = _mm256_set1_ps(-3.77489497744594108e-8f); @@ -103,178 +103,177 @@ static inline void volk_gnsssdr_s32f_x2_update_local_carrier_32fc_u_avx(lv_32fc_ __m256 _ps256_coscof_p2 = _mm256_set1_ps( 4.166664568298827E-002f); __m256 _ps256_1 = _mm256_set1_ps(1.f); __m256 _ps256_0p5 = _mm256_set1_ps(0.5f); - + __m256 phase_step_rad_array = _mm256_set1_ps(8*phase_step_rad); - + __m256 phase_rad_array, x, s, c, swap_sign_bit_sin, sign_bit_cos, poly_mask, z, tmp, y, y2, ysin1, ysin2; __m256 xmm1, xmm2, xmm3, sign_bit_sin; __m256i imm0, imm2, imm4, tmp256i; __m128i imm0_1, imm0_2, imm2_1, imm2_2, imm4_1, imm4_2; __VOLK_ATTR_ALIGNED(32) float sin_value[8]; __VOLK_ATTR_ALIGNED(32) float cos_value[8]; - - phase_rad_array = _mm256_set_ps (phase_rad_init+7*phase_step_rad, phase_rad_init+6*phase_step_rad, phase_rad_init+5*phase_step_rad, phase_rad_init+4*phase_step_rad, phase_rad_init+3*phase_step_rad, phase_rad_init+2*phase_step_rad, phase_rad_init+phase_step_rad, phase_rad_init); - - for(int i = 0; i < sse_iters; i++) - { - - x = phase_rad_array; - - /* extract the sign bit (upper one) */ - sign_bit_sin = _mm256_and_ps(x, _ps256_sign_mask); - - /* take the absolute value */ - x = _mm256_xor_ps(x, sign_bit_sin); - - /* scale by 4/Pi */ - y = _mm256_mul_ps(x, _ps256_cephes_FOPI); - - /* we use SSE2 routines to perform the integer ops */ - - //COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y),imm2_1,imm2_2); - tmp256i = 
_mm256_cvttps_epi32(y); - imm2_1 = _mm256_extractf128_si256 (tmp256i, 0); - imm2_2 = _mm256_extractf128_si256 (tmp256i, 1); - - imm2_1 = _mm_add_epi32(imm2_1, _pi32avx_1); - imm2_2 = _mm_add_epi32(imm2_2, _pi32avx_1); - - imm2_1 = _mm_and_si128(imm2_1, _pi32avx_inv1); - imm2_2 = _mm_and_si128(imm2_2, _pi32avx_inv1); - - //COPY_XMM_TO_IMM(imm2_1,imm2_2,imm2); - //_mm256_set_m128i not defined in some versions of immintrin.h - //imm2 = _mm256_set_m128i (imm2_2, imm2_1); - imm2 = _mm256_insertf128_si256(_mm256_castsi128_si256(imm2_1),(imm2_2),1); - - y = _mm256_cvtepi32_ps(imm2); - - imm4_1 = imm2_1; - imm4_2 = imm2_2; - - imm0_1 = _mm_and_si128(imm2_1, _pi32avx_4); - imm0_2 = _mm_and_si128(imm2_2, _pi32avx_4); - - imm0_1 = _mm_slli_epi32(imm0_1, 29); - imm0_2 = _mm_slli_epi32(imm0_2, 29); - - //COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0); - //_mm256_set_m128i not defined in some versions of immintrin.h - //imm0 = _mm256_set_m128i (imm0_2, imm0_1); - imm0 = _mm256_insertf128_si256(_mm256_castsi128_si256(imm0_1),(imm0_2),1); - - imm2_1 = _mm_and_si128(imm2_1, _pi32avx_2); - imm2_2 = _mm_and_si128(imm2_2, _pi32avx_2); - - imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128()); - imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128()); - - //COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2); - //_mm256_set_m128i not defined in some versions of immintrin.h - //imm2 = _mm256_set_m128i (imm2_2, imm2_1); - imm2 = _mm256_insertf128_si256(_mm256_castsi128_si256(imm2_1),(imm2_2),1); - swap_sign_bit_sin = _mm256_castsi256_ps(imm0); - poly_mask = _mm256_castsi256_ps(imm2); - - /* The magic pass: "Extended precision modular arithmetic" + phase_rad_array = _mm256_set_ps (phase_rad_init+7*phase_step_rad, phase_rad_init+6*phase_step_rad, phase_rad_init+5*phase_step_rad, phase_rad_init+4*phase_step_rad, phase_rad_init+3*phase_step_rad, phase_rad_init+2*phase_step_rad, phase_rad_init+phase_step_rad, phase_rad_init); + + for(int i = 0; i < sse_iters; i++) + { + x = phase_rad_array; + + /* extract the sign bit (upper one) */ + sign_bit_sin = _mm256_and_ps(x, _ps256_sign_mask); + + /* take the absolute value */ + x = _mm256_xor_ps(x, sign_bit_sin); + + /* scale by 4/Pi */ + y = _mm256_mul_ps(x, _ps256_cephes_FOPI); + + /* we use SSE2 routines to perform the integer ops */ + + //COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y),imm2_1,imm2_2); + tmp256i = _mm256_cvttps_epi32(y); + imm2_1 = _mm256_extractf128_si256 (tmp256i, 0); + imm2_2 = _mm256_extractf128_si256 (tmp256i, 1); + + imm2_1 = _mm_add_epi32(imm2_1, _pi32avx_1); + imm2_2 = _mm_add_epi32(imm2_2, _pi32avx_1); + + imm2_1 = _mm_and_si128(imm2_1, _pi32avx_inv1); + imm2_2 = _mm_and_si128(imm2_2, _pi32avx_inv1); + + //COPY_XMM_TO_IMM(imm2_1,imm2_2,imm2); + //_mm256_set_m128i not defined in some versions of immintrin.h + //imm2 = _mm256_set_m128i (imm2_2, imm2_1); + imm2 = _mm256_insertf128_si256(_mm256_castsi128_si256(imm2_1),(imm2_2),1); + + y = _mm256_cvtepi32_ps(imm2); + + imm4_1 = imm2_1; + imm4_2 = imm2_2; + + imm0_1 = _mm_and_si128(imm2_1, _pi32avx_4); + imm0_2 = _mm_and_si128(imm2_2, _pi32avx_4); + + imm0_1 = _mm_slli_epi32(imm0_1, 29); + imm0_2 = _mm_slli_epi32(imm0_2, 29); + + //COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0); + //_mm256_set_m128i not defined in some versions of immintrin.h + //imm0 = _mm256_set_m128i (imm0_2, imm0_1); + imm0 = _mm256_insertf128_si256(_mm256_castsi128_si256(imm0_1),(imm0_2),1); + + imm2_1 = _mm_and_si128(imm2_1, _pi32avx_2); + imm2_2 = _mm_and_si128(imm2_2, _pi32avx_2); + + imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128()); + imm2_2 = 
_mm_cmpeq_epi32(imm2_2, _mm_setzero_si128()); + + //COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2); + //_mm256_set_m128i not defined in some versions of immintrin.h + //imm2 = _mm256_set_m128i (imm2_2, imm2_1); + imm2 = _mm256_insertf128_si256(_mm256_castsi128_si256(imm2_1),(imm2_2),1); + + swap_sign_bit_sin = _mm256_castsi256_ps(imm0); + poly_mask = _mm256_castsi256_ps(imm2); + + /* The magic pass: "Extended precision modular arithmetic" x = ((x - y * DP1) - y * DP2) - y * DP3; */ - xmm1 = _ps256_minus_cephes_DP1; - xmm2 = _ps256_minus_cephes_DP2; - xmm3 = _ps256_minus_cephes_DP3; - xmm1 = _mm256_mul_ps(y, xmm1); - xmm2 = _mm256_mul_ps(y, xmm2); - xmm3 = _mm256_mul_ps(y, xmm3); - x = _mm256_add_ps(x, xmm1); - x = _mm256_add_ps(x, xmm2); - x = _mm256_add_ps(x, xmm3); - - imm4_1 = _mm_sub_epi32(imm4_1, _pi32avx_2); - imm4_2 = _mm_sub_epi32(imm4_2, _pi32avx_2); - - imm4_1 = _mm_andnot_si128(imm4_1, _pi32avx_4); - imm4_2 = _mm_andnot_si128(imm4_2, _pi32avx_4); - - imm4_1 = _mm_slli_epi32(imm4_1, 29); - imm4_2 = _mm_slli_epi32(imm4_2, 29); - - //COPY_XMM_TO_IMM(imm4_1, imm4_2, imm4); - //_mm256_set_m128i not defined in some versions of immintrin.h - //imm4 = _mm256_set_m128i (imm4_2, imm4_1); - imm4 = _mm256_insertf128_si256(_mm256_castsi128_si256(imm4_1),(imm4_2),1); - - sign_bit_cos = _mm256_castsi256_ps(imm4); - - sign_bit_sin = _mm256_xor_ps(sign_bit_sin, swap_sign_bit_sin); - - /* Evaluate the first polynom (0 <= x <= Pi/4) */ - z = _mm256_mul_ps(x,x); - y = _ps256_coscof_p0; - - y = _mm256_mul_ps(y, z); - y = _mm256_add_ps(y, _ps256_coscof_p1); - y = _mm256_mul_ps(y, z); - y = _mm256_add_ps(y, _ps256_coscof_p2); - y = _mm256_mul_ps(y, z); - y = _mm256_mul_ps(y, z); - tmp = _mm256_mul_ps(z, _ps256_0p5); - y = _mm256_sub_ps(y, tmp); - y = _mm256_add_ps(y, _ps256_1); - - /* Evaluate the second polynom (Pi/4 <= x <= 0) */ - - y2 = _ps256_sincof_p0; - y2 = _mm256_mul_ps(y2, z); - y2 = _mm256_add_ps(y2, _ps256_sincof_p1); - y2 = _mm256_mul_ps(y2, z); - y2 = _mm256_add_ps(y2, _ps256_sincof_p2); - y2 = _mm256_mul_ps(y2, z); - y2 = _mm256_mul_ps(y2, x); - y2 = _mm256_add_ps(y2, x); - - /* select the correct result from the two polynoms */ - xmm3 = poly_mask; - ysin2 = _mm256_and_ps(xmm3, y2); - ysin1 = _mm256_andnot_ps(xmm3, y); - y2 = _mm256_sub_ps(y2,ysin2); - y = _mm256_sub_ps(y, ysin1); - - xmm1 = _mm256_add_ps(ysin1,ysin2); - xmm2 = _mm256_add_ps(y,y2); - - /* update the sign */ - s = _mm256_xor_ps(xmm1, sign_bit_sin); - c = _mm256_xor_ps(xmm2, sign_bit_cos); - - //GNSS-SDR needs to return -sin - s = _mm256_xor_ps(s, _ps256_sign_mask); - - _mm256_storeu_ps ((float*)sin_value, s); - _mm256_storeu_ps ((float*)cos_value, c); - - for(int i = 0; i < 8; i++) - { - d_carr_sign[i] = lv_cmake(cos_value[i], sin_value[i]); + xmm1 = _ps256_minus_cephes_DP1; + xmm2 = _ps256_minus_cephes_DP2; + xmm3 = _ps256_minus_cephes_DP3; + xmm1 = _mm256_mul_ps(y, xmm1); + xmm2 = _mm256_mul_ps(y, xmm2); + xmm3 = _mm256_mul_ps(y, xmm3); + x = _mm256_add_ps(x, xmm1); + x = _mm256_add_ps(x, xmm2); + x = _mm256_add_ps(x, xmm3); + + imm4_1 = _mm_sub_epi32(imm4_1, _pi32avx_2); + imm4_2 = _mm_sub_epi32(imm4_2, _pi32avx_2); + + imm4_1 = _mm_andnot_si128(imm4_1, _pi32avx_4); + imm4_2 = _mm_andnot_si128(imm4_2, _pi32avx_4); + + imm4_1 = _mm_slli_epi32(imm4_1, 29); + imm4_2 = _mm_slli_epi32(imm4_2, 29); + + //COPY_XMM_TO_IMM(imm4_1, imm4_2, imm4); + //_mm256_set_m128i not defined in some versions of immintrin.h + //imm4 = _mm256_set_m128i (imm4_2, imm4_1); + imm4 = _mm256_insertf128_si256(_mm256_castsi128_si256(imm4_1),(imm4_2),1); + + 
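+            //(note) each lane of imm4 now holds either 0 or 0x80000000: the sign that the
+            //cosine picks up in that lane's quadrant. Casting to float and XOR-ing it in
+            //below applies the sign without a branch.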
sign_bit_cos = _mm256_castsi256_ps(imm4); + + sign_bit_sin = _mm256_xor_ps(sign_bit_sin, swap_sign_bit_sin); + + /* Evaluate the first polynom (0 <= x <= Pi/4) */ + z = _mm256_mul_ps(x,x); + y = _ps256_coscof_p0; + + y = _mm256_mul_ps(y, z); + y = _mm256_add_ps(y, _ps256_coscof_p1); + y = _mm256_mul_ps(y, z); + y = _mm256_add_ps(y, _ps256_coscof_p2); + y = _mm256_mul_ps(y, z); + y = _mm256_mul_ps(y, z); + tmp = _mm256_mul_ps(z, _ps256_0p5); + y = _mm256_sub_ps(y, tmp); + y = _mm256_add_ps(y, _ps256_1); + + /* Evaluate the second polynom (Pi/4 <= x <= 0) */ + + y2 = _ps256_sincof_p0; + y2 = _mm256_mul_ps(y2, z); + y2 = _mm256_add_ps(y2, _ps256_sincof_p1); + y2 = _mm256_mul_ps(y2, z); + y2 = _mm256_add_ps(y2, _ps256_sincof_p2); + y2 = _mm256_mul_ps(y2, z); + y2 = _mm256_mul_ps(y2, x); + y2 = _mm256_add_ps(y2, x); + + /* select the correct result from the two polynoms */ + xmm3 = poly_mask; + ysin2 = _mm256_and_ps(xmm3, y2); + ysin1 = _mm256_andnot_ps(xmm3, y); + y2 = _mm256_sub_ps(y2,ysin2); + y = _mm256_sub_ps(y, ysin1); + + xmm1 = _mm256_add_ps(ysin1,ysin2); + xmm2 = _mm256_add_ps(y,y2); + + /* update the sign */ + s = _mm256_xor_ps(xmm1, sign_bit_sin); + c = _mm256_xor_ps(xmm2, sign_bit_cos); + + //GNSS-SDR needs to return -sin + s = _mm256_xor_ps(s, _ps256_sign_mask); + + _mm256_storeu_ps ((float*)sin_value, s); + _mm256_storeu_ps ((float*)cos_value, c); + + for(int i = 0; i < 8; i++) + { + d_carr_sign[i] = lv_cmake(cos_value[i], sin_value[i]); + } + d_carr_sign += 8; + + phase_rad_array = _mm256_add_ps (phase_rad_array, phase_step_rad_array); } - d_carr_sign += 8; - - phase_rad_array = _mm256_add_ps (phase_rad_array, phase_step_rad_array); - } - + if (num_points%8!=0) - { - __VOLK_ATTR_ALIGNED(32) float phase_rad_store[8]; - _mm256_storeu_ps ((float*)phase_rad_store, phase_rad_array); - - float phase_rad = phase_rad_store[0]; - - for(int i = 0; i < num_points%8; i++) { - *d_carr_sign = lv_cmake(cos(phase_rad), -sin(phase_rad)); - d_carr_sign++; - phase_rad += phase_step_rad; + __VOLK_ATTR_ALIGNED(32) float phase_rad_store[8]; + _mm256_storeu_ps ((float*)phase_rad_store, phase_rad_array); + + float phase_rad = phase_rad_store[0]; + + for(int i = 0; i < num_points%8; i++) + { + *d_carr_sign = lv_cmake(cos(phase_rad), -sin(phase_rad)); + d_carr_sign++; + phase_rad += phase_step_rad; + } } - } } #endif /* LV_HAVE_AVX */ @@ -286,16 +285,16 @@ static inline void volk_gnsssdr_s32f_x2_update_local_carrier_32fc_u_avx(lv_32fc_ \param result The accumulated result \param inputBuffer The buffer of data to be accumulated \param num_points The number of values in inputBuffer to be accumulated -*/ -static inline void volk_gnsssdr_s32f_x2_update_local_carrier_32fc_u_sse2(lv_32fc_t* d_carr_sign, const float phase_rad_init, const float phase_step_rad, unsigned int num_points){ - -// float* pointer1 = (float*)&phase_rad_init; -// *pointer1 = 0; -// float* pointer2 = (float*)&phase_step_rad; -// *pointer2 = 0.5; - + */ +static inline void volk_gnsssdr_s32f_x2_update_local_carrier_32fc_u_sse2(lv_32fc_t* d_carr_sign, const float phase_rad_init, const float phase_step_rad, unsigned int num_points) +{ + // float* pointer1 = (float*)&phase_rad_init; + // *pointer1 = 0; + // float* pointer2 = (float*)&phase_step_rad; + // *pointer2 = 0.5; + const unsigned int sse_iters = num_points / 4; - + __m128 _ps_minus_cephes_DP1 = _mm_set1_ps(-0.78515625f); __m128 _ps_minus_cephes_DP2 = _mm_set1_ps(-2.4187564849853515625e-4f); __m128 _ps_minus_cephes_DP3 = _mm_set1_ps(-3.77489497744594108e-8f); @@ -313,128 +312,128 @@ 
static inline void volk_gnsssdr_s32f_x2_update_local_carrier_32fc_u_sse2(lv_32fc __m128 _ps_coscof_p2 = _mm_set1_ps( 4.166664568298827E-002f); __m128 _ps_1 = _mm_set1_ps(1.f); __m128 _ps_0p5 = _mm_set1_ps(0.5f); - + __m128 phase_step_rad_array = _mm_set1_ps(4*phase_step_rad); - + __m128 phase_rad_array, x, s, c, swap_sign_bit_sin, sign_bit_cos, poly_mask, z, tmp, y, y2, ysin1, ysin2; __m128 xmm1, xmm2, xmm3, sign_bit_sin; __m128i emm0, emm2, emm4; __VOLK_ATTR_ALIGNED(16) float sin_value[4]; __VOLK_ATTR_ALIGNED(16) float cos_value[4]; - + phase_rad_array = _mm_set_ps (phase_rad_init+3*phase_step_rad, phase_rad_init+2*phase_step_rad, phase_rad_init+phase_step_rad, phase_rad_init); - + for(unsigned int i = 0; i < sse_iters; i++) - { - x = phase_rad_array; - - /* extract the sign bit (upper one) */ - sign_bit_sin = _mm_and_ps(x, _ps_sign_mask); - - /* take the absolute value */ - x = _mm_xor_ps(x, sign_bit_sin); - - /* scale by 4/Pi */ - y = _mm_mul_ps(x, _ps_cephes_FOPI); - - /* store the integer part of y in emm2 */ - emm2 = _mm_cvttps_epi32(y); - - /* j=(j+1) & (~1) (see the cephes sources) */ - emm2 = _mm_add_epi32(emm2, _pi32_1); - emm2 = _mm_and_si128(emm2, _pi32_inv1); - y = _mm_cvtepi32_ps(emm2); - - emm4 = emm2; - - /* get the swap sign flag for the sine */ - emm0 = _mm_and_si128(emm2, _pi32_4); - emm0 = _mm_slli_epi32(emm0, 29); - swap_sign_bit_sin = _mm_castsi128_ps(emm0); - - /* get the polynom selection mask for the sine*/ - emm2 = _mm_and_si128(emm2, _pi32_2); - emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128()); - poly_mask = _mm_castsi128_ps(emm2); - - /* The magic pass: "Extended precision modular arithmetic" + { + x = phase_rad_array; + + /* extract the sign bit (upper one) */ + sign_bit_sin = _mm_and_ps(x, _ps_sign_mask); + + /* take the absolute value */ + x = _mm_xor_ps(x, sign_bit_sin); + + /* scale by 4/Pi */ + y = _mm_mul_ps(x, _ps_cephes_FOPI); + + /* store the integer part of y in emm2 */ + emm2 = _mm_cvttps_epi32(y); + + /* j=(j+1) & (~1) (see the cephes sources) */ + emm2 = _mm_add_epi32(emm2, _pi32_1); + emm2 = _mm_and_si128(emm2, _pi32_inv1); + y = _mm_cvtepi32_ps(emm2); + + emm4 = emm2; + + /* get the swap sign flag for the sine */ + emm0 = _mm_and_si128(emm2, _pi32_4); + emm0 = _mm_slli_epi32(emm0, 29); + swap_sign_bit_sin = _mm_castsi128_ps(emm0); + + /* get the polynom selection mask for the sine*/ + emm2 = _mm_and_si128(emm2, _pi32_2); + emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128()); + poly_mask = _mm_castsi128_ps(emm2); + + /* The magic pass: "Extended precision modular arithmetic" x = ((x - y * DP1) - y * DP2) - y * DP3; */ - xmm1 = _mm_mul_ps(y, _ps_minus_cephes_DP1); - xmm2 = _mm_mul_ps(y, _ps_minus_cephes_DP2); - xmm3 = _mm_mul_ps(y, _ps_minus_cephes_DP3); - x = _mm_add_ps(_mm_add_ps(x, xmm1), _mm_add_ps(xmm2, xmm3)); - - emm4 = _mm_sub_epi32(emm4, _pi32_2); - emm4 = _mm_andnot_si128(emm4, _pi32_4); - emm4 = _mm_slli_epi32(emm4, 29); - sign_bit_cos = _mm_castsi128_ps(emm4); - - sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin); - - /* Evaluate the first polynom (0 <= x <= Pi/4) */ - z = _mm_mul_ps(x,x); - y = _ps_coscof_p0; - y = _mm_mul_ps(y, z); - y = _mm_add_ps(y, _ps_coscof_p1); - y = _mm_mul_ps(y, z); - y = _mm_add_ps(y, _ps_coscof_p2); - y = _mm_mul_ps(y, _mm_mul_ps(z, z)); - tmp = _mm_mul_ps(z, _ps_0p5); - y = _mm_sub_ps(y, tmp); - y = _mm_add_ps(y, _ps_1); - - /* Evaluate the second polynom (Pi/4 <= x <= 0) */ - y2 = _ps_sincof_p0; - y2 = _mm_mul_ps(y2, z); - y2 = _mm_add_ps(y2, _ps_sincof_p1); - y2 = _mm_mul_ps(y2, z); - y2 = 
_mm_add_ps(y2, _ps_sincof_p2); - y2 = _mm_mul_ps(y2, _mm_mul_ps(z, x)); - y2 = _mm_add_ps(y2, x); + xmm1 = _mm_mul_ps(y, _ps_minus_cephes_DP1); + xmm2 = _mm_mul_ps(y, _ps_minus_cephes_DP2); + xmm3 = _mm_mul_ps(y, _ps_minus_cephes_DP3); + x = _mm_add_ps(_mm_add_ps(x, xmm1), _mm_add_ps(xmm2, xmm3)); - /* select the correct result from the two polynoms */ - xmm3 = poly_mask; - ysin2 = _mm_and_ps(xmm3, y2); - ysin1 = _mm_andnot_ps(xmm3, y); - y2 = _mm_sub_ps(y2,ysin2); - y = _mm_sub_ps(y, ysin1); - - xmm1 = _mm_add_ps(ysin1,ysin2); - xmm2 = _mm_add_ps(y,y2); - - /* update the sign */ - s = _mm_xor_ps(xmm1, sign_bit_sin); - c = _mm_xor_ps(xmm2, sign_bit_cos); - - //GNSS-SDR needs to return -sin - s = _mm_xor_ps(s, _ps_sign_mask); + emm4 = _mm_sub_epi32(emm4, _pi32_2); + emm4 = _mm_andnot_si128(emm4, _pi32_4); + emm4 = _mm_slli_epi32(emm4, 29); + sign_bit_cos = _mm_castsi128_ps(emm4); - _mm_storeu_ps ((float*)sin_value, s); - _mm_storeu_ps ((float*)cos_value, c); - - for(unsigned int e = 0; e < 4; e++) - { - d_carr_sign[e] = lv_cmake(cos_value[e], sin_value[e]); + sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin); + + /* Evaluate the first polynom (0 <= x <= Pi/4) */ + z = _mm_mul_ps(x,x); + y = _ps_coscof_p0; + y = _mm_mul_ps(y, z); + y = _mm_add_ps(y, _ps_coscof_p1); + y = _mm_mul_ps(y, z); + y = _mm_add_ps(y, _ps_coscof_p2); + y = _mm_mul_ps(y, _mm_mul_ps(z, z)); + tmp = _mm_mul_ps(z, _ps_0p5); + y = _mm_sub_ps(y, tmp); + y = _mm_add_ps(y, _ps_1); + + /* Evaluate the second polynom (Pi/4 <= x <= 0) */ + y2 = _ps_sincof_p0; + y2 = _mm_mul_ps(y2, z); + y2 = _mm_add_ps(y2, _ps_sincof_p1); + y2 = _mm_mul_ps(y2, z); + y2 = _mm_add_ps(y2, _ps_sincof_p2); + y2 = _mm_mul_ps(y2, _mm_mul_ps(z, x)); + y2 = _mm_add_ps(y2, x); + + /* select the correct result from the two polynoms */ + xmm3 = poly_mask; + ysin2 = _mm_and_ps(xmm3, y2); + ysin1 = _mm_andnot_ps(xmm3, y); + y2 = _mm_sub_ps(y2,ysin2); + y = _mm_sub_ps(y, ysin1); + + xmm1 = _mm_add_ps(ysin1,ysin2); + xmm2 = _mm_add_ps(y,y2); + + /* update the sign */ + s = _mm_xor_ps(xmm1, sign_bit_sin); + c = _mm_xor_ps(xmm2, sign_bit_cos); + + //GNSS-SDR needs to return -sin + s = _mm_xor_ps(s, _ps_sign_mask); + + _mm_storeu_ps ((float*)sin_value, s); + _mm_storeu_ps ((float*)cos_value, c); + + for(unsigned int e = 0; e < 4; e++) + { + d_carr_sign[e] = lv_cmake(cos_value[e], sin_value[e]); + } + d_carr_sign += 4; + + phase_rad_array = _mm_add_ps (phase_rad_array, phase_step_rad_array); } - d_carr_sign += 4; - - phase_rad_array = _mm_add_ps (phase_rad_array, phase_step_rad_array); - } - + if (num_points%4!=0) - { - __VOLK_ATTR_ALIGNED(16) float phase_rad_store[4]; - _mm_storeu_ps ((float*)phase_rad_store, phase_rad_array); - - float phase_rad = phase_rad_store[0]; - - for(unsigned int i = 0; i < num_points%4; i++) { - *d_carr_sign = lv_cmake(cos(phase_rad), -sin(phase_rad)); - d_carr_sign++; - phase_rad += phase_step_rad; + __VOLK_ATTR_ALIGNED(16) float phase_rad_store[4]; + _mm_storeu_ps ((float*)phase_rad_store, phase_rad_array); + + float phase_rad = phase_rad_store[0]; + + for(unsigned int i = 0; i < num_points%4; i++) + { + *d_carr_sign = lv_cmake(cos(phase_rad), -sin(phase_rad)); + d_carr_sign++; + phase_rad += phase_step_rad; + } } - } } #endif /* LV_HAVE_SSE2 */ @@ -444,21 +443,21 @@ static inline void volk_gnsssdr_s32f_x2_update_local_carrier_32fc_u_sse2(lv_32fc \param result The accumulated result \param inputBuffer The buffer of data to be accumulated \param num_points The number of values in inputBuffer to be accumulated -*/ -static inline 
void volk_gnsssdr_s32f_x2_update_local_carrier_32fc_generic(lv_32fc_t* d_carr_sign, const float phase_rad_init, const float phase_step_rad, unsigned int num_points){ - -// float* pointer1 = (float*)&phase_rad_init; -// *pointer1 = 0; -// float* pointer2 = (float*)&phase_step_rad; -// *pointer2 = 0.5; - + */ +static inline void volk_gnsssdr_s32f_x2_update_local_carrier_32fc_generic(lv_32fc_t* d_carr_sign, const float phase_rad_init, const float phase_step_rad, unsigned int num_points) +{ + // float* pointer1 = (float*)&phase_rad_init; + // *pointer1 = 0; + // float* pointer2 = (float*)&phase_step_rad; + // *pointer2 = 0.5; + float phase_rad = phase_rad_init; for(unsigned int i = 0; i < num_points; i++) - { - *d_carr_sign = lv_cmake(cos(phase_rad), -sin(phase_rad)); - d_carr_sign++; - phase_rad += phase_step_rad; - } + { + *d_carr_sign = lv_cmake(cos(phase_rad), -sin(phase_rad)); + d_carr_sign++; + phase_rad += phase_step_rad; + } } #endif /* LV_HAVE_GENERIC */ #endif /* INCLUDED_volk_gnsssdr_32fc_s32f_x2_update_local_carrier_32fc_u_H */ @@ -479,15 +478,15 @@ static inline void volk_gnsssdr_s32f_x2_update_local_carrier_32fc_generic(lv_32f \param inputBuffer The buffer of data to be accumulated \param num_points The number of values in inputBuffer to be accumulated */ -static inline void volk_gnsssdr_s32f_x2_update_local_carrier_32fc_a_avx(lv_32fc_t* d_carr_sign, const float phase_rad_init, const float phase_step_rad, unsigned int num_points){ - +static inline void volk_gnsssdr_s32f_x2_update_local_carrier_32fc_a_avx(lv_32fc_t* d_carr_sign, const float phase_rad_init, const float phase_step_rad, unsigned int num_points) +{ // float* pointer1 = (float*)&phase_rad_init; // *pointer1 = 0; // float* pointer2 = (float*)&phase_step_rad; // *pointer2 = 0.5; - + const unsigned int sse_iters = num_points / 8; - + __m256 _ps256_minus_cephes_DP1 = _mm256_set1_ps(-0.78515625f); __m256 _ps256_minus_cephes_DP2 = _mm256_set1_ps(-2.4187564849853515625e-4f); __m256 _ps256_minus_cephes_DP3 = _mm256_set1_ps(-3.77489497744594108e-8f); @@ -505,178 +504,178 @@ static inline void volk_gnsssdr_s32f_x2_update_local_carrier_32fc_a_avx(lv_32fc_ __m256 _ps256_coscof_p2 = _mm256_set1_ps( 4.166664568298827E-002f); __m256 _ps256_1 = _mm256_set1_ps(1.f); __m256 _ps256_0p5 = _mm256_set1_ps(0.5f); - + __m256 phase_step_rad_array = _mm256_set1_ps(8*phase_step_rad); - + __m256 phase_rad_array, x, s, c, swap_sign_bit_sin, sign_bit_cos, poly_mask, z, tmp, y, y2, ysin1, ysin2; __m256 xmm1, xmm2, xmm3, sign_bit_sin; __m256i imm0, imm2, imm4, tmp256i; __m128i imm0_1, imm0_2, imm2_1, imm2_2, imm4_1, imm4_2; __VOLK_ATTR_ALIGNED(32) float sin_value[8]; __VOLK_ATTR_ALIGNED(32) float cos_value[8]; - - phase_rad_array = _mm256_set_ps (phase_rad_init+7*phase_step_rad, phase_rad_init+6*phase_step_rad, phase_rad_init+5*phase_step_rad, phase_rad_init+4*phase_step_rad, phase_rad_init+3*phase_step_rad, phase_rad_init+2*phase_step_rad, phase_rad_init+phase_step_rad, phase_rad_init); - + + phase_rad_array = _mm256_set_ps (phase_rad_init + 7*phase_step_rad, phase_rad_init + 6*phase_step_rad, phase_rad_init + 5*phase_step_rad, phase_rad_init + 4*phase_step_rad, phase_rad_init + 3*phase_step_rad, phase_rad_init + 2*phase_step_rad, phase_rad_init + phase_step_rad, phase_rad_init); + for(int i = 0; i < sse_iters; i++) - { - - x = phase_rad_array; - - /* extract the sign bit (upper one) */ - sign_bit_sin = _mm256_and_ps(x, _ps256_sign_mask); - - /* take the absolute value */ - x = _mm256_xor_ps(x, sign_bit_sin); - - /* scale by 4/Pi */ - y = 
_mm256_mul_ps(x, _ps256_cephes_FOPI);
-
-        /* we use SSE2 routines to perform the integer ops */
-
-        //COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y),imm2_1,imm2_2);
-        tmp256i = _mm256_cvttps_epi32(y);
-        imm2_1 = _mm256_extractf128_si256 (tmp256i, 0);
-        imm2_2 = _mm256_extractf128_si256 (tmp256i, 1);
-
-        imm2_1 = _mm_add_epi32(imm2_1, _pi32avx_1);
-        imm2_2 = _mm_add_epi32(imm2_2, _pi32avx_1);
-
-        imm2_1 = _mm_and_si128(imm2_1, _pi32avx_inv1);
-        imm2_2 = _mm_and_si128(imm2_2, _pi32avx_inv1);
-
-        //COPY_XMM_TO_IMM(imm2_1,imm2_2,imm2);
-        //_mm256_set_m128i not defined in some versions of immintrin.h
-        //imm2 = _mm256_set_m128i (imm2_2, imm2_1);
-        imm2 = _mm256_insertf128_si256(_mm256_castsi128_si256(imm2_1),(imm2_2),1);
-
-        y = _mm256_cvtepi32_ps(imm2);
-
-        imm4_1 = imm2_1;
-        imm4_2 = imm2_2;
-
-        imm0_1 = _mm_and_si128(imm2_1, _pi32avx_4);
-        imm0_2 = _mm_and_si128(imm2_2, _pi32avx_4);
-
-        imm0_1 = _mm_slli_epi32(imm0_1, 29);
-        imm0_2 = _mm_slli_epi32(imm0_2, 29);
-
-        //COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);
-        //_mm256_set_m128i not defined in some versions of immintrin.h
-        //imm0 = _mm256_set_m128i (imm0_2, imm0_1);
-        imm0 = _mm256_insertf128_si256(_mm256_castsi128_si256(imm0_1),(imm0_2),1);
-
-        imm2_1 = _mm_and_si128(imm2_1, _pi32avx_2);
-        imm2_2 = _mm_and_si128(imm2_2, _pi32avx_2);
-
-        imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
-        imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());
-
-        //COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
-        //_mm256_set_m128i not defined in some versions of immintrin.h
-        //imm2 = _mm256_set_m128i (imm2_2, imm2_1);
-        imm2 = _mm256_insertf128_si256(_mm256_castsi128_si256(imm2_1),(imm2_2),1);
-
-        swap_sign_bit_sin = _mm256_castsi256_ps(imm0);
-        poly_mask = _mm256_castsi256_ps(imm2);
-
-        /* The magic pass: "Extended precision modular arithmetic
+        {
+
+            x = phase_rad_array;
+
+            /* extract the sign bit (upper one) */
+            sign_bit_sin = _mm256_and_ps(x, _ps256_sign_mask);
+
+            /* take the absolute value */
+            x = _mm256_xor_ps(x, sign_bit_sin);
+
+            /* scale by 4/Pi */
+            y = _mm256_mul_ps(x, _ps256_cephes_FOPI);
+
+            /* we use SSE2 routines to perform the integer ops */
+
+            //COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y),imm2_1,imm2_2);
+            tmp256i = _mm256_cvttps_epi32(y);
+            imm2_1 = _mm256_extractf128_si256 (tmp256i, 0);
+            imm2_2 = _mm256_extractf128_si256 (tmp256i, 1);
+
+            imm2_1 = _mm_add_epi32(imm2_1, _pi32avx_1);
+            imm2_2 = _mm_add_epi32(imm2_2, _pi32avx_1);
+
+            imm2_1 = _mm_and_si128(imm2_1, _pi32avx_inv1);
+            imm2_2 = _mm_and_si128(imm2_2, _pi32avx_inv1);
+
+            //COPY_XMM_TO_IMM(imm2_1,imm2_2,imm2);
+            //_mm256_set_m128i not defined in some versions of immintrin.h
+            //imm2 = _mm256_set_m128i (imm2_2, imm2_1);
+            imm2 = _mm256_insertf128_si256(_mm256_castsi128_si256(imm2_1),(imm2_2),1);
+
+            y = _mm256_cvtepi32_ps(imm2);
+
+            imm4_1 = imm2_1;
+            imm4_2 = imm2_2;
+
+            imm0_1 = _mm_and_si128(imm2_1, _pi32avx_4);
+            imm0_2 = _mm_and_si128(imm2_2, _pi32avx_4);
+
+            imm0_1 = _mm_slli_epi32(imm0_1, 29);
+            imm0_2 = _mm_slli_epi32(imm0_2, 29);
+
+            //COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);
+            //_mm256_set_m128i not defined in some versions of immintrin.h
+            //imm0 = _mm256_set_m128i (imm0_2, imm0_1);
+            imm0 = _mm256_insertf128_si256(_mm256_castsi128_si256(imm0_1),(imm0_2),1);
+
+            imm2_1 = _mm_and_si128(imm2_1, _pi32avx_2);
+            imm2_2 = _mm_and_si128(imm2_2, _pi32avx_2);
+
+            imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
+            imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());
+
+            //COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
+            //_mm256_set_m128i not defined in some versions of immintrin.h
+            //imm2 = _mm256_set_m128i (imm2_2, imm2_1);
+            imm2 = _mm256_insertf128_si256(_mm256_castsi128_si256(imm2_1),(imm2_2),1);
+
+            swap_sign_bit_sin = _mm256_castsi256_ps(imm0);
+            poly_mask = _mm256_castsi256_ps(imm2);
+
+            /* The magic pass: "Extended precision modular arithmetic
    x = ((x - y * DP1) - y * DP2) - y * DP3; */
-        xmm1 = _ps256_minus_cephes_DP1;
-        xmm2 = _ps256_minus_cephes_DP2;
-        xmm3 = _ps256_minus_cephes_DP3;
-        xmm1 = _mm256_mul_ps(y, xmm1);
-        xmm2 = _mm256_mul_ps(y, xmm2);
-        xmm3 = _mm256_mul_ps(y, xmm3);
-        x = _mm256_add_ps(x, xmm1);
-        x = _mm256_add_ps(x, xmm2);
-        x = _mm256_add_ps(x, xmm3);
-
-        imm4_1 = _mm_sub_epi32(imm4_1, _pi32avx_2);
-        imm4_2 = _mm_sub_epi32(imm4_2, _pi32avx_2);
-
-        imm4_1 = _mm_andnot_si128(imm4_1, _pi32avx_4);
-        imm4_2 = _mm_andnot_si128(imm4_2, _pi32avx_4);
-
-        imm4_1 = _mm_slli_epi32(imm4_1, 29);
-        imm4_2 = _mm_slli_epi32(imm4_2, 29);
-
-        //COPY_XMM_TO_IMM(imm4_1, imm4_2, imm4);
-        //_mm256_set_m128i not defined in some versions of immintrin.h
-        //imm4 = _mm256_set_m128i (imm4_2, imm4_1);
-        imm4 = _mm256_insertf128_si256(_mm256_castsi128_si256(imm4_1),(imm4_2),1);
-
-        sign_bit_cos = _mm256_castsi256_ps(imm4);
-
-        sign_bit_sin = _mm256_xor_ps(sign_bit_sin, swap_sign_bit_sin);
-
-        /* Evaluate the first polynom (0 <= x <= Pi/4) */
-        z = _mm256_mul_ps(x,x);
-        y = _ps256_coscof_p0;
-
-        y = _mm256_mul_ps(y, z);
-        y = _mm256_add_ps(y, _ps256_coscof_p1);
-        y = _mm256_mul_ps(y, z);
-        y = _mm256_add_ps(y, _ps256_coscof_p2);
-        y = _mm256_mul_ps(y, z);
-        y = _mm256_mul_ps(y, z);
-        tmp = _mm256_mul_ps(z, _ps256_0p5);
-        y = _mm256_sub_ps(y, tmp);
-        y = _mm256_add_ps(y, _ps256_1);
-
-        /* Evaluate the second polynom (Pi/4 <= x <= 0) */
-
-        y2 = _ps256_sincof_p0;
-        y2 = _mm256_mul_ps(y2, z);
-        y2 = _mm256_add_ps(y2, _ps256_sincof_p1);
-        y2 = _mm256_mul_ps(y2, z);
-        y2 = _mm256_add_ps(y2, _ps256_sincof_p2);
-        y2 = _mm256_mul_ps(y2, z);
-        y2 = _mm256_mul_ps(y2, x);
-        y2 = _mm256_add_ps(y2, x);
-
-        /* select the correct result from the two polynoms */
-        xmm3 = poly_mask;
-        ysin2 = _mm256_and_ps(xmm3, y2);
-        ysin1 = _mm256_andnot_ps(xmm3, y);
-        y2 = _mm256_sub_ps(y2,ysin2);
-        y = _mm256_sub_ps(y, ysin1);
-
-        xmm1 = _mm256_add_ps(ysin1,ysin2);
-        xmm2 = _mm256_add_ps(y,y2);
-
-        /* update the sign */
-        s = _mm256_xor_ps(xmm1, sign_bit_sin);
-        c = _mm256_xor_ps(xmm2, sign_bit_cos);
-
-        //GNSS-SDR needs to return -sin
-        s = _mm256_xor_ps(s, _ps256_sign_mask);
-
-        _mm256_store_ps ((float*)sin_value, s);
-        _mm256_store_ps ((float*)cos_value, c);
-
-        for(int i = 0; i < 8; i++)
-        {
-            d_carr_sign[i] = lv_cmake(cos_value[i], sin_value[i]);
+            xmm1 = _ps256_minus_cephes_DP1;
+            xmm2 = _ps256_minus_cephes_DP2;
+            xmm3 = _ps256_minus_cephes_DP3;
+            xmm1 = _mm256_mul_ps(y, xmm1);
+            xmm2 = _mm256_mul_ps(y, xmm2);
+            xmm3 = _mm256_mul_ps(y, xmm3);
+            x = _mm256_add_ps(x, xmm1);
+            x = _mm256_add_ps(x, xmm2);
+            x = _mm256_add_ps(x, xmm3);
+
+            imm4_1 = _mm_sub_epi32(imm4_1, _pi32avx_2);
+            imm4_2 = _mm_sub_epi32(imm4_2, _pi32avx_2);
+
+            imm4_1 = _mm_andnot_si128(imm4_1, _pi32avx_4);
+            imm4_2 = _mm_andnot_si128(imm4_2, _pi32avx_4);
+
+            imm4_1 = _mm_slli_epi32(imm4_1, 29);
+            imm4_2 = _mm_slli_epi32(imm4_2, 29);
+
+            //COPY_XMM_TO_IMM(imm4_1, imm4_2, imm4);
+            //_mm256_set_m128i not defined in some versions of immintrin.h
+            //imm4 = _mm256_set_m128i (imm4_2, imm4_1);
+            imm4 = _mm256_insertf128_si256(_mm256_castsi128_si256(imm4_1),(imm4_2),1);
+
+            sign_bit_cos = _mm256_castsi256_ps(imm4);
+
+            sign_bit_sin = _mm256_xor_ps(sign_bit_sin, swap_sign_bit_sin);
+
+            /* Evaluate the first polynomial (0 <= x <= Pi/4) */
+            z = _mm256_mul_ps(x,x);
+            y = _ps256_coscof_p0;
+
+            y = _mm256_mul_ps(y, z);
+            y = _mm256_add_ps(y, _ps256_coscof_p1);
+            y = _mm256_mul_ps(y, z);
+            y = _mm256_add_ps(y, _ps256_coscof_p2);
+            y = _mm256_mul_ps(y, z);
+            y = _mm256_mul_ps(y, z);
+            tmp = _mm256_mul_ps(z, _ps256_0p5);
+            y = _mm256_sub_ps(y, tmp);
+            y = _mm256_add_ps(y, _ps256_1);
+
+            /* Evaluate the second polynomial (Pi/4 <= x <= 0) */
+
+            y2 = _ps256_sincof_p0;
+            y2 = _mm256_mul_ps(y2, z);
+            y2 = _mm256_add_ps(y2, _ps256_sincof_p1);
+            y2 = _mm256_mul_ps(y2, z);
+            y2 = _mm256_add_ps(y2, _ps256_sincof_p2);
+            y2 = _mm256_mul_ps(y2, z);
+            y2 = _mm256_mul_ps(y2, x);
+            y2 = _mm256_add_ps(y2, x);
+
+            /* select the correct result from the two polynomials */
+            xmm3 = poly_mask;
+            ysin2 = _mm256_and_ps(xmm3, y2);
+            ysin1 = _mm256_andnot_ps(xmm3, y);
+            y2 = _mm256_sub_ps(y2,ysin2);
+            y = _mm256_sub_ps(y, ysin1);
+
+            xmm1 = _mm256_add_ps(ysin1,ysin2);
+            xmm2 = _mm256_add_ps(y,y2);
+
+            /* update the sign */
+            s = _mm256_xor_ps(xmm1, sign_bit_sin);
+            c = _mm256_xor_ps(xmm2, sign_bit_cos);
+
+            //GNSS-SDR needs to return -sin
+            s = _mm256_xor_ps(s, _ps256_sign_mask);
+
+            _mm256_store_ps ((float*)sin_value, s);
+            _mm256_store_ps ((float*)cos_value, c);
+
+            for(unsigned int i = 0; i < 8; i++)
+                {
+                    d_carr_sign[i] = lv_cmake(cos_value[i], sin_value[i]);
+                }
+            d_carr_sign += 8;
+
+            phase_rad_array = _mm256_add_ps (phase_rad_array, phase_step_rad_array);
         }
-        d_carr_sign += 8;
-
-        phase_rad_array = _mm256_add_ps (phase_rad_array, phase_step_rad_array);
-    }
-
+
     if (num_points%8!=0)
-    {
-        __VOLK_ATTR_ALIGNED(32) float phase_rad_store[8];
-        _mm256_store_ps ((float*)phase_rad_store, phase_rad_array);
-
-        float phase_rad = phase_rad_store[0];
-
-        for(int i = 0; i < num_points%8; i++)
-        {
-            *d_carr_sign = lv_cmake(cos(phase_rad), -sin(phase_rad));
-            d_carr_sign++;
-            phase_rad += phase_step_rad;
+        {
+            __VOLK_ATTR_ALIGNED(32) float phase_rad_store[8];
+            _mm256_store_ps ((float*)phase_rad_store, phase_rad_array);
+
+            float phase_rad = phase_rad_store[0];
+
+            for(unsigned int i = 0; i < num_points%8; i++)
+                {
+                    *d_carr_sign = lv_cmake(cos(phase_rad), -sin(phase_rad));
+                    d_carr_sign++;
+                    phase_rad += phase_step_rad;
+                }
         }
-    }
 }
 #endif /* LV_HAVE_AVX */
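[Editor's note] The AVX kernel above and the SSE2 kernel below both implement the Cephes-style sin/cos evaluation known from Julien Pommier's sse_mathfun.h. A scalar sketch of the same scheme may make the integer quadrant logic easier to follow; all names here are illustrative and the function is not part of the patched file:

    #include <math.h>

    /* Illustrative scalar model of the vectorized sin/cos above. */
    static inline void sincos_cephes_sketch(float xx, float* s, float* c)
    {
        float x = fabsf(xx);
        int sign_sin = (xx < 0.0f);              /* sign bit of the input */

        int j = (int)(x * 1.27323954473516f);    /* scale by 4/Pi */
        j = (j + 1) & ~1;                        /* j = (j+1) & (~1): even quadrant index */
        float y = (float)j;

        /* "Extended precision modular arithmetic": x - y*Pi/4 with Pi/4
           split into DP1 + DP2 + DP3 */
        x = ((x - y * 0.78515625f) - y * 2.4187564849853515625e-4f) - y * 3.77489497744594108e-8f;

        if (j & 4) sign_sin = !sign_sin;         /* swap-sign flag for the sine (imm0) */
        int sign_cos = (((j - 2) & 4) == 0);     /* sign flag for the cosine (imm4) */
        int use_cos_poly = ((j & 2) != 0);       /* polynomial selection mask (imm2) */

        float z = x * x;
        /* cosine polynomial on [0, Pi/4] (coscof_p0..p2 above) */
        float pc = ((2.443315711809948e-5f * z - 1.388731625493765e-3f) * z
                    + 4.166664568298827e-2f) * z * z - 0.5f * z + 1.0f;
        /* sine polynomial on [0, Pi/4] (sincof_p0..p2 above) */
        float ps = ((-1.9515295891e-4f * z + 8.3321608736e-3f) * z
                    - 1.6666654611e-1f) * z * x + x;

        *s = use_cos_poly ? pc : ps;
        *c = use_cos_poly ? ps : pc;
        if (sign_sin) *s = -(*s);
        if (sign_cos) *c = -(*c);
        /* The kernels additionally negate the sine before storing, since
           GNSS-SDR needs the conjugate carrier: lv_cmake(cos, -sin). */
    }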
@@ -688,15 +687,15 @@ static inline void volk_gnsssdr_s32f_x2_update_local_carrier_32fc_a_avx(lv_32fc_
 \param inputBuffer The buffer of data to be accumulated
 \param num_points The number of values in inputBuffer to be accumulated
 */
-static inline void volk_gnsssdr_s32f_x2_update_local_carrier_32fc_a_sse2(lv_32fc_t* d_carr_sign, const float phase_rad_init, const float phase_step_rad, unsigned int num_points){
-
-//    float* pointer1 = (float*)&phase_rad_init;
-//    *pointer1 = 0;
-//    float* pointer2 = (float*)&phase_step_rad;
-//    *pointer2 = 0.5;
-
+static inline void volk_gnsssdr_s32f_x2_update_local_carrier_32fc_a_sse2(lv_32fc_t* d_carr_sign, const float phase_rad_init, const float phase_step_rad, unsigned int num_points)
+{
+    // float* pointer1 = (float*)&phase_rad_init;
+    // *pointer1 = 0;
+    // float* pointer2 = (float*)&phase_step_rad;
+    // *pointer2 = 0.5;
+
     const unsigned int sse_iters = num_points / 4;
-
+
     __m128 _ps_minus_cephes_DP1 = _mm_set1_ps(-0.78515625f);
     __m128 _ps_minus_cephes_DP2 = _mm_set1_ps(-2.4187564849853515625e-4f);
     __m128 _ps_minus_cephes_DP3 = _mm_set1_ps(-3.77489497744594108e-8f);
@@ -714,128 +713,128 @@ static inline void volk_gnsssdr_s32f_x2_update_local_carrier_32fc_a_sse2(lv_32fc
     __m128 _ps_coscof_p2 = _mm_set1_ps( 4.166664568298827E-002f);
     __m128 _ps_1 = _mm_set1_ps(1.f);
     __m128 _ps_0p5 = _mm_set1_ps(0.5f);
-
+
     __m128 phase_step_rad_array = _mm_set1_ps(4*phase_step_rad);
-
+
     __m128 phase_rad_array, x, s, c, swap_sign_bit_sin, sign_bit_cos, poly_mask, z, tmp, y, y2, ysin1, ysin2;
     __m128 xmm1, xmm2, xmm3, sign_bit_sin;
     __m128i emm0, emm2, emm4;
     __VOLK_ATTR_ALIGNED(16) float sin_value[4];
    __VOLK_ATTR_ALIGNED(16) float cos_value[4];
-
+
     phase_rad_array = _mm_set_ps (phase_rad_init+3*phase_step_rad, phase_rad_init+2*phase_step_rad, phase_rad_init+phase_step_rad, phase_rad_init);
-
+
     for(unsigned int i = 0; i < sse_iters; i++)
-    {
-        x = phase_rad_array;
-
-        /* extract the sign bit (upper one) */
-        sign_bit_sin = _mm_and_ps(x, _ps_sign_mask);
-
-        /* take the absolute value */
-        x = _mm_xor_ps(x, sign_bit_sin);
-
-        /* scale by 4/Pi */
-        y = _mm_mul_ps(x, _ps_cephes_FOPI);
-
-        /* store the integer part of y in emm2 */
-        emm2 = _mm_cvttps_epi32(y);
-
-        /* j=(j+1) & (~1) (see the cephes sources) */
-        emm2 = _mm_add_epi32(emm2, _pi32_1);
-        emm2 = _mm_and_si128(emm2, _pi32_inv1);
-        y = _mm_cvtepi32_ps(emm2);
-
-        emm4 = emm2;
-
-        /* get the swap sign flag for the sine */
-        emm0 = _mm_and_si128(emm2, _pi32_4);
-        emm0 = _mm_slli_epi32(emm0, 29);
-        swap_sign_bit_sin = _mm_castsi128_ps(emm0);
-
-        /* get the polynom selection mask for the sine*/
-        emm2 = _mm_and_si128(emm2, _pi32_2);
-        emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
-        poly_mask = _mm_castsi128_ps(emm2);
-
-        /* The magic pass: "Extended precision modular arithmetic
+        {
+            x = phase_rad_array;
+
+            /* extract the sign bit (upper one) */
+            sign_bit_sin = _mm_and_ps(x, _ps_sign_mask);
+
+            /* take the absolute value */
+            x = _mm_xor_ps(x, sign_bit_sin);
+
+            /* scale by 4/Pi */
+            y = _mm_mul_ps(x, _ps_cephes_FOPI);
+
+            /* store the integer part of y in emm2 */
+            emm2 = _mm_cvttps_epi32(y);
+
+            /* j=(j+1) & (~1) (see the cephes sources) */
+            emm2 = _mm_add_epi32(emm2, _pi32_1);
+            emm2 = _mm_and_si128(emm2, _pi32_inv1);
+            y = _mm_cvtepi32_ps(emm2);
+
+            emm4 = emm2;
+
+            /* get the swap sign flag for the sine */
+            emm0 = _mm_and_si128(emm2, _pi32_4);
+            emm0 = _mm_slli_epi32(emm0, 29);
+            swap_sign_bit_sin = _mm_castsi128_ps(emm0);
+
+            /* get the polynomial selection mask for the sine */
+            emm2 = _mm_and_si128(emm2, _pi32_2);
+            emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
+            poly_mask = _mm_castsi128_ps(emm2);
+
+            /* The magic pass: "Extended precision modular arithmetic
    x = ((x - y * DP1) - y * DP2) - y * DP3; */
-        xmm1 = _mm_mul_ps(y, _ps_minus_cephes_DP1);
-        xmm2 = _mm_mul_ps(y, _ps_minus_cephes_DP2);
-        xmm3 = _mm_mul_ps(y, _ps_minus_cephes_DP3);
-        x = _mm_add_ps(_mm_add_ps(x, xmm1), _mm_add_ps(xmm2, xmm3));
-
-        emm4 = _mm_sub_epi32(emm4, _pi32_2);
-        emm4 = _mm_andnot_si128(emm4, _pi32_4);
-        emm4 = _mm_slli_epi32(emm4, 29);
-        sign_bit_cos = _mm_castsi128_ps(emm4);
-
-        sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);
-
-        /* Evaluate the first polynom (0 <= x <= Pi/4) */
-        z = _mm_mul_ps(x,x);
-        y = _ps_coscof_p0;
-        y = _mm_mul_ps(y, z);
-        y = _mm_add_ps(y, _ps_coscof_p1);
-        y = _mm_mul_ps(y, z);
-        y = _mm_add_ps(y, _ps_coscof_p2);
-        y = _mm_mul_ps(y, _mm_mul_ps(z, z));
-        tmp = _mm_mul_ps(z, _ps_0p5);
-        y = _mm_sub_ps(y, tmp);
-        y = _mm_add_ps(y, _ps_1);
-
-        /* Evaluate the second polynom (Pi/4 <= x <= 0) */
-        y2 = _ps_sincof_p0;
-        y2 = _mm_mul_ps(y2, z);
-        y2 = _mm_add_ps(y2, _ps_sincof_p1);
-        y2 = _mm_mul_ps(y2, z);
-        y2 = _mm_add_ps(y2, _ps_sincof_p2);
-        y2 = _mm_mul_ps(y2, _mm_mul_ps(z, x));
-        y2 = _mm_add_ps(y2, x);
-
-        /* select the correct result from the two polynoms */
-        xmm3 = poly_mask;
-        ysin2 = _mm_and_ps(xmm3, y2);
-        ysin1 = _mm_andnot_ps(xmm3, y);
-        y2 = _mm_sub_ps(y2,ysin2);
-        y = _mm_sub_ps(y, ysin1);
-
-        xmm1 = _mm_add_ps(ysin1,ysin2);
-        xmm2 = _mm_add_ps(y,y2);
-
-        /* update the sign */
-        s = _mm_xor_ps(xmm1, sign_bit_sin);
-        c = _mm_xor_ps(xmm2, sign_bit_cos);
-
-        //GNSS-SDR needs to return -sin
-        s = _mm_xor_ps(s, _ps_sign_mask);
-
-        _mm_store_ps ((float*)sin_value, s);
-        _mm_store_ps ((float*)cos_value, c);
-
-        for(unsigned int e = 0; e < 4; e++)
-        {
-            d_carr_sign[e] = lv_cmake(cos_value[e], sin_value[e]);
+            xmm1 = _mm_mul_ps(y, _ps_minus_cephes_DP1);
+            xmm2 = _mm_mul_ps(y, _ps_minus_cephes_DP2);
+            xmm3 = _mm_mul_ps(y, _ps_minus_cephes_DP3);
+            x = _mm_add_ps(_mm_add_ps(x, xmm1), _mm_add_ps(xmm2, xmm3));
+
+            emm4 = _mm_sub_epi32(emm4, _pi32_2);
+            emm4 = _mm_andnot_si128(emm4, _pi32_4);
+            emm4 = _mm_slli_epi32(emm4, 29);
+            sign_bit_cos = _mm_castsi128_ps(emm4);
+
+            sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);
+
+            /* Evaluate the first polynomial (0 <= x <= Pi/4) */
+            z = _mm_mul_ps(x,x);
+            y = _ps_coscof_p0;
+            y = _mm_mul_ps(y, z);
+            y = _mm_add_ps(y, _ps_coscof_p1);
+            y = _mm_mul_ps(y, z);
+            y = _mm_add_ps(y, _ps_coscof_p2);
+            y = _mm_mul_ps(y, _mm_mul_ps(z, z));
+            tmp = _mm_mul_ps(z, _ps_0p5);
+            y = _mm_sub_ps(y, tmp);
+            y = _mm_add_ps(y, _ps_1);
+
+            /* Evaluate the second polynomial (Pi/4 <= x <= 0) */
+            y2 = _ps_sincof_p0;
+            y2 = _mm_mul_ps(y2, z);
+            y2 = _mm_add_ps(y2, _ps_sincof_p1);
+            y2 = _mm_mul_ps(y2, z);
+            y2 = _mm_add_ps(y2, _ps_sincof_p2);
+            y2 = _mm_mul_ps(y2, _mm_mul_ps(z, x));
+            y2 = _mm_add_ps(y2, x);
+
+            /* select the correct result from the two polynomials */
+            xmm3 = poly_mask;
+            ysin2 = _mm_and_ps(xmm3, y2);
+            ysin1 = _mm_andnot_ps(xmm3, y);
+            y2 = _mm_sub_ps(y2,ysin2);
+            y = _mm_sub_ps(y, ysin1);
+
+            xmm1 = _mm_add_ps(ysin1,ysin2);
+            xmm2 = _mm_add_ps(y,y2);
+
+            /* update the sign */
+            s = _mm_xor_ps(xmm1, sign_bit_sin);
+            c = _mm_xor_ps(xmm2, sign_bit_cos);
+
+            //GNSS-SDR needs to return -sin
+            s = _mm_xor_ps(s, _ps_sign_mask);
+
+            _mm_store_ps ((float*)sin_value, s);
+            _mm_store_ps ((float*)cos_value, c);
+
+            for(unsigned int e = 0; e < 4; e++)
+                {
+                    d_carr_sign[e] = lv_cmake(cos_value[e], sin_value[e]);
+                }
+            d_carr_sign += 4;
+
+            phase_rad_array = _mm_add_ps (phase_rad_array, phase_step_rad_array);
         }
-        d_carr_sign += 4;
-
-        phase_rad_array = _mm_add_ps (phase_rad_array, phase_step_rad_array);
-    }
-
-    if (num_points%4!=0)
-    {
-        __VOLK_ATTR_ALIGNED(16) float phase_rad_store[4];
-        _mm_store_ps ((float*)phase_rad_store, phase_rad_array);
-
-        float phase_rad = phase_rad_store[0];
-
-        for(unsigned int i = 0; i < num_points%4; i++)
+
+    if (num_points % 4 != 0)
         {
-            *d_carr_sign = lv_cmake(cos(phase_rad), -sin(phase_rad));
-            d_carr_sign++;
-            phase_rad += phase_step_rad;
+            __VOLK_ATTR_ALIGNED(16) float phase_rad_store[4];
+            _mm_store_ps ((float*)phase_rad_store, phase_rad_array);
+
+            float phase_rad = phase_rad_store[0];
+
+            for(unsigned int i = 0; i < num_points%4; i++)
+                {
+                    *d_carr_sign = lv_cmake(cos(phase_rad), -sin(phase_rad));
+                    d_carr_sign++;
+                    phase_rad += phase_step_rad;
+                }
         }
-    }
 }
 #endif /* LV_HAVE_SSE2 */
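[Editor's note] The three cephes_DP* constants used by both kernels are a split representation of Pi/4: DP1 has only a few mantissa bits, so y*DP1 and the first subtraction are exact in single precision, and the rounding error of the range reduction is pushed into the much smaller DP2 and DP3 terms. A quick check, illustrative only and not part of the patched file:

    #include <stdio.h>

    int main(void)
    {
        const double DP1 = 0.78515625;                /* 201/256, exact in float */
        const double DP2 = 2.4187564849853515625e-4;
        const double DP3 = 3.77489497744594108e-8;
        /* prints ~0.78539816339744828, i.e. Pi/4 to double precision */
        printf("%.17g\n", DP1 + DP2 + DP3);
        return 0;
    }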
@@ -846,20 +845,20 @@ static inline void volk_gnsssdr_s32f_x2_update_local_carrier_32fc_a_sse2(lv_32fc
 \param inputBuffer The buffer of data to be accumulated
 \param num_points The number of values in inputBuffer to be accumulated
 */
-static inline void volk_gnsssdr_s32f_x2_update_local_carrier_32fc_a_generic(lv_32fc_t* d_carr_sign, const float phase_rad_init, const float phase_step_rad, unsigned int num_points){
+static inline void volk_gnsssdr_s32f_x2_update_local_carrier_32fc_a_generic(lv_32fc_t* d_carr_sign, const float phase_rad_init, const float phase_step_rad, unsigned int num_points)
+{
+    // float* pointer1 = (float*)&phase_rad_init;
+    // *pointer1 = 0;
+    // float* pointer2 = (float*)&phase_step_rad;
+    // *pointer2 = 0.5;

-//    float* pointer1 = (float*)&phase_rad_init;
-//    *pointer1 = 0;
-//    float* pointer2 = (float*)&phase_step_rad;
-//    *pointer2 = 0.5;
-
     float phase_rad = phase_rad_init;
     for(unsigned int i = 0; i < num_points; i++)
-    {
-        *d_carr_sign = lv_cmake(cos(phase_rad), -sin(phase_rad));
-        d_carr_sign++;
-        phase_rad += phase_step_rad;
-    }
+        {
+            *d_carr_sign = lv_cmake(cos(phase_rad), -sin(phase_rad));
+            d_carr_sign++;
+            phase_rad += phase_step_rad;
+        }
 }
 #endif /* LV_HAVE_GENERIC */
 #endif /* INCLUDED_volk_gnsssdr_32fc_s32f_x2_update_local_carrier_32fc_a_H */
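[Editor's note] A minimal usage sketch of the generic kernel above. The lv_32fc_t typedef and lv_cmake macro are stand-ins defined locally for illustration (in the library they come from the volk_gnsssdr complex header), and the vector length, sampling rate, and Doppler are assumed values:

    #include <stdlib.h>
    #include <math.h>
    #include <complex.h>

    typedef float complex lv_32fc_t;                  /* stand-in typedef */
    #define lv_cmake(r, i) ((r) + _Complex_I * (i))   /* stand-in macro */

    /* Same loop as the generic protokernel above. */
    static inline void update_local_carrier_generic(lv_32fc_t* d_carr_sign, float phase_rad_init, float phase_step_rad, unsigned int num_points)
    {
        float phase_rad = phase_rad_init;
        for (unsigned int i = 0; i < num_points; i++)
            {
                *d_carr_sign = lv_cmake(cos(phase_rad), -sin(phase_rad));
                d_carr_sign++;
                phase_rad += phase_step_rad;
            }
    }

    int main(void)
    {
        const unsigned int n = 4000;       /* assumed vector length */
        const float fs = 4000000.0f;       /* assumed sampling rate, Hz */
        const float doppler = 1500.0f;     /* assumed carrier Doppler, Hz */
        lv_32fc_t* carrier = (lv_32fc_t*)malloc(n * sizeof(lv_32fc_t));
        if (carrier == NULL) return 1;

        /* lv_cmake(cos, -sin) stores exp(-j*phase), so multiplying the
           incoming signal by this replica wipes the carrier off */
        const float phase_step_rad = 2.0f * 3.14159265358979f * doppler / fs;
        update_local_carrier_generic(carrier, 0.0f, phase_step_rad, n);

        free(carrier);
        return 0;
    }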