diff --git a/src/algorithms/libs/volk_gnsssdr/apps/volk_gnsssdr_profile.cc b/src/algorithms/libs/volk_gnsssdr/apps/volk_gnsssdr_profile.cc index c9ca8756c..417e6e1c2 100644 --- a/src/algorithms/libs/volk_gnsssdr/apps/volk_gnsssdr_profile.cc +++ b/src/algorithms/libs/volk_gnsssdr/apps/volk_gnsssdr_profile.cc @@ -105,12 +105,19 @@ int main(int argc, char *argv[]) { //GNSS-SDR PROTO-KERNELS //lv_32fc_t sfv = lv_cmake((float)1, (float)2); - //VOLK_PROFILE(volk_gnsssdr_8ic_s8ic_multiply_8ic, 1e-4, sfv, 204602, 1000, &results, benchmark_mode, kernel_regex); - VOLK_PROFILE(volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3, 1e-4, 0, 204602, 250, &results, benchmark_mode, kernel_regex); + //example: VOLK_PROFILE(volk_gnsssdr_8ic_s8ic_multiply_8ic, 1e-4, sfv, 204602, 1000, &results, benchmark_mode, kernel_regex); + VOLK_PROFILE(volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3, 1e-4, 0, 204602, 250, &results, benchmark_mode, kernel_regex); + + VOLK_PROFILE(volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5, 1e-4, 0, 204602, 250, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5, 1e-4, 0, 204602, 250, &results, benchmark_mode, kernel_regex); + VOLK_PROFILE(volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3, 1e-4, 0, 204602, 250, &results, benchmark_mode, kernel_regex); - VOLK_PROFILE(volk_gnsssdr_16i_s32f_convert_32f, 1e-4, 32768.0, 204602, 10000, &results, benchmark_mode, kernel_regex); + VOLK_PROFILE(volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3, 1e-4, 0, 204602, 250, &results, benchmark_mode, kernel_regex); + VOLK_PROFILE(volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3, 1e-4, 0, 204602, 250, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3, 1e-4, 0, 204602, 250, &results, benchmark_mode, kernel_regex); + + VOLK_PROFILE(volk_gnsssdr_32fc_convert_16ic, 1e-4, 0, 204602, 250, &results, benchmark_mode, kernel_regex); + VOLK_PROFILE(volk_gnsssdr_32f_accumulator_s32f, 1e-4, 0, 204602, 10000, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_gnsssdr_8i_accumulator_s8i, 1e-4, 0, 204602, 10000, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_gnsssdr_32f_index_max_16u, 3, 0, 204602, 5000, &results, benchmark_mode, kernel_regex); @@ -129,6 +136,9 @@ int main(int argc, char *argv[]) { VOLK_PROFILE(volk_gnsssdr_32fc_x2_multiply_32fc, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_gnsssdr_8ic_x2_multiply_8ic, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_gnsssdr_8u_x2_multiply_8u, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); + VOLK_PROFILE(volk_gnsssdr_64f_accumulator_64f, 1e-4, 0, 16000, 1000, &results, benchmark_mode, kernel_regex); + VOLK_PROFILE(volk_gnsssdr_32f_s32f_convert_16i, 1e-4, 1, 204602, 250, &results, benchmark_mode, kernel_regex); + VOLK_PROFILE(volk_gnsssdr_16i_s32f_convert_32f, 1e-4, 1, 204602, 250, &results, benchmark_mode, kernel_regex); if(store_results) { char path[1024]; volk_gnsssdr_get_config_path(path); diff --git a/src/algorithms/libs/volk_gnsssdr/kernels/CommonMacros/CommonMacros.h b/src/algorithms/libs/volk_gnsssdr/kernels/CommonMacros/CommonMacros.h new file mode 100644 index 000000000..4fb24a125 --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/kernels/CommonMacros/CommonMacros.h @@ -0,0 +1,108 @@ +/*! + * \file CommonMacros.h + * \brief Common macros used inside the volk protokernels. + * \authors + * + * ------------------------------------------------------------------------- + * + * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) + * + * GNSS-SDR is a software defined Global Navigation + * Satellite Systems receiver + * + * This file is part of GNSS-SDR. + * + * GNSS-SDR is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * at your option) any later version. + * + * GNSS-SDR is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNSS-SDR. If not, see . + * + * ------------------------------------------------------------------------- + */ +#ifndef INCLUDED_gnsssdr_CommonMacros_u_H +#define INCLUDED_gnsssdr_CommonMacros_u_H + + #ifdef LV_HAVE_SSE4_1 + /*! + \brief Macros for U_SSE4_1 + */ + + #ifndef CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1 + #define CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(input1, input2, real, imag)\ + imag = _mm_srli_si128 (input1, 2);\ + imag = _mm_blend_epi16 (input2, imag, 85);\ + real = _mm_slli_si128 (input2, 2);\ + real = _mm_blend_epi16 (real, input1, 85); + #endif /* CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1 */ + + #ifndef CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1 + #define CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(input, input_i_1, input_i_2, output_i32, output_ps)\ + input_i_1 = _mm_cvtepi16_epi32(input);\ + input = _mm_srli_si128 (input, 8);\ + input_i_2 = _mm_cvtepi16_epi32(input);\ + output_i32 = _mm_add_epi32 (input_i_1, input_i_2);\ + output_ps = _mm_cvtepi32_ps(output_i32); + #endif /* CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1 */ + + #endif /* LV_HAVE_SSE4_1 */ + + #ifdef LV_HAVE_SSE2 + /*! + \brief Macros for U_SSE2 + */ + + #ifndef CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2 + #define CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output)\ + realx_mult_realy = _mm_mullo_epi16 (realx, realy);\ + imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);\ + realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);\ + imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);\ + real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);\ + imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + #endif /* CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2 */ + + #endif /* LV_HAVE_AVX */ + + #ifdef LV_HAVE_GENERIC + /*! + \brief Macros for U_GENERIC + */ + + #endif /* LV_HAVE_GENERIC */ +#endif /* INCLUDED_gnsssdr_CommonMacros_u_H */ + + +#ifndef INCLUDED_gnsssdr_CommonMacros_a_H +#define INCLUDED_gnsssdr_CommonMacros_a_H + + #ifdef LV_HAVE_SSE4_1 + /*! + \brief Macros for A_SSE4_1 + */ + + #endif /* LV_HAVE_SSE4_1 */ + + #ifdef LV_HAVE_SSE2 + /*! + \brief Macros for U_SSE2 + */ + + #endif /* LV_HAVE_SSE2 */ + + #ifdef LV_HAVE_GENERIC + /*! + \brief Macros for A_GENERIC + */ + + #endif /* LV_HAVE_GENERIC */ +#endif /* INCLUDED_gnsssdr_CommonMacros_a_H */ diff --git a/src/algorithms/libs/volk_gnsssdr/kernels/CommonMacros/CommonMacros_16ic_cw_epl_corr_32fc.h b/src/algorithms/libs/volk_gnsssdr/kernels/CommonMacros/CommonMacros_16ic_cw_epl_corr_32fc.h new file mode 100644 index 000000000..4fa054480 --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/kernels/CommonMacros/CommonMacros_16ic_cw_epl_corr_32fc.h @@ -0,0 +1,76 @@ +/*! + * \file CommonMacros_16ic_cw_corr_32fc.h + * \brief Common macros used inside the 16ic_cw_corr_32fc volk protokernels. + * \authors + * + * ------------------------------------------------------------------------- + * + * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) + * + * GNSS-SDR is a software defined Global Navigation + * Satellite Systems receiver + * + * This file is part of GNSS-SDR. + * + * GNSS-SDR is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * at your option) any later version. + * + * GNSS-SDR is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNSS-SDR. If not, see . + * + * ------------------------------------------------------------------------- + */ +#ifndef INCLUDED_gnsssdr_CommonMacros_16ic_cw_corr_32fc_u_H +#define INCLUDED_gnsssdr_CommonMacros_16ic_cw_corr_32fc_u_H +#include "CommonMacros/CommonMacros.h" + + #ifdef LV_HAVE_SSE4_1 + /*! + \brief Macros for U_SSE4_1 + */ + + #ifndef CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1 + #define CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)\ + CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy)\ + CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(real_bb_signal_sample, imag_bb_signal_sample, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output)\ + CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(real_output, input_i_1, input_i_2, output_i32, real_output_ps)\ + CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(imag_output, input_i_1, input_i_2, output_i32, imag_output_ps) + #endif /* CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1 */ + + #endif /* LV_HAVE_SSE4_1 */ + + #ifdef LV_HAVE_GENERIC + /*! + \brief Macros for U_GENERIC + */ + + #endif /* LV_HAVE_GENERIC */ +#endif /* INCLUDED_gnsssdr_CommonMacros_16ic_cw_corr_32fc_u_H */ + + +#ifndef INCLUDED_gnsssdr_CommonMacros_16ic_cw_corr_32fc_a_H +#define INCLUDED_gnsssdr_CommonMacros_16ic_cw_corr_32fc_a_H + + #ifdef LV_HAVE_SSE4_1 + /*! + \brief Macros for A_SSE4_1 + */ + + #endif /* LV_HAVE_SSE4_1 */ + + #ifdef LV_HAVE_GENERIC + /*! + \brief Macros for A_GENERIC + */ + + #endif /* LV_HAVE_GENERIC */ +#endif /* INCLUDED_gnsssdr_CommonMacros_16ic_cw_corr_32fc_a_H */ diff --git a/src/algorithms/libs/volk_gnsssdr/kernels/CommonMacros/README.txt b/src/algorithms/libs/volk_gnsssdr/kernels/CommonMacros/README.txt new file mode 100644 index 000000000..3461e7851 --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/kernels/CommonMacros/README.txt @@ -0,0 +1,34 @@ +#################################################################### +Common Macros inside volk_gnsssdr module +#################################################################### + +First of all, sorry for making you need to read this: macros are evil, they can not be debugged, you do not know where the errors come from, syntax is annoying.. BUT this is the only way I found that allows to share one piece of code between various proto-kernels without performance penalties. +Inline functions have been tested, and they introduce a really small time penalty, but it becomes huge because of long loops, with thousands of samples. + +#################################################################### +Syntax +#################################################################### + +In order to allow better understanding of the code I created the macros with an specific syntax. + +1) Inside CommonMacros.h you will find macros for common operations. I will explain the syntax with an example: + +example: CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output) + +First of all, you find the characters “CM”, which means CommonMacros. After that the type and the amount of inputs is placed: “_16IC_X4” (16 bits complex integers, four inputs). The syntax for type is the same as the one used with volk protokernels, refer to GNURadio documentation for more help. The it comes the name of the macro (“_SCALAR_PRODUCT”), and after that the type and the amount of outputs (“_16IC_X2”). Finally it is placed the SSE minimum version needed to run (“_U_SSE2”). In the arguments you will find (from left to right) the inputs (four inputs: realx, imagx, realy, imagy), some variables that the macro needs to work (realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy) and finally the outputs (two outputs: real_output, imag_output). +The variables that the macro needs are specified when calling it in order to avoid after-compile problems: if you want to use a macro you will need to declare all the variables it needs before, or you will not be able to compile. + +2) Inside all the other headers, CommonMacros_XXXXXX.h you will find macros for a specific group of proto-kernels. The syntax is the same as the CommonMacros.h + +#################################################################### +Workflow +#################################################################### + +In order to use the macros easily, I usually test the code without macros inside a test proto-kernel, where you are able to test it, debug it and use breakpoints. +When it works I place it inside a macro an test it again. + +#################################################################### +Why macros +#################################################################### +1) They are the only way I could find for sharing code between proto-kernels without performance penalty. +2) It is true that they are really difficult to debug, but if you work with them responsibly it is not so hard. Volk_gnsssdr checks all the SSE proto-kernels implementations results against the generic implementation results, so if your macro is not working you will appreciate it after profiling it. \ No newline at end of file diff --git a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3.h b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3.h new file mode 100644 index 000000000..95b0b093d --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3.h @@ -0,0 +1,461 @@ +/*! + * \file volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3.h + * \brief Volk protokernel: performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation with 32 bits vectors + * \authors + * + * Volk protokernel that performs the carrier wipe-off mixing and the + * Early, Prompt, and Late correlation with 32 bits vectors (16 bits the + * real part and 16 bits the imaginary part): + * - The carrier wipe-off is done by multiplying the input signal by the + * carrier (multiplication of 32 bits vectors) It returns the input + * signal in base band (BB) + * - Early values are calculated by multiplying the input signal in BB by the + * early code (multiplication of 32 bits vectors), accumulating the results + * - Prompt values are calculated by multiplying the input signal in BB by the + * prompt code (multiplication of 32 bits vectors), accumulating the results + * - Late values are calculated by multiplying the input signal in BB by the + * late code (multiplication of 32 bits vectors), accumulating the results + * + * ------------------------------------------------------------------------- + * + * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) + * + * GNSS-SDR is a software defined Global Navigation + * Satellite Systems receiver + * + * This file is part of GNSS-SDR. + * + * GNSS-SDR is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * at your option) any later version. + * + * GNSS-SDR is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNSS-SDR. If not, see . + * + * ------------------------------------------------------------------------- + */ + +#ifndef INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_u_H +#define INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_u_H + +#include +#include +#include +#include +#include + +#ifdef LV_HAVE_SSE4_1 +#include "smmintrin.h" +#include "CommonMacros/CommonMacros_16ic_cw_epl_corr_32fc.h" +#include "CommonMacros/CommonMacros.h" + /*! + \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation + \param input The input signal input + \param carrier The carrier signal input + \param E_code Early PRN code replica input + \param P_code Early PRN code replica input + \param L_code Early PRN code replica input + \param E_out Early correlation output + \param P_out Early correlation output + \param L_out Early correlation output + \param num_points The number of complex values in vectors + */ +static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_u_sse4_1(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points) +{ + const unsigned int sse_iters = num_points / 8; + + __m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample; + __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output; + + __m128 real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc; + __m128i input_i_1, input_i_2, output_i32; + __m128 real_output_ps, imag_output_ps; + + float E_out_real = 0; + float E_out_imag = 0; + float P_out_real = 0; + float P_out_imag = 0; + float L_out_real = 0; + float L_out_imag = 0; + + const lv_16sc_t* input_ptr = input; + const lv_16sc_t* carrier_ptr = carrier; + + const lv_16sc_t* E_code_ptr = E_code; + lv_32fc_t* E_out_ptr = E_out; + const lv_16sc_t* L_code_ptr = L_code; + lv_32fc_t* L_out_ptr = L_out; + const lv_16sc_t* P_code_ptr = P_code; + lv_32fc_t* P_out_ptr = P_out; + + *E_out_ptr = 0; + *P_out_ptr = 0; + *L_out_ptr = 0; + + mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); + + real_E_code_acc = _mm_setzero_ps(); + imag_E_code_acc = _mm_setzero_ps(); + real_P_code_acc = _mm_setzero_ps(); + imag_P_code_acc = _mm_setzero_ps(); + real_L_code_acc = _mm_setzero_ps(); + imag_L_code_acc = _mm_setzero_ps(); + + if (sse_iters>0) + { + for(int number = 0;number < sse_iters; number++){ + + //Perform the carrier wipe-off + x1 = _mm_lddqu_si128((__m128i*)input_ptr); + input_ptr += 4; + x2 = _mm_lddqu_si128((__m128i*)input_ptr); + + y1 = _mm_lddqu_si128((__m128i*)carrier_ptr); + carrier_ptr += 4; + y2 = _mm_lddqu_si128((__m128i*)carrier_ptr); + + CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(x1, x2, realx, imagx) + CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy) + CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample) + + //Get early values + y1 = _mm_lddqu_si128((__m128i*)E_code_ptr); + E_code_ptr += 4; + y2 = _mm_lddqu_si128((__m128i*)E_code_ptr); + + CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) + + //Adds the float 32 results + real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps); + imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps); + + //Get prompt values + y1 = _mm_lddqu_si128((__m128i*)P_code_ptr); + P_code_ptr += 4; + y2 = _mm_lddqu_si128((__m128i*)P_code_ptr); + + CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) + + real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps); + imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps); + + //Get late values + y1 = _mm_lddqu_si128((__m128i*)L_code_ptr); + L_code_ptr += 4; + y2 = _mm_lddqu_si128((__m128i*)L_code_ptr); + + CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) + + real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps); + imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps); + + input_ptr += 4; + carrier_ptr += 4; + E_code_ptr += 4; + P_code_ptr += 4; + L_code_ptr += 4; + } + + __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4]; + + _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector + + for (int i = 0; i<4; ++i) + { + E_out_real += real_E_dotProductVector[i]; + E_out_imag += imag_E_dotProductVector[i]; + P_out_real += real_P_dotProductVector[i]; + P_out_imag += imag_P_dotProductVector[i]; + L_out_real += real_L_dotProductVector[i]; + L_out_imag += imag_L_dotProductVector[i]; + } + *E_out_ptr = lv_cmake(E_out_real, E_out_imag); + *P_out_ptr = lv_cmake(P_out_real, P_out_imag); + *L_out_ptr = lv_cmake(L_out_real, L_out_imag); + } + + lv_16sc_t bb_signal_sample; + for(int i=0; i < num_points%8; ++i) + { + //Perform the carrier wipe-off + bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); + // Now get early, late, and prompt values for each + *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); + *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); + *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); + } + +} +#endif /* LV_HAVE_SSE4_1 */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation + \param input The input signal input + \param carrier The carrier signal input + \param E_code Early PRN code replica input + \param P_code Early PRN code replica input + \param L_code Early PRN code replica input + \param E_out Early correlation output + \param P_out Early correlation output + \param L_out Early correlation output + \param num_points The number of complex values in vectors + */ +static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_generic(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points) +{ + lv_16sc_t bb_signal_sample; + lv_16sc_t tmp1; + lv_16sc_t tmp2; + lv_16sc_t tmp3; + + bb_signal_sample = lv_cmake(0, 0); + + *E_out = 0; + *P_out = 0; + *L_out = 0; + // perform Early, Prompt and Late correlation + + for(int i=0; i < num_points; ++i) + { + //Perform the carrier wipe-off + bb_signal_sample = input[i] * carrier[i]; + + tmp1 = bb_signal_sample * E_code[i]; + tmp2 = bb_signal_sample * P_code[i]; + tmp3 = bb_signal_sample * L_code[i]; + + // Now get early, late, and prompt values for each + *E_out += (lv_32fc_t)tmp1; + *P_out += (lv_32fc_t)tmp2; + *L_out += (lv_32fc_t)tmp3; + } +} +#endif /* LV_HAVE_GENERIC */ +#endif /* INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_u_H */ + + +#ifndef INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_a_H +#define INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_a_H + +#include +#include +#include +#include +#include + +#ifdef LV_HAVE_SSE4_1 +#include "smmintrin.h" +#include "CommonMacros/CommonMacros_16ic_cw_epl_corr_32fc.h" +#include "CommonMacros/CommonMacros.h" +/*! + \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation + \param input The input signal input + \param carrier The carrier signal input + \param E_code Early PRN code replica input + \param P_code Early PRN code replica input + \param L_code Early PRN code replica input + \param E_out Early correlation output + \param P_out Early correlation output + \param L_out Early correlation output + \param num_points The number of complex values in vectors + */ +static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_a_sse4_1(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points) +{ + const unsigned int sse_iters = num_points / 8; + + __m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample; + __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output; + + __m128 real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc; + __m128i input_i_1, input_i_2, output_i32; + __m128 real_output_ps, imag_output_ps; + + float E_out_real = 0; + float E_out_imag = 0; + float P_out_real = 0; + float P_out_imag = 0; + float L_out_real = 0; + float L_out_imag = 0; + + const lv_16sc_t* input_ptr = input; + const lv_16sc_t* carrier_ptr = carrier; + + const lv_16sc_t* E_code_ptr = E_code; + lv_32fc_t* E_out_ptr = E_out; + const lv_16sc_t* L_code_ptr = L_code; + lv_32fc_t* L_out_ptr = L_out; + const lv_16sc_t* P_code_ptr = P_code; + lv_32fc_t* P_out_ptr = P_out; + + *E_out_ptr = 0; + *P_out_ptr = 0; + *L_out_ptr = 0; + + mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); + + real_E_code_acc = _mm_setzero_ps(); + imag_E_code_acc = _mm_setzero_ps(); + real_P_code_acc = _mm_setzero_ps(); + imag_P_code_acc = _mm_setzero_ps(); + real_L_code_acc = _mm_setzero_ps(); + imag_L_code_acc = _mm_setzero_ps(); + + if (sse_iters>0) + { + for(int number = 0;number < sse_iters; number++){ + + //Perform the carrier wipe-off + x1 = _mm_load_si128((__m128i*)input_ptr); + input_ptr += 4; + x2 = _mm_load_si128((__m128i*)input_ptr); + + y1 = _mm_load_si128((__m128i*)carrier_ptr); + carrier_ptr += 4; + y2 = _mm_load_si128((__m128i*)carrier_ptr); + + CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(x1, x2, realx, imagx) + CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy) + CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample) + + //Get early values + y1 = _mm_load_si128((__m128i*)E_code_ptr); + E_code_ptr += 4; + y2 = _mm_load_si128((__m128i*)E_code_ptr); + + CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) + + //Adds the float 32 results + real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps); + imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps); + + //Get prompt values + y1 = _mm_load_si128((__m128i*)P_code_ptr); + P_code_ptr += 4; + y2 = _mm_load_si128((__m128i*)P_code_ptr); + + CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) + + real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps); + imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps); + + //Get late values + y1 = _mm_load_si128((__m128i*)L_code_ptr); + L_code_ptr += 4; + y2 = _mm_load_si128((__m128i*)L_code_ptr); + + CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) + + real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps); + imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps); + + input_ptr += 4; + carrier_ptr += 4; + E_code_ptr += 4; + P_code_ptr += 4; + L_code_ptr += 4; + } + + __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4]; + + _mm_store_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector + _mm_store_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector + _mm_store_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector + _mm_store_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector + _mm_store_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector + _mm_store_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector + + for (int i = 0; i<4; ++i) + { + E_out_real += real_E_dotProductVector[i]; + E_out_imag += imag_E_dotProductVector[i]; + P_out_real += real_P_dotProductVector[i]; + P_out_imag += imag_P_dotProductVector[i]; + L_out_real += real_L_dotProductVector[i]; + L_out_imag += imag_L_dotProductVector[i]; + } + *E_out_ptr = lv_cmake(E_out_real, E_out_imag); + *P_out_ptr = lv_cmake(P_out_real, P_out_imag); + *L_out_ptr = lv_cmake(L_out_real, L_out_imag); + } + + lv_16sc_t bb_signal_sample; + for(int i=0; i < num_points%8; ++i) + { + //Perform the carrier wipe-off + bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); + // Now get early, late, and prompt values for each + *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); + *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); + *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); + } + +} +#endif /* LV_HAVE_SSE4_1 */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation + \param input The input signal input + \param carrier The carrier signal input + \param E_code Early PRN code replica input + \param P_code Early PRN code replica input + \param L_code Early PRN code replica input + \param E_out Early correlation output + \param P_out Early correlation output + \param L_out Early correlation output + \param num_points The number of complex values in vectors + */ +static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_a_generic(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points) +{ + lv_16sc_t bb_signal_sample; + lv_16sc_t tmp1; + lv_16sc_t tmp2; + lv_16sc_t tmp3; + + bb_signal_sample = lv_cmake(0, 0); + + *E_out = 0; + *P_out = 0; + *L_out = 0; + // perform Early, Prompt and Late correlation + + for(int i=0; i < num_points; ++i) + { + //Perform the carrier wipe-off + bb_signal_sample = input[i] * carrier[i]; + + tmp1 = bb_signal_sample * E_code[i]; + tmp2 = bb_signal_sample * P_code[i]; + tmp3 = bb_signal_sample * L_code[i]; + + // Now get early, late, and prompt values for each + *E_out += (lv_32fc_t)tmp1; + *P_out += (lv_32fc_t)tmp2; + *L_out += (lv_32fc_t)tmp3; + } +} +#endif /* LV_HAVE_GENERIC */ +#endif /* INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_a_H */ diff --git a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3.h b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3.h new file mode 100644 index 000000000..34d1fd715 --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3.h @@ -0,0 +1,1568 @@ +/*! + * \file volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3.h + * \brief Volk protokernel: performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation with 32 bits vectors using different methods: inside u_sse4_1_first there is one method, inside u_sse4_1_second there is another... This protokernel has been created to test the performance of different methods. + * \authors
    + *
  • Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com + *
+ * + * Volk protokernel that performs the carrier wipe-off mixing and the + * Early, Prompt, and Late correlation with 32 bits vectors (16 bits the + * real part and 16 bits the imaginary part): + * - The carrier wipe-off is done by multiplying the input signal by the + * carrier (multiplication of 32 bits vectors) It returns the input + * signal in base band (BB) + * - Early values are calculated by multiplying the input signal in BB by the + * early code (multiplication of 32 bits vectors), accumulating the results + * - Prompt values are calculated by multiplying the input signal in BB by the + * prompt code (multiplication of 32 bits vectors), accumulating the results + * - Late values are calculated by multiplying the input signal in BB by the + * late code (multiplication of 32 bits vectors), accumulating the results + * + * ------------------------------------------------------------------------- + * + * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) + * + * GNSS-SDR is a software defined Global Navigation + * Satellite Systems receiver + * + * This file is part of GNSS-SDR. + * + * GNSS-SDR is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * at your option) any later version. + * + * GNSS-SDR is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNSS-SDR. If not, see . + * + * ------------------------------------------------------------------------- + */ + +#ifndef INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_u_H +#define INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_u_H + +#include +#include +#include +#include +#include + +#ifdef LV_HAVE_SSE4_1 +#include "smmintrin.h" + /*! + \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation + \param input The input signal input + \param carrier The carrier signal input + \param E_code Early PRN code replica input + \param P_code Early PRN code replica input + \param L_code Early PRN code replica input + \param E_out Early correlation output + \param P_out Early correlation output + \param L_out Early correlation output + \param num_points The number of complex values in vectors + */ +static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_u_sse4_1_first(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points) +{ + const unsigned int sse_iters = num_points / 4; + + __m128i x, y, yaux, yl, yh, tmp1, tmp2, z, bb_signal_sample, bb_signal_sample_suffled; + + __m128 z_ps_1, z_ps_2, z_E, z_P, z_L; + __m128i z_i_1, z_i_2; + + lv_32fc_t dotProduct_E; + lv_32fc_t dotProduct_P; + lv_32fc_t dotProduct_L; + + z_E = _mm_setzero_ps(); + z_P = _mm_setzero_ps(); + z_L = _mm_setzero_ps(); + + const lv_16sc_t* _input = input; + const lv_16sc_t* _carrier = carrier; + const lv_16sc_t* _E_code = E_code; + const lv_16sc_t* _P_code = P_code; + const lv_16sc_t* _L_code = L_code; + + if (sse_iters>0) + { + for(int number = 0;number < sse_iters; number++) + { + //Perform the carrier wipe-off + x = _mm_lddqu_si128((__m128i*)_input); // Load the ar + ai, br + bi as ar,ai,br,bi + y = _mm_lddqu_si128((__m128i*)_carrier); // Load the cr + ci, dr + di as cr,ci,dr,di + + // Load yl with cr,cr,dr,dr + // Load yh with ci,ci,di,di + yaux = _mm_shuffle_epi8 (y, _mm_set_epi8 (15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0)); + yl = _mm_unpacklo_epi16(yaux, yaux); + yh = _mm_unpackhi_epi16(yaux, yaux); + + tmp1 = _mm_mullo_epi16(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + + x = _mm_shuffle_epi8 (x, _mm_set_epi8 (13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2)); // Re-arrange x to be ai,ar,bi,br + + tmp2 = _mm_mullo_epi16(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + tmp2 = _mm_mullo_epi16(tmp2,_mm_set_epi16 (1, -1, 1, -1, 1, -1, 1, -1)); + bb_signal_sample = _mm_add_epi16(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + bb_signal_sample_suffled = _mm_shuffle_epi8 (bb_signal_sample, _mm_set_epi8 (13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2)); // Re-arrange bb_signal_sample to be ai,ar,bi,br + + // correlation E,P,L (3x vector scalar product) + // Early + y = _mm_lddqu_si128((__m128i*)_E_code); // Load the cr + ci, dr + di as cr,ci,dr,di + + yaux = _mm_shuffle_epi8 (y, _mm_set_epi8 (15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0)); + yl = _mm_unpacklo_epi16(yaux, yaux); + yh = _mm_unpackhi_epi16(yaux, yaux); + + tmp1 = _mm_mullo_epi16(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + + tmp2 = _mm_mullo_epi16(bb_signal_sample_suffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + tmp2 = _mm_mullo_epi16(tmp2,_mm_set_epi16 (1, -1, 1, -1, 1, -1, 1, -1)); + z = _mm_add_epi16(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + z_i_1 = _mm_cvtepi16_epi32(z); + z_ps_1 = _mm_cvtepi32_ps(z_i_1); + z = _mm_srli_si128 (z, 8); + z_i_2 = _mm_cvtepi16_epi32(z); + z_ps_2 = _mm_cvtepi32_ps(z_i_2); + + z_E = _mm_add_ps(z_E, z_ps_1); // Add the complex multiplication results together + z_E = _mm_add_ps(z_E, z_ps_2); // Add the complex multiplication results together + + // Prompt + y = _mm_lddqu_si128((__m128i*)_P_code); // Load the cr + ci, dr + di as cr,ci,dr,di + + yaux = _mm_shuffle_epi8 (y, _mm_set_epi8 (15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0)); + yl = _mm_unpacklo_epi16(yaux, yaux); + yh = _mm_unpackhi_epi16(yaux, yaux); + + tmp1 = _mm_mullo_epi16(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + + tmp2 = _mm_mullo_epi16(bb_signal_sample_suffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + tmp2 = _mm_mullo_epi16(tmp2,_mm_set_epi16 (1, -1, 1, -1, 1, -1, 1, -1)); + z = _mm_add_epi16(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + z_i_1 = _mm_cvtepi16_epi32(z); + z_ps_1 = _mm_cvtepi32_ps(z_i_1); + z = _mm_srli_si128 (z, 8); + z_i_2 = _mm_cvtepi16_epi32(z); + z_ps_2 = _mm_cvtepi32_ps(z_i_2); + + z_P = _mm_add_ps(z_P, z_ps_1); // Add the complex multiplication results together + z_P = _mm_add_ps(z_P, z_ps_2); // Add the complex multiplication results together + + // Late + y = _mm_lddqu_si128((__m128i*)_L_code); // Load the cr + ci, dr + di as cr,ci,dr,di + + yaux = _mm_shuffle_epi8 (y, _mm_set_epi8 (15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0)); + yl = _mm_unpacklo_epi16(yaux, yaux); + yh = _mm_unpackhi_epi16(yaux, yaux); + + tmp1 = _mm_mullo_epi16(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + + tmp2 = _mm_mullo_epi16(bb_signal_sample_suffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + tmp2 = _mm_mullo_epi16(tmp2,_mm_set_epi16 (1, -1, 1, -1, 1, -1, 1, -1)); + z = _mm_add_epi16(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + z_i_1 = _mm_cvtepi16_epi32(z); + z_ps_1 = _mm_cvtepi32_ps(z_i_1); + z = _mm_srli_si128 (z, 8); + z_i_2 = _mm_cvtepi16_epi32(z); + z_ps_2 = _mm_cvtepi32_ps(z_i_2); + + z_L = _mm_add_ps(z_L, z_ps_1); // Add the complex multiplication results together + z_L = _mm_add_ps(z_L, z_ps_2); // Add the complex multiplication results together + + _input += 4; + _carrier += 4; + _E_code += 4; + _L_code += 4; + _P_code += 4; + } + + __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_E[2]; + __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_P[2]; + __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_L[2]; + + _mm_storeu_ps((float*)dotProductVector_E,z_E); // Store the results back into the dot product vector + _mm_storeu_ps((float*)dotProductVector_P,z_P); // Store the results back into the dot product vector + _mm_storeu_ps((float*)dotProductVector_L,z_L); // Store the results back into the dot product vector + + dotProduct_E = ( dotProductVector_E[0] + dotProductVector_E[1] ); + dotProduct_P = ( dotProductVector_P[0] + dotProductVector_P[1] ); + dotProduct_L = ( dotProductVector_L[0] + dotProductVector_L[1] ); + } + + for(int i=0; i < num_points%4; ++i) + { + dotProduct_E += (lv_32fc_t)((*_input) * (*_E_code++)*(*_carrier)); + dotProduct_P += (lv_32fc_t)((*_input) * (*_P_code++)*(*_carrier)); + dotProduct_L += (lv_32fc_t)((*_input++) * (*_L_code++)*(*_carrier++)); + } + + *E_out = dotProduct_E; + *P_out = dotProduct_P; + *L_out = dotProduct_L; + + + +} +#endif /* LV_HAVE_SSE4_1 */ + +#ifdef LV_HAVE_SSE4_1 +#include "smmintrin.h" +/*! + \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation + \param input The input signal input + \param carrier The carrier signal input + \param E_code Early PRN code replica input + \param P_code Early PRN code replica input + \param L_code Early PRN code replica input + \param E_out Early correlation output + \param P_out Early correlation output + \param L_out Early correlation output + \param num_points The number of complex values in vectors + */ +static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_u_sse4_1_second(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points) +{ + const unsigned int sse_iters = num_points / 8; + + __m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample; + __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output; + + __m128 real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc; + __m128i real_output_i_1, real_output_i_2, imag_output_i_1, imag_output_i_2; + __m128 real_output_ps_1, real_output_ps_2, imag_output_ps_1, imag_output_ps_2; + + float E_out_real = 0; + float E_out_imag = 0; + float P_out_real = 0; + float P_out_imag = 0; + float L_out_real = 0; + float L_out_imag = 0; + + const lv_16sc_t* input_ptr = input; + const lv_16sc_t* carrier_ptr = carrier; + + const lv_16sc_t* E_code_ptr = E_code; + lv_32fc_t* E_out_ptr = E_out; + const lv_16sc_t* L_code_ptr = L_code; + lv_32fc_t* L_out_ptr = L_out; + const lv_16sc_t* P_code_ptr = P_code; + lv_32fc_t* P_out_ptr = P_out; + + *E_out_ptr = 0; + *P_out_ptr = 0; + *L_out_ptr = 0; + + mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); + + real_E_code_acc = _mm_setzero_ps(); + imag_E_code_acc = _mm_setzero_ps(); + real_P_code_acc = _mm_setzero_ps(); + imag_P_code_acc = _mm_setzero_ps(); + real_L_code_acc = _mm_setzero_ps(); + imag_L_code_acc = _mm_setzero_ps(); + + if (sse_iters>0) + { + for(int number = 0;number < sse_iters; number++){ + + //Perform the carrier wipe-off + x1 = _mm_lddqu_si128((__m128i*)input_ptr); + input_ptr += 4; + x2 = _mm_lddqu_si128((__m128i*)input_ptr); + + y1 = _mm_lddqu_si128((__m128i*)carrier_ptr); + carrier_ptr += 4; + y2 = _mm_lddqu_si128((__m128i*)carrier_ptr); + + imagx = _mm_srli_si128 (x1, 2); + imagx = _mm_blend_epi16 (x2, imagx, 85); + realx = _mm_slli_si128 (x2, 2); + realx = _mm_blend_epi16 (realx, x1, 85); + + imagy = _mm_srli_si128 (y1, 2); + imagy = _mm_blend_epi16 (y2, imagy, 85); + realy = _mm_slli_si128 (y2, 2); + realy = _mm_blend_epi16 (realy, y1, 85); + + realx_mult_realy = _mm_mullo_epi16 (realx, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); + realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); + + real_bb_signal_sample = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_bb_signal_sample = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + //Get early values + y1 = _mm_lddqu_si128((__m128i*)E_code_ptr); + E_code_ptr += 4; + y2 = _mm_lddqu_si128((__m128i*)E_code_ptr); + + imagy = _mm_srli_si128 (y1, 2); + imagy = _mm_blend_epi16 (y2, imagy, 85); + realy = _mm_slli_si128 (y2, 2); + realy = _mm_blend_epi16 (realy, y1, 85); + + realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); + realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); + + real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + real_output_i_1 = _mm_cvtepi16_epi32(real_output); + real_output_ps_1 = _mm_cvtepi32_ps(real_output_i_1); + real_output = _mm_srli_si128 (real_output, 8); + real_output_i_2 = _mm_cvtepi16_epi32(real_output); + real_output_ps_2 = _mm_cvtepi32_ps(real_output_i_2); + + imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); + imag_output_ps_1 = _mm_cvtepi32_ps(imag_output_i_1); + imag_output = _mm_srli_si128 (imag_output, 8); + imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); + imag_output_ps_2 = _mm_cvtepi32_ps(imag_output_i_2); + + real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps_1); + real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps_2); + imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps_1); + imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps_2); + + //Get prompt values + y1 = _mm_lddqu_si128((__m128i*)P_code_ptr); + P_code_ptr += 4; + y2 = _mm_lddqu_si128((__m128i*)P_code_ptr); + + imagy = _mm_srli_si128 (y1, 2); + imagy = _mm_blend_epi16 (y2, imagy, 85); + realy = _mm_slli_si128 (y2, 2); + realy = _mm_blend_epi16 (realy, y1, 85); + + realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); + realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); + + real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + real_output_i_1 = _mm_cvtepi16_epi32(real_output); + real_output_ps_1 = _mm_cvtepi32_ps(real_output_i_1); + real_output = _mm_srli_si128 (real_output, 8); + real_output_i_2 = _mm_cvtepi16_epi32(real_output); + real_output_ps_2 = _mm_cvtepi32_ps(real_output_i_2); + + imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); + imag_output_ps_1 = _mm_cvtepi32_ps(imag_output_i_1); + imag_output = _mm_srli_si128 (imag_output, 8); + imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); + imag_output_ps_2 = _mm_cvtepi32_ps(imag_output_i_2); + + real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps_1); + real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps_2); + imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps_1); + imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps_2); + + //Get late values + y1 = _mm_lddqu_si128((__m128i*)L_code_ptr); + L_code_ptr += 4; + y2 = _mm_lddqu_si128((__m128i*)L_code_ptr); + + imagy = _mm_srli_si128 (y1, 2); + imagy = _mm_blend_epi16 (y2, imagy, 85); + realy = _mm_slli_si128 (y2, 2); + realy = _mm_blend_epi16 (realy, y1, 85); + + realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); + realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); + + real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + real_output_i_1 = _mm_cvtepi16_epi32(real_output); + real_output_ps_1 = _mm_cvtepi32_ps(real_output_i_1); + real_output = _mm_srli_si128 (real_output, 8); + real_output_i_2 = _mm_cvtepi16_epi32(real_output); + real_output_ps_2 = _mm_cvtepi32_ps(real_output_i_2); + + imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); + imag_output_ps_1 = _mm_cvtepi32_ps(imag_output_i_1); + imag_output = _mm_srli_si128 (imag_output, 8); + imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); + imag_output_ps_2 = _mm_cvtepi32_ps(imag_output_i_2); + + real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps_1); + real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps_2); + imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps_1); + imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps_2); + + input_ptr += 4; + carrier_ptr += 4; + E_code_ptr += 4; + L_code_ptr += 4; + P_code_ptr += 4; + } + + __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4]; + + _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector + + for (int i = 0; i<4; ++i) + { + E_out_real += real_E_dotProductVector[i]; + E_out_imag += imag_E_dotProductVector[i]; + P_out_real += real_P_dotProductVector[i]; + P_out_imag += imag_P_dotProductVector[i]; + L_out_real += real_L_dotProductVector[i]; + L_out_imag += imag_L_dotProductVector[i]; + } + *E_out_ptr = lv_cmake(E_out_real, E_out_imag); + *P_out_ptr = lv_cmake(P_out_real, P_out_imag); + *L_out_ptr = lv_cmake(L_out_real, L_out_imag); + } + + lv_16sc_t bb_signal_sample; + for(int i=0; i < num_points%8; ++i) + { + //Perform the carrier wipe-off + bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); + // Now get early, late, and prompt values for each + *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); + *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); + *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#ifdef LV_HAVE_SSE4_1 +#include "smmintrin.h" +/*! + \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation + \param input The input signal input + \param carrier The carrier signal input + \param E_code Early PRN code replica input + \param P_code Early PRN code replica input + \param L_code Early PRN code replica input + \param E_out Early correlation output + \param P_out Early correlation output + \param L_out Early correlation output + \param num_points The number of complex values in vectors + */ +static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_u_sse4_1_third(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points) +{ + const unsigned int sse_iters = num_points / 8; + unsigned int index = 0; + unsigned int indexPlus4 = 0; + + __m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample; + __m128i realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, real_output_i32, imag_output_i32; + + __m128 real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc; + __m128i real_output_i_1, real_output_i_2, imag_output_i_1, imag_output_i_2; + __m128 real_output_ps, imag_output_ps; + + float E_out_real = 0; + float E_out_imag = 0; + float P_out_real = 0; + float P_out_imag = 0; + float L_out_real = 0; + float L_out_imag = 0; + + const lv_16sc_t* input_ptr = input; + const lv_16sc_t* carrier_ptr = carrier; + + const lv_16sc_t* E_code_ptr = E_code; + lv_32fc_t* E_out_ptr = E_out; + const lv_16sc_t* L_code_ptr = L_code; + lv_32fc_t* L_out_ptr = L_out; + const lv_16sc_t* P_code_ptr = P_code; + lv_32fc_t* P_out_ptr = P_out; + + *E_out_ptr = 0; + *P_out_ptr = 0; + *L_out_ptr = 0; + + real_E_code_acc = _mm_setzero_ps(); + imag_E_code_acc = _mm_setzero_ps(); + real_P_code_acc = _mm_setzero_ps(); + imag_P_code_acc = _mm_setzero_ps(); + real_L_code_acc = _mm_setzero_ps(); + imag_L_code_acc = _mm_setzero_ps(); + + if (sse_iters>0) + { + for(index = 0;index < 8*sse_iters; index+=8){ + indexPlus4 = index + 4; + //Perform the carrier wipe-off + x1 = _mm_lddqu_si128((__m128i*)&input_ptr[index]); + x2 = _mm_lddqu_si128((__m128i*)&input_ptr[indexPlus4]); + + y1 = _mm_lddqu_si128((__m128i*)&carrier_ptr[index]); + y2 = _mm_lddqu_si128((__m128i*)&carrier_ptr[indexPlus4]); + + imagx = _mm_srli_si128 (x1, 2); + imagx = _mm_blend_epi16 (x2, imagx, 85); + realx = _mm_slli_si128 (x2, 2); + realx = _mm_blend_epi16 (realx, x1, 85); + + imagy = _mm_srli_si128 (y1, 2); + imagy = _mm_blend_epi16 (y2, imagy, 85); + realy = _mm_slli_si128 (y2, 2); + realy = _mm_blend_epi16 (realy, y1, 85); + + realx_mult_realy = _mm_mullo_epi16 (realx, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); + realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); + + real_bb_signal_sample = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_bb_signal_sample = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + //Get early values + y1 = _mm_lddqu_si128((__m128i*)&E_code_ptr[index]); + y2 = _mm_lddqu_si128((__m128i*)&E_code_ptr[indexPlus4]); + + imagy = _mm_srli_si128 (y1, 2); + imagy = _mm_blend_epi16 (y2, imagy, 85); + realy = _mm_slli_si128 (y2, 2); + realy = _mm_blend_epi16 (realy, y1, 85); + + realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); + realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); + + real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + real_output_i_1 = _mm_cvtepi16_epi32(real_output); + real_output = _mm_srli_si128 (real_output, 8); + real_output_i_2 = _mm_cvtepi16_epi32(real_output); + real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); + real_output_ps = _mm_cvtepi32_ps(real_output_i32); + + imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); + imag_output = _mm_srli_si128 (imag_output, 8); + imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); + imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2); + imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); + + real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps); + imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps); + + //Get prompt values + y1 = _mm_lddqu_si128((__m128i*)&P_code_ptr[index]); + y2 = _mm_lddqu_si128((__m128i*)&P_code_ptr[indexPlus4]); + + imagy = _mm_srli_si128 (y1, 2); + imagy = _mm_blend_epi16 (y2, imagy, 85); + realy = _mm_slli_si128 (y2, 2); + realy = _mm_blend_epi16 (realy, y1, 85); + + realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); + realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); + + real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + real_output_i_1 = _mm_cvtepi16_epi32(real_output); + real_output = _mm_srli_si128 (real_output, 8); + real_output_i_2 = _mm_cvtepi16_epi32(real_output); + real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); + real_output_ps = _mm_cvtepi32_ps(real_output_i32); + + imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); + imag_output = _mm_srli_si128 (imag_output, 8); + imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); + imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2); + imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); + + real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps); + imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps); + + //Get late values + y1 = _mm_lddqu_si128((__m128i*)&L_code_ptr[index]); + y2 = _mm_lddqu_si128((__m128i*)&L_code_ptr[indexPlus4]); + + imagy = _mm_srli_si128 (y1, 2); + imagy = _mm_blend_epi16 (y2, imagy, 85); + realy = _mm_slli_si128 (y2, 2); + realy = _mm_blend_epi16 (realy, y1, 85); + + realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); + realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); + + real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + real_output_i_1 = _mm_cvtepi16_epi32(real_output); + real_output = _mm_srli_si128 (real_output, 8); + real_output_i_2 = _mm_cvtepi16_epi32(real_output); + real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); + real_output_ps = _mm_cvtepi32_ps(real_output_i32); + + imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); + imag_output = _mm_srli_si128 (imag_output, 8); + imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); + imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2); + imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); + + real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps); + imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps); + } + + __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4]; + + _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector + + for (int i = 0; i<4; ++i) + { + E_out_real += real_E_dotProductVector[i]; + E_out_imag += imag_E_dotProductVector[i]; + P_out_real += real_P_dotProductVector[i]; + P_out_imag += imag_P_dotProductVector[i]; + L_out_real += real_L_dotProductVector[i]; + L_out_imag += imag_L_dotProductVector[i]; + } + *E_out_ptr = lv_cmake(E_out_real, E_out_imag); + *P_out_ptr = lv_cmake(P_out_real, P_out_imag); + *L_out_ptr = lv_cmake(L_out_real, L_out_imag); + } + + lv_16sc_t bb_signal_sample; + for(; index < num_points; index++) + { + //Perform the carrier wipe-off + bb_signal_sample = input_ptr[index] * carrier_ptr[index]; + // Now get early, late, and prompt values for each + *E_out_ptr += (lv_32fc_t) (bb_signal_sample * E_code_ptr[index]); + *P_out_ptr += (lv_32fc_t) (bb_signal_sample * P_code_ptr[index]); + *L_out_ptr += (lv_32fc_t) (bb_signal_sample * L_code_ptr[index]); + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#ifdef LV_HAVE_SSE4_1 +#include "smmintrin.h" +/*! + \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation + \param input The input signal input + \param carrier The carrier signal input + \param E_code Early PRN code replica input + \param P_code Early PRN code replica input + \param L_code Early PRN code replica input + \param E_out Early correlation output + \param P_out Early correlation output + \param L_out Early correlation output + \param num_points The number of complex values in vectors + */ +static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_u_sse4_1_fourth(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points) +{ + const unsigned int sse_iters = num_points / 8; + + __m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample; + __m128i realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, real_output_i32, imag_output_i32; + + __m128 real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc; + __m128i real_output_i_1, real_output_i_2, imag_output_i_1, imag_output_i_2; + __m128 real_output_ps, imag_output_ps; + + float E_out_real = 0; + float E_out_imag = 0; + float P_out_real = 0; + float P_out_imag = 0; + float L_out_real = 0; + float L_out_imag = 0; + + const lv_16sc_t* input_ptr = input; + const lv_16sc_t* carrier_ptr = carrier; + + const lv_16sc_t* E_code_ptr = E_code; + lv_32fc_t* E_out_ptr = E_out; + const lv_16sc_t* L_code_ptr = L_code; + lv_32fc_t* L_out_ptr = L_out; + const lv_16sc_t* P_code_ptr = P_code; + lv_32fc_t* P_out_ptr = P_out; + + *E_out_ptr = 0; + *P_out_ptr = 0; + *L_out_ptr = 0; + + real_E_code_acc = _mm_setzero_ps(); + imag_E_code_acc = _mm_setzero_ps(); + real_P_code_acc = _mm_setzero_ps(); + imag_P_code_acc = _mm_setzero_ps(); + real_L_code_acc = _mm_setzero_ps(); + imag_L_code_acc = _mm_setzero_ps(); + + if (sse_iters>0) + { + for(int number = 0;number < sse_iters; number++){ + + //Perform the carrier wipe-off + x1 = _mm_lddqu_si128((__m128i*)input_ptr); + input_ptr += 4; + x2 = _mm_lddqu_si128((__m128i*)input_ptr); + + y1 = _mm_lddqu_si128((__m128i*)carrier_ptr); + carrier_ptr += 4; + y2 = _mm_lddqu_si128((__m128i*)carrier_ptr); + + imagx = _mm_srli_si128 (x1, 2); + imagx = _mm_blend_epi16 (x2, imagx, 85); + realx = _mm_slli_si128 (x2, 2); + realx = _mm_blend_epi16 (realx, x1, 85); + + imagy = _mm_srli_si128 (y1, 2); + imagy = _mm_blend_epi16 (y2, imagy, 85); + realy = _mm_slli_si128 (y2, 2); + realy = _mm_blend_epi16 (realy, y1, 85); + + realx_mult_realy = _mm_mullo_epi16 (realx, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); + realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); + + real_bb_signal_sample = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_bb_signal_sample = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + //Get early values + y1 = _mm_lddqu_si128((__m128i*)E_code_ptr); + E_code_ptr += 4; + y2 = _mm_lddqu_si128((__m128i*)E_code_ptr); + + imagy = _mm_srli_si128 (y1, 2); + imagy = _mm_blend_epi16 (y2, imagy, 85); + realy = _mm_slli_si128 (y2, 2); + realy = _mm_blend_epi16 (realy, y1, 85); + + realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); + realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); + + real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + real_output_i_1 = _mm_cvtepi16_epi32(real_output); + real_output = _mm_srli_si128 (real_output, 8); + real_output_i_2 = _mm_cvtepi16_epi32(real_output); + real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); + real_output_ps = _mm_cvtepi32_ps(real_output_i32); + + imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); + imag_output = _mm_srli_si128 (imag_output, 8); + imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); + imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2); + imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); + + real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps); + imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps); + + //Get prompt values + y1 = _mm_lddqu_si128((__m128i*)P_code_ptr); + P_code_ptr += 4; + y2 = _mm_lddqu_si128((__m128i*)P_code_ptr); + + imagy = _mm_srli_si128 (y1, 2); + imagy = _mm_blend_epi16 (y2, imagy, 85); + realy = _mm_slli_si128 (y2, 2); + realy = _mm_blend_epi16 (realy, y1, 85); + + realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); + realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); + + real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + real_output_i_1 = _mm_cvtepi16_epi32(real_output); + real_output = _mm_srli_si128 (real_output, 8); + real_output_i_2 = _mm_cvtepi16_epi32(real_output); + real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); + real_output_ps = _mm_cvtepi32_ps(real_output_i32); + + imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); + imag_output = _mm_srli_si128 (imag_output, 8); + imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); + imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2); + imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); + + real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps); + imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps); + + //Get late values + y1 = _mm_lddqu_si128((__m128i*)L_code_ptr); + L_code_ptr += 4; + y2 = _mm_lddqu_si128((__m128i*)L_code_ptr); + + imagy = _mm_srli_si128 (y1, 2); + imagy = _mm_blend_epi16 (y2, imagy, 85); + realy = _mm_slli_si128 (y2, 2); + realy = _mm_blend_epi16 (realy, y1, 85); + + realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); + realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); + + real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + real_output_i_1 = _mm_cvtepi16_epi32(real_output); + real_output = _mm_srli_si128 (real_output, 8); + real_output_i_2 = _mm_cvtepi16_epi32(real_output); + real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); + real_output_ps = _mm_cvtepi32_ps(real_output_i32); + + imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); + imag_output = _mm_srli_si128 (imag_output, 8); + imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); + imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2); + imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); + + real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps); + imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps); + + input_ptr += 4; + carrier_ptr += 4; + E_code_ptr += 4; + L_code_ptr += 4; + P_code_ptr += 4; + } + + __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4]; + + _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector + + for (int i = 0; i<4; ++i) + { + E_out_real += real_E_dotProductVector[i]; + E_out_imag += imag_E_dotProductVector[i]; + P_out_real += real_P_dotProductVector[i]; + P_out_imag += imag_P_dotProductVector[i]; + L_out_real += real_L_dotProductVector[i]; + L_out_imag += imag_L_dotProductVector[i]; + } + *E_out_ptr = lv_cmake(E_out_real, E_out_imag); + *P_out_ptr = lv_cmake(P_out_real, P_out_imag); + *L_out_ptr = lv_cmake(L_out_real, L_out_imag); + } + + lv_16sc_t bb_signal_sample; + for(int i=0; i < num_points%8; ++i) + { + //Perform the carrier wipe-off + bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); + // Now get early, late, and prompt values for each + *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); + *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); + *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#ifdef LV_HAVE_SSE4_1 +#include "smmintrin.h" +#include "CommonMacros/CommonMacros.h" +/*! + \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation + \param input The input signal input + \param carrier The carrier signal input + \param E_code Early PRN code replica input + \param P_code Early PRN code replica input + \param L_code Early PRN code replica input + \param E_out Early correlation output + \param P_out Early correlation output + \param L_out Early correlation output + \param num_points The number of complex values in vectors + */ + +static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_u_sse4_1_fifth(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points) +{ + const unsigned int sse_iters = num_points / 8; + + __m128i realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy; + __m128i input_i_1, input_i_2, output_i32; + + __m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample; + __m128i realx, imagx, realy, imagy, real_output, imag_output; + + __m128 real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc; + __m128 real_output_ps, imag_output_ps; + + float E_out_real = 0; + float E_out_imag = 0; + float P_out_real = 0; + float P_out_imag = 0; + float L_out_real = 0; + float L_out_imag = 0; + + const lv_16sc_t* input_ptr = input; + const lv_16sc_t* carrier_ptr = carrier; + + const lv_16sc_t* E_code_ptr = E_code; + lv_32fc_t* E_out_ptr = E_out; + const lv_16sc_t* L_code_ptr = L_code; + lv_32fc_t* L_out_ptr = L_out; + const lv_16sc_t* P_code_ptr = P_code; + lv_32fc_t* P_out_ptr = P_out; + + *E_out_ptr = 0; + *P_out_ptr = 0; + *L_out_ptr = 0; + + real_E_code_acc = _mm_setzero_ps(); + imag_E_code_acc = _mm_setzero_ps(); + real_P_code_acc = _mm_setzero_ps(); + imag_P_code_acc = _mm_setzero_ps(); + real_L_code_acc = _mm_setzero_ps(); + imag_L_code_acc = _mm_setzero_ps(); + + if (sse_iters>0) + { + for(int number = 0;number < sse_iters; number++){ + + //Perform the carrier wipe-off + x1 = _mm_lddqu_si128((__m128i*)input_ptr); + input_ptr += 4; + x2 = _mm_lddqu_si128((__m128i*)input_ptr); + + y1 = _mm_lddqu_si128((__m128i*)carrier_ptr); + carrier_ptr += 4; + y2 = _mm_lddqu_si128((__m128i*)carrier_ptr); + + CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(x1, x2, realx, imagx) + CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy) + CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample) + + //Get early values + y1 = _mm_lddqu_si128((__m128i*)E_code_ptr); + E_code_ptr += 4; + y2 = _mm_lddqu_si128((__m128i*)E_code_ptr); + + CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy) + CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(real_bb_signal_sample, imag_bb_signal_sample, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output) + + CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(real_output, input_i_1, input_i_2, output_i32, real_output_ps) + CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(imag_output, input_i_1, input_i_2, output_i32, imag_output_ps) + + real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps); + imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps); + + //Get prompt values + y1 = _mm_lddqu_si128((__m128i*)P_code_ptr); + P_code_ptr += 4; + y2 = _mm_lddqu_si128((__m128i*)P_code_ptr); + + CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy) + CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(real_bb_signal_sample, imag_bb_signal_sample, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output) + + CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(real_output, input_i_1, input_i_2, output_i32, real_output_ps) + CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(imag_output, input_i_1, input_i_2, output_i32, imag_output_ps) + + real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps); + imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps); + + //Get late values + y1 = _mm_lddqu_si128((__m128i*)L_code_ptr); + L_code_ptr += 4; + y2 = _mm_lddqu_si128((__m128i*)L_code_ptr); + + CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy) + CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(real_bb_signal_sample, imag_bb_signal_sample, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output) + + CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(real_output, input_i_1, input_i_2, output_i32, real_output_ps) + CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(imag_output, input_i_1, input_i_2, output_i32, imag_output_ps) + + real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps); + imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps); + + input_ptr += 4; + carrier_ptr += 4; + E_code_ptr += 4; + L_code_ptr += 4; + P_code_ptr += 4; + } + + __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4]; + + _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector + + for (int i = 0; i<4; ++i) + { + E_out_real += real_E_dotProductVector[i]; + E_out_imag += imag_E_dotProductVector[i]; + P_out_real += real_P_dotProductVector[i]; + P_out_imag += imag_P_dotProductVector[i]; + L_out_real += real_L_dotProductVector[i]; + L_out_imag += imag_L_dotProductVector[i]; + } + *E_out_ptr = lv_cmake(E_out_real, E_out_imag); + *P_out_ptr = lv_cmake(P_out_real, P_out_imag); + *L_out_ptr = lv_cmake(L_out_real, L_out_imag); + } + + lv_16sc_t bb_signal_sample; + for(int i=0; i < num_points%8; ++i) + { + //Perform the carrier wipe-off + bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); + // Now get early, late, and prompt values for each + *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); + *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); + *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#ifdef LV_HAVE_SSE4_1 +#include "smmintrin.h" +#include "CommonMacros/CommonMacros_16ic_cw_epl_corr_32fc.h" +#include "CommonMacros/CommonMacros.h" +/*! + \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation + \param input The input signal input + \param carrier The carrier signal input + \param E_code Early PRN code replica input + \param P_code Early PRN code replica input + \param L_code Early PRN code replica input + \param E_out Early correlation output + \param P_out Early correlation output + \param L_out Early correlation output + \param num_points The number of complex values in vectors + */ + +static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_u_sse4_1_sixth(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points) +{ + const unsigned int sse_iters = num_points / 8; + + __m128i realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy; + __m128i input_i_1, input_i_2, output_i32; + + __m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample; + __m128i realx, imagx, realy, imagy, real_output, imag_output; + + __m128 real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc; + __m128 real_output_ps, imag_output_ps; + + float E_out_real = 0; + float E_out_imag = 0; + float P_out_real = 0; + float P_out_imag = 0; + float L_out_real = 0; + float L_out_imag = 0; + + const lv_16sc_t* input_ptr = input; + const lv_16sc_t* carrier_ptr = carrier; + + const lv_16sc_t* E_code_ptr = E_code; + lv_32fc_t* E_out_ptr = E_out; + const lv_16sc_t* L_code_ptr = L_code; + lv_32fc_t* L_out_ptr = L_out; + const lv_16sc_t* P_code_ptr = P_code; + lv_32fc_t* P_out_ptr = P_out; + + *E_out_ptr = 0; + *P_out_ptr = 0; + *L_out_ptr = 0; + + real_E_code_acc = _mm_setzero_ps(); + imag_E_code_acc = _mm_setzero_ps(); + real_P_code_acc = _mm_setzero_ps(); + imag_P_code_acc = _mm_setzero_ps(); + real_L_code_acc = _mm_setzero_ps(); + imag_L_code_acc = _mm_setzero_ps(); + + if (sse_iters>0) + { + for(int number = 0;number < sse_iters; number++){ + + //Perform the carrier wipe-off + x1 = _mm_lddqu_si128((__m128i*)input_ptr); + input_ptr += 4; + x2 = _mm_lddqu_si128((__m128i*)input_ptr); + + y1 = _mm_lddqu_si128((__m128i*)carrier_ptr); + carrier_ptr += 4; + y2 = _mm_lddqu_si128((__m128i*)carrier_ptr); + + CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(x1, x2, realx, imagx) + CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy) + CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample) + + //Get early values + y1 = _mm_lddqu_si128((__m128i*)E_code_ptr); + E_code_ptr += 4; + y2 = _mm_lddqu_si128((__m128i*)E_code_ptr); + + CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) + + real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps); + imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps); + + //Get prompt values + y1 = _mm_lddqu_si128((__m128i*)P_code_ptr); + P_code_ptr += 4; + y2 = _mm_lddqu_si128((__m128i*)P_code_ptr); + + CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) + + real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps); + imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps); + + //Get late values + y1 = _mm_lddqu_si128((__m128i*)L_code_ptr); + L_code_ptr += 4; + y2 = _mm_lddqu_si128((__m128i*)L_code_ptr); + + CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) + + real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps); + imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps); + + input_ptr += 4; + carrier_ptr += 4; + E_code_ptr += 4; + L_code_ptr += 4; + P_code_ptr += 4; + } + + __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4]; + + _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector + + for (int i = 0; i<4; ++i) + { + E_out_real += real_E_dotProductVector[i]; + E_out_imag += imag_E_dotProductVector[i]; + P_out_real += real_P_dotProductVector[i]; + P_out_imag += imag_P_dotProductVector[i]; + L_out_real += real_L_dotProductVector[i]; + L_out_imag += imag_L_dotProductVector[i]; + } + *E_out_ptr = lv_cmake(E_out_real, E_out_imag); + *P_out_ptr = lv_cmake(P_out_real, P_out_imag); + *L_out_ptr = lv_cmake(L_out_real, L_out_imag); + } + + lv_16sc_t bb_signal_sample; + for(int i=0; i < num_points%8; ++i) + { + //Perform the carrier wipe-off + bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); + // Now get early, late, and prompt values for each + *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); + *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); + *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation + \param input The input signal input + \param carrier The carrier signal input + \param E_code Early PRN code replica input + \param P_code Early PRN code replica input + \param L_code Early PRN code replica input + \param E_out Early correlation output + \param P_out Early correlation output + \param L_out Early correlation output + \param num_points The number of complex values in vectors + */ +static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_generic(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points) +{ + lv_16sc_t bb_signal_sample; + lv_16sc_t tmp1; + lv_16sc_t tmp2; + lv_16sc_t tmp3; + + bb_signal_sample = lv_cmake(0, 0); + + *E_out = 0; + *P_out = 0; + *L_out = 0; + // perform Early, Prompt and Late correlation + + for(int i=0; i < num_points; ++i) + { + //Perform the carrier wipe-off + bb_signal_sample = input[i] * carrier[i]; + + tmp1 = bb_signal_sample * E_code[i]; + tmp2 = bb_signal_sample * P_code[i]; + tmp3 = bb_signal_sample * L_code[i]; + + // Now get early, late, and prompt values for each + *E_out += (lv_32fc_t)tmp1; + *P_out += (lv_32fc_t)tmp2; + *L_out += (lv_32fc_t)tmp3; + } +} +#endif /* LV_HAVE_GENERIC */ +#endif /* INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_u_H */ + + +#ifndef INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_a_H +#define INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_a_H + +#include +#include +#include +#include +#include +// +//#ifdef LV_HAVE_SSE4_1 +//#include "smmintrin.h" +///*! +// \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation +// \param input The input signal input +// \param carrier The carrier signal input +// \param E_code Early PRN code replica input +// \param P_code Early PRN code replica input +// \param L_code Early PRN code replica input +// \param E_out Early correlation output +// \param P_out Early correlation output +// \param L_out Early correlation output +// \param num_points The number of complex values in vectors +// */ +//static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_a_sse4_1(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points) +//{ +// const unsigned int sse_iters = num_points / 8; +// +// __m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample; +// __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output; +// +// __m128 real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc; +// __m128i real_output_i_1, real_output_i_2, imag_output_i_1, imag_output_i_2; +// __m128 real_output_ps_1, real_output_ps_2, imag_output_ps_1, imag_output_ps_2; +// +// float E_out_real = 0; +// float E_out_imag = 0; +// float P_out_real = 0; +// float P_out_imag = 0; +// float L_out_real = 0; +// float L_out_imag = 0; +// +// const lv_16sc_t* input_ptr = input; +// const lv_16sc_t* carrier_ptr = carrier; +// +// const lv_16sc_t* E_code_ptr = E_code; +// lv_32fc_t* E_out_ptr = E_out; +// const lv_16sc_t* L_code_ptr = L_code; +// lv_32fc_t* L_out_ptr = L_out; +// const lv_16sc_t* P_code_ptr = P_code; +// lv_32fc_t* P_out_ptr = P_out; +// +// *E_out_ptr = 0; +// *P_out_ptr = 0; +// *L_out_ptr = 0; +// +// mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); +// +// real_E_code_acc = _mm_setzero_ps(); +// imag_E_code_acc = _mm_setzero_ps(); +// real_P_code_acc = _mm_setzero_ps(); +// imag_P_code_acc = _mm_setzero_ps(); +// real_L_code_acc = _mm_setzero_ps(); +// imag_L_code_acc = _mm_setzero_ps(); +// +// if (sse_iters>0) +// { +// for(int number = 0;number < sse_iters; number++){ +// +// //Perform the carrier wipe-off +// x1 = _mm_lddqu_si128((__m128i*)input_ptr); +// input_ptr += 4; +// x2 = _mm_lddqu_si128((__m128i*)input_ptr); +// +// y1 = _mm_lddqu_si128((__m128i*)carrier_ptr); +// carrier_ptr += 4; +// y2 = _mm_lddqu_si128((__m128i*)carrier_ptr); +// +// imagx = _mm_srli_si128 (x1, 2); +// imagx = _mm_blend_epi16 (x2, imagx, 85); +// realx = _mm_slli_si128 (x2, 2); +// realx = _mm_blend_epi16 (realx, x1, 85); +// +// imagy = _mm_srli_si128 (y1, 2); +// imagy = _mm_blend_epi16 (y2, imagy, 85); +// realy = _mm_slli_si128 (y2, 2); +// realy = _mm_blend_epi16 (realy, y1, 85); +// +// realx_mult_realy = _mm_mullo_epi16 (realx, realy); +// imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); +// realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); +// imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); +// +// real_bb_signal_sample = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); +// imag_bb_signal_sample = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); +// +// //Get early values +// y1 = _mm_lddqu_si128((__m128i*)E_code_ptr); +// E_code_ptr += 4; +// y2 = _mm_lddqu_si128((__m128i*)E_code_ptr); +// +// imagy = _mm_srli_si128 (y1, 2); +// imagy = _mm_blend_epi16 (y2, imagy, 85); +// realy = _mm_slli_si128 (y2, 2); +// realy = _mm_blend_epi16 (realy, y1, 85); +// +// realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); +// imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); +// realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); +// imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); +// +// real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); +// imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); +// +// real_output_i_1 = _mm_cvtepi16_epi32(real_output); +// real_output_ps_1 = _mm_cvtepi32_ps(real_output_i_1); +// real_output = _mm_srli_si128 (real_output, 8); +// real_output_i_2 = _mm_cvtepi16_epi32(real_output); +// real_output_ps_2 = _mm_cvtepi32_ps(real_output_i_2); +// +// imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); +// imag_output_ps_1 = _mm_cvtepi32_ps(imag_output_i_1); +// imag_output = _mm_srli_si128 (imag_output, 8); +// imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); +// imag_output_ps_2 = _mm_cvtepi32_ps(imag_output_i_2); +// +// real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps_1); +// real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps_2); +// imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps_1); +// imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps_2); +// +// //Get prompt values +// y1 = _mm_lddqu_si128((__m128i*)P_code_ptr); +// P_code_ptr += 4; +// y2 = _mm_lddqu_si128((__m128i*)P_code_ptr); +// +// imagy = _mm_srli_si128 (y1, 2); +// imagy = _mm_blend_epi16 (y2, imagy, 85); +// realy = _mm_slli_si128 (y2, 2); +// realy = _mm_blend_epi16 (realy, y1, 85); +// +// realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); +// imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); +// realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); +// imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); +// +// real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); +// imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); +// +// real_output_i_1 = _mm_cvtepi16_epi32(real_output); +// real_output_ps_1 = _mm_cvtepi32_ps(real_output_i_1); +// real_output = _mm_srli_si128 (real_output, 8); +// real_output_i_2 = _mm_cvtepi16_epi32(real_output); +// real_output_ps_2 = _mm_cvtepi32_ps(real_output_i_2); +// +// imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); +// imag_output_ps_1 = _mm_cvtepi32_ps(imag_output_i_1); +// imag_output = _mm_srli_si128 (imag_output, 8); +// imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); +// imag_output_ps_2 = _mm_cvtepi32_ps(imag_output_i_2); +// +// real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps_1); +// real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps_2); +// imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps_1); +// imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps_2); +// +// //Get late values +// y1 = _mm_lddqu_si128((__m128i*)L_code_ptr); +// L_code_ptr += 4; +// y2 = _mm_lddqu_si128((__m128i*)L_code_ptr); +// +// imagy = _mm_srli_si128 (y1, 2); +// imagy = _mm_blend_epi16 (y2, imagy, 85); +// realy = _mm_slli_si128 (y2, 2); +// realy = _mm_blend_epi16 (realy, y1, 85); +// +// realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); +// imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); +// realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); +// imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); +// +// real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); +// imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); +// +// real_output_i_1 = _mm_cvtepi16_epi32(real_output); +// real_output_ps_1 = _mm_cvtepi32_ps(real_output_i_1); +// real_output = _mm_srli_si128 (real_output, 8); +// real_output_i_2 = _mm_cvtepi16_epi32(real_output); +// real_output_ps_2 = _mm_cvtepi32_ps(real_output_i_2); +// +// imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); +// imag_output_ps_1 = _mm_cvtepi32_ps(imag_output_i_1); +// imag_output = _mm_srli_si128 (imag_output, 8); +// imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); +// imag_output_ps_2 = _mm_cvtepi32_ps(imag_output_i_2); +// +// real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps_1); +// real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps_2); +// imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps_1); +// imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps_2); +// +// input_ptr += 4; +// carrier_ptr += 4; +// E_code_ptr += 4; +// L_code_ptr += 4; +// P_code_ptr += 4; +// } +// +// __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4]; +// __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4]; +// __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4]; +// __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4]; +// __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4]; +// __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4]; +// +// _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector +// _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector +// _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector +// _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector +// _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector +// _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector +// +// for (int i = 0; i<4; ++i) +// { +// E_out_real += real_E_dotProductVector[i]; +// E_out_imag += imag_E_dotProductVector[i]; +// P_out_real += real_P_dotProductVector[i]; +// P_out_imag += imag_P_dotProductVector[i]; +// L_out_real += real_L_dotProductVector[i]; +// L_out_imag += imag_L_dotProductVector[i]; +// } +// *E_out_ptr = lv_cmake(E_out_real, E_out_imag); +// *P_out_ptr = lv_cmake(P_out_real, P_out_imag); +// *L_out_ptr = lv_cmake(L_out_real, L_out_imag); +// } +// +// lv_16sc_t bb_signal_sample; +// for(int i=0; i < num_points%8; ++i) +// { +// //Perform the carrier wipe-off +// bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); +// // Now get early, late, and prompt values for each +// *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); +// *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); +// *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); +// } +//} +//#endif /* LV_HAVE_SSE4_1 */ +// +#ifdef LV_HAVE_GENERIC +/*! + \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation + \param input The input signal input + \param carrier The carrier signal input + \param E_code Early PRN code replica input + \param P_code Early PRN code replica input + \param L_code Early PRN code replica input + \param E_out Early correlation output + \param P_out Early correlation output + \param L_out Early correlation output + \param num_points The number of complex values in vectors + */ +static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_a_generic(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points) +{ + lv_16sc_t bb_signal_sample; + lv_16sc_t tmp1; + lv_16sc_t tmp2; + lv_16sc_t tmp3; + + bb_signal_sample = lv_cmake(0, 0); + + *E_out = 0; + *P_out = 0; + *L_out = 0; + // perform Early, Prompt and Late correlation + + for(int i=0; i < num_points; ++i) + { + //Perform the carrier wipe-off + bb_signal_sample = input[i] * carrier[i]; + + tmp1 = bb_signal_sample * E_code[i]; + tmp2 = bb_signal_sample * P_code[i]; + tmp3 = bb_signal_sample * L_code[i]; + + // Now get early, late, and prompt values for each + *E_out += (lv_32fc_t)tmp1; + *P_out += (lv_32fc_t)tmp2; + *L_out += (lv_32fc_t)tmp3; + } +} +#endif /* LV_HAVE_GENERIC */ +#endif /* INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_a_H */ diff --git a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5.h b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5.h new file mode 100644 index 000000000..af207b92f --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5.h @@ -0,0 +1,595 @@ +/*! + * \file volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5.h + * \brief Volk protokernel: performs the carrier wipe-off mixing and the Very early, Early, Prompt, Late and very late correlation with 32 bits vectors and returns float32 values. + * \authors
    + *
  • Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com + *
+ * + * Volk protokernel that performs the carrier wipe-off mixing and the + * Very Early, Early, Prompt, Late and Very Late correlation with 32 bits vectors (16 bits the + * real part and 16 bits the imaginary part) and accumulates into float32 values, returning them: + * - The carrier wipe-off is done by multiplying the input signal by the + * carrier (multiplication of 32 bits vectors) It returns the input + * signal in base band (BB) + * - Very Early values are calculated by multiplying the input signal in BB by the + * very early code (multiplication of 32 bits vectors), converting that to float32 and accumulating the results + * - Early values are calculated by multiplying the input signal in BB by the + * early code (multiplication of 32 bits vectors), converting that to float32 and accumulating the results + * - Prompt values are calculated by multiplying the input signal in BB by the + * prompt code (multiplication of 32 bits vectors), converting that to float32 and accumulating the results + * - Late values are calculated by multiplying the input signal in BB by the + * late code (multiplication of 32 bits vectors), converting that to float32 and accumulating the results + * - Very Late values are calculated by multiplying the input signal in BB by the + * very late code (multiplication of 32 bits vectors), converting that to float32 and accumulating the results + * + * ------------------------------------------------------------------------- + * + * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) + * + * GNSS-SDR is a software defined Global Navigation + * Satellite Systems receiver + * + * This file is part of GNSS-SDR. + * + * GNSS-SDR is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * at your option) any later version. + * + * GNSS-SDR is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNSS-SDR. If not, see . + * + * ------------------------------------------------------------------------- + */ + +#ifndef INCLUDED_gnsssdr_volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_u_H +#define INCLUDED_gnsssdr_volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_u_H + +#include +#include +#include +#include +#include + +#ifdef LV_HAVE_SSE4_1 +#include "smmintrin.h" +#include "CommonMacros/CommonMacros_16ic_cw_epl_corr_32fc.h" +#include "CommonMacros/CommonMacros.h" + /*! + \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Vate correlation + \param input The input signal input + \param carrier The carrier signal input + \param VE_code Very Early PRN code replica input + \param E_code Early PRN code replica input + \param P_code Prompt PRN code replica input + \param L_code Late PRN code replica input + \param VL_code Very Late PRN code replica input + \param VE_out Very Early correlation output + \param E_out Early correlation output + \param P_out Prompt correlation output + \param L_out Late correlation output + \param VL_out Very Late correlation output + \param num_points The number of complex values in vectors + */ +static inline void volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_u_sse4_1(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* VE_code, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, const lv_16sc_t* VL_code, unsigned int num_points) +{ + const unsigned int sse_iters = num_points / 8; + + __m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample; + __m128i realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output; + + __m128 real_VE_code_acc, imag_VE_code_acc, real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc, real_VL_code_acc, imag_VL_code_acc; + __m128i input_i_1, input_i_2, output_i32; + __m128 real_output_ps, imag_output_ps; + + float VE_out_real = 0; + float VE_out_imag = 0; + float E_out_real = 0; + float E_out_imag = 0; + float P_out_real = 0; + float P_out_imag = 0; + float L_out_real = 0; + float L_out_imag = 0; + float VL_out_real = 0; + float VL_out_imag = 0; + + const lv_16sc_t* input_ptr = input; + const lv_16sc_t* carrier_ptr = carrier; + + const lv_16sc_t* VE_code_ptr = VE_code; + lv_32fc_t* VE_out_ptr = VE_out; + const lv_16sc_t* E_code_ptr = E_code; + lv_32fc_t* E_out_ptr = E_out; + const lv_16sc_t* L_code_ptr = L_code; + lv_32fc_t* L_out_ptr = L_out; + const lv_16sc_t* P_code_ptr = P_code; + lv_32fc_t* P_out_ptr = P_out; + const lv_16sc_t* VL_code_ptr = VL_code; + lv_32fc_t* VL_out_ptr = VL_out; + + *VE_out_ptr = 0; + *E_out_ptr = 0; + *P_out_ptr = 0; + *L_out_ptr = 0; + *VL_out_ptr = 0; + + real_VE_code_acc = _mm_setzero_ps(); + imag_VE_code_acc = _mm_setzero_ps(); + real_E_code_acc = _mm_setzero_ps(); + imag_E_code_acc = _mm_setzero_ps(); + real_P_code_acc = _mm_setzero_ps(); + imag_P_code_acc = _mm_setzero_ps(); + real_L_code_acc = _mm_setzero_ps(); + imag_L_code_acc = _mm_setzero_ps(); + real_VL_code_acc = _mm_setzero_ps(); + imag_VL_code_acc = _mm_setzero_ps(); + + if (sse_iters>0) + { + for(int number = 0;number < sse_iters; number++){ + + //Perform the carrier wipe-off + x1 = _mm_lddqu_si128((__m128i*)input_ptr); + input_ptr += 4; + x2 = _mm_lddqu_si128((__m128i*)input_ptr); + + y1 = _mm_lddqu_si128((__m128i*)carrier_ptr); + carrier_ptr += 4; + y2 = _mm_lddqu_si128((__m128i*)carrier_ptr); + + CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(x1, x2, realx, imagx) + CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy) + CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample) + + //Get very early values + y1 = _mm_lddqu_si128((__m128i*)VE_code_ptr); + VE_code_ptr += 4; + y2 = _mm_lddqu_si128((__m128i*)VE_code_ptr); + + CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) + + real_VE_code_acc = _mm_add_ps (real_VE_code_acc, real_output_ps); + imag_VE_code_acc = _mm_add_ps (imag_VE_code_acc, imag_output_ps); + + //Get early values + y1 = _mm_lddqu_si128((__m128i*)E_code_ptr); + E_code_ptr += 4; + y2 = _mm_lddqu_si128((__m128i*)E_code_ptr); + + CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) + + real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps); + imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps); + + //Get prompt values + y1 = _mm_lddqu_si128((__m128i*)P_code_ptr); + P_code_ptr += 4; + y2 = _mm_lddqu_si128((__m128i*)P_code_ptr); + + CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) + + real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps); + imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps); + + //Get late values + y1 = _mm_lddqu_si128((__m128i*)L_code_ptr); + L_code_ptr += 4; + y2 = _mm_lddqu_si128((__m128i*)L_code_ptr); + + CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) + + real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps); + imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps); + + //Get very late values + y1 = _mm_lddqu_si128((__m128i*)VL_code_ptr); + VL_code_ptr += 4; + y2 = _mm_lddqu_si128((__m128i*)VL_code_ptr); + + CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) + + real_VL_code_acc = _mm_add_ps (real_VL_code_acc, real_output_ps); + imag_VL_code_acc = _mm_add_ps (imag_VL_code_acc, imag_output_ps); + + input_ptr += 4; + carrier_ptr += 4; + VE_code_ptr += 4; + E_code_ptr += 4; + P_code_ptr += 4; + L_code_ptr += 4; + VL_code_ptr += 4; + } + + __VOLK_ATTR_ALIGNED(16) float real_VE_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_VE_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_VL_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_VL_dotProductVector[4]; + + _mm_storeu_ps((float*)real_VE_dotProductVector,real_VE_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_VE_dotProductVector,imag_VE_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)real_VL_dotProductVector,real_VL_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_VL_dotProductVector,imag_VL_code_acc); // Store the results back into the dot product vector + + for (int i = 0; i<4; ++i) + { + VE_out_real += real_VE_dotProductVector[i]; + VE_out_imag += imag_VE_dotProductVector[i]; + E_out_real += real_E_dotProductVector[i]; + E_out_imag += imag_E_dotProductVector[i]; + P_out_real += real_P_dotProductVector[i]; + P_out_imag += imag_P_dotProductVector[i]; + L_out_real += real_L_dotProductVector[i]; + L_out_imag += imag_L_dotProductVector[i]; + VL_out_real += real_VL_dotProductVector[i]; + VL_out_imag += imag_VL_dotProductVector[i]; + } + *VE_out_ptr = lv_cmake(VE_out_real, VE_out_imag); + *E_out_ptr = lv_cmake(E_out_real, E_out_imag); + *P_out_ptr = lv_cmake(P_out_real, P_out_imag); + *L_out_ptr = lv_cmake(L_out_real, L_out_imag); + *VL_out_ptr = lv_cmake(VL_out_real, VL_out_imag); + } + + lv_16sc_t bb_signal_sample; + for(int i=0; i < num_points%8; ++i) + { + //Perform the carrier wipe-off + bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); + // Now get early, late, and prompt values for each + *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VE_code_ptr++)); + *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); + *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); + *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); + *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VL_code_ptr++)); + } + +} +#endif /* LV_HAVE_SSE4_1 */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Vate correlation + \param input The input signal input + \param carrier The carrier signal input + \param VE_code Very Early PRN code replica input + \param E_code Early PRN code replica input + \param P_code Prompt PRN code replica input + \param L_code Late PRN code replica input + \param VL_code Very Late PRN code replica input + \param VE_out Very Early correlation output + \param E_out Early correlation output + \param P_out Prompt correlation output + \param L_out Late correlation output + \param VL_out Very Late correlation output + \param num_points The number of complex values in vectors + */ +static inline void volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* VE_code, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, const lv_16sc_t* VL_code, unsigned int num_points) +{ + lv_16sc_t bb_signal_sample; + lv_16sc_t tmp1; + lv_16sc_t tmp2; + lv_16sc_t tmp3; + lv_16sc_t tmp4; + lv_16sc_t tmp5; + + bb_signal_sample = lv_cmake(0, 0); + + *VE_out = 0; + *E_out = 0; + *P_out = 0; + *L_out = 0; + *VL_out = 0; + // perform Early, Prompt and Late correlation + + for(int i=0; i < num_points; ++i) + { + //Perform the carrier wipe-off + bb_signal_sample = input[i] * carrier[i]; + + tmp1 = bb_signal_sample * VE_code[i]; + tmp2 = bb_signal_sample * E_code[i]; + tmp3 = bb_signal_sample * P_code[i]; + tmp4 = bb_signal_sample * L_code[i]; + tmp5 = bb_signal_sample * VL_code[i]; + + // Now get early, late, and prompt values for each + *VE_out += (lv_32fc_t)tmp1; + *E_out += (lv_32fc_t)tmp2; + *P_out += (lv_32fc_t)tmp3; + *L_out += (lv_32fc_t)tmp4; + *VL_out += (lv_32fc_t)tmp5; + } +} +#endif /* LV_HAVE_GENERIC */ +#endif /* INCLUDED_gnsssdr_volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_u_H */ + + +#ifndef INCLUDED_gnsssdr_volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_a_H +#define INCLUDED_gnsssdr_volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_a_H + +#include +#include +#include +#include +#include + +#ifdef LV_HAVE_SSE4_1 +#include "smmintrin.h" +#include "CommonMacros/CommonMacros_16ic_cw_epl_corr_32fc.h" +#include "CommonMacros/CommonMacros.h" +/*! + \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Vate correlation + \param input The input signal input + \param carrier The carrier signal input + \param VE_code Very Early PRN code replica input + \param E_code Early PRN code replica input + \param P_code Prompt PRN code replica input + \param L_code Late PRN code replica input + \param VL_code Very Late PRN code replica input + \param VE_out Very Early correlation output + \param E_out Early correlation output + \param P_out Prompt correlation output + \param L_out Late correlation output + \param VL_out Very Late correlation output + \param num_points The number of complex values in vectors + */ +static inline void volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_a_sse4_1(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* VE_code, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, const lv_16sc_t* VL_code, unsigned int num_points) +{ + const unsigned int sse_iters = num_points / 8; + + __m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample; + __m128i realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output; + + __m128 real_VE_code_acc, imag_VE_code_acc, real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc, real_VL_code_acc, imag_VL_code_acc; + __m128i input_i_1, input_i_2, output_i32; + __m128 real_output_ps, imag_output_ps; + + float VE_out_real = 0; + float VE_out_imag = 0; + float E_out_real = 0; + float E_out_imag = 0; + float P_out_real = 0; + float P_out_imag = 0; + float L_out_real = 0; + float L_out_imag = 0; + float VL_out_real = 0; + float VL_out_imag = 0; + + const lv_16sc_t* input_ptr = input; + const lv_16sc_t* carrier_ptr = carrier; + + const lv_16sc_t* VE_code_ptr = VE_code; + lv_32fc_t* VE_out_ptr = VE_out; + const lv_16sc_t* E_code_ptr = E_code; + lv_32fc_t* E_out_ptr = E_out; + const lv_16sc_t* L_code_ptr = L_code; + lv_32fc_t* L_out_ptr = L_out; + const lv_16sc_t* P_code_ptr = P_code; + lv_32fc_t* P_out_ptr = P_out; + const lv_16sc_t* VL_code_ptr = VL_code; + lv_32fc_t* VL_out_ptr = VL_out; + + *VE_out_ptr = 0; + *E_out_ptr = 0; + *P_out_ptr = 0; + *L_out_ptr = 0; + *VL_out_ptr = 0; + + real_VE_code_acc = _mm_setzero_ps(); + imag_VE_code_acc = _mm_setzero_ps(); + real_E_code_acc = _mm_setzero_ps(); + imag_E_code_acc = _mm_setzero_ps(); + real_P_code_acc = _mm_setzero_ps(); + imag_P_code_acc = _mm_setzero_ps(); + real_L_code_acc = _mm_setzero_ps(); + imag_L_code_acc = _mm_setzero_ps(); + real_VL_code_acc = _mm_setzero_ps(); + imag_VL_code_acc = _mm_setzero_ps(); + + if (sse_iters>0) + { + for(int number = 0;number < sse_iters; number++){ + + //Perform the carrier wipe-off + x1 = _mm_load_si128((__m128i*)input_ptr); + input_ptr += 4; + x2 = _mm_load_si128((__m128i*)input_ptr); + + y1 = _mm_load_si128((__m128i*)carrier_ptr); + carrier_ptr += 4; + y2 = _mm_load_si128((__m128i*)carrier_ptr); + + CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(x1, x2, realx, imagx) + CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy) + CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample) + + //Get very early values + y1 = _mm_load_si128((__m128i*)VE_code_ptr); + VE_code_ptr += 4; + y2 = _mm_load_si128((__m128i*)VE_code_ptr); + + CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) + + real_VE_code_acc = _mm_add_ps (real_VE_code_acc, real_output_ps); + imag_VE_code_acc = _mm_add_ps (imag_VE_code_acc, imag_output_ps); + + //Get early values + y1 = _mm_load_si128((__m128i*)E_code_ptr); + E_code_ptr += 4; + y2 = _mm_load_si128((__m128i*)E_code_ptr); + + CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) + + real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps); + imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps); + + //Get prompt values + y1 = _mm_load_si128((__m128i*)P_code_ptr); + P_code_ptr += 4; + y2 = _mm_load_si128((__m128i*)P_code_ptr); + + CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) + + real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps); + imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps); + + //Get late values + y1 = _mm_load_si128((__m128i*)L_code_ptr); + L_code_ptr += 4; + y2 = _mm_load_si128((__m128i*)L_code_ptr); + + CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) + + real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps); + imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps); + + //Get very late values + y1 = _mm_load_si128((__m128i*)VL_code_ptr); + VL_code_ptr += 4; + y2 = _mm_load_si128((__m128i*)VL_code_ptr); + + CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) + + real_VL_code_acc = _mm_add_ps (real_VL_code_acc, real_output_ps); + imag_VL_code_acc = _mm_add_ps (imag_VL_code_acc, imag_output_ps); + + input_ptr += 4; + carrier_ptr += 4; + VE_code_ptr += 4; + E_code_ptr += 4; + P_code_ptr += 4; + L_code_ptr += 4; + VL_code_ptr += 4; + } + + __VOLK_ATTR_ALIGNED(16) float real_VE_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_VE_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_VL_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_VL_dotProductVector[4]; + + _mm_store_ps((float*)real_VE_dotProductVector,real_VE_code_acc); // Store the results back into the dot product vector + _mm_store_ps((float*)imag_VE_dotProductVector,imag_VE_code_acc); // Store the results back into the dot product vector + _mm_store_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector + _mm_store_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector + _mm_store_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector + _mm_store_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector + _mm_store_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector + _mm_store_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector + _mm_store_ps((float*)real_VL_dotProductVector,real_VL_code_acc); // Store the results back into the dot product vector + _mm_store_ps((float*)imag_VL_dotProductVector,imag_VL_code_acc); // Store the results back into the dot product vector + + for (int i = 0; i<4; ++i) + { + VE_out_real += real_VE_dotProductVector[i]; + VE_out_imag += imag_VE_dotProductVector[i]; + E_out_real += real_E_dotProductVector[i]; + E_out_imag += imag_E_dotProductVector[i]; + P_out_real += real_P_dotProductVector[i]; + P_out_imag += imag_P_dotProductVector[i]; + L_out_real += real_L_dotProductVector[i]; + L_out_imag += imag_L_dotProductVector[i]; + VL_out_real += real_VL_dotProductVector[i]; + VL_out_imag += imag_VL_dotProductVector[i]; + } + *VE_out_ptr = lv_cmake(VE_out_real, VE_out_imag); + *E_out_ptr = lv_cmake(E_out_real, E_out_imag); + *P_out_ptr = lv_cmake(P_out_real, P_out_imag); + *L_out_ptr = lv_cmake(L_out_real, L_out_imag); + *VL_out_ptr = lv_cmake(VL_out_real, VL_out_imag); + } + + lv_16sc_t bb_signal_sample; + for(int i=0; i < num_points%8; ++i) + { + //Perform the carrier wipe-off + bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); + // Now get early, late, and prompt values for each + *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VE_code_ptr++)); + *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); + *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); + *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); + *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VL_code_ptr++)); + } + +} +#endif /* LV_HAVE_SSE4_1 */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Vate correlation + \param input The input signal input + \param carrier The carrier signal input + \param VE_code Very Early PRN code replica input + \param E_code Early PRN code replica input + \param P_code Prompt PRN code replica input + \param L_code Late PRN code replica input + \param VL_code Very Late PRN code replica input + \param VE_out Very Early correlation output + \param E_out Early correlation output + \param P_out Prompt correlation output + \param L_out Late correlation output + \param VL_out Very Late correlation output + \param num_points The number of complex values in vectors + */ +static inline void volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_a_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* VE_code, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, const lv_16sc_t* VL_code, unsigned int num_points) +{ + lv_16sc_t bb_signal_sample; + lv_16sc_t tmp1; + lv_16sc_t tmp2; + lv_16sc_t tmp3; + lv_16sc_t tmp4; + lv_16sc_t tmp5; + + bb_signal_sample = lv_cmake(0, 0); + + *VE_out = 0; + *E_out = 0; + *P_out = 0; + *L_out = 0; + *VL_out = 0; + // perform Early, Prompt and Late correlation + + for(int i=0; i < num_points; ++i) + { + //Perform the carrier wipe-off + bb_signal_sample = input[i] * carrier[i]; + + tmp1 = bb_signal_sample * VE_code[i]; + tmp2 = bb_signal_sample * E_code[i]; + tmp3 = bb_signal_sample * P_code[i]; + tmp4 = bb_signal_sample * L_code[i]; + tmp5 = bb_signal_sample * VL_code[i]; + + // Now get early, late, and prompt values for each + *VE_out += (lv_32fc_t)tmp1; + *E_out += (lv_32fc_t)tmp2; + *P_out += (lv_32fc_t)tmp3; + *L_out += (lv_32fc_t)tmp4; + *VL_out += (lv_32fc_t)tmp5; + } +} +#endif /* LV_HAVE_GENERIC */ +#endif /* INCLUDED_gnsssdr_volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_a_H */ diff --git a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_s32f_convert_16i.h b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_s32f_convert_16i.h new file mode 100644 index 000000000..cd83cef9d --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_s32f_convert_16i.h @@ -0,0 +1,302 @@ +#ifndef INCLUDED_volk_gnsssdr_32f_s32f_convert_16i_u_H +#define INCLUDED_volk_gnsssdr_32f_s32f_convert_16i_u_H + +#include +#include +#include + +#ifdef LV_HAVE_SSE2 +#include + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 16 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + \note Input buffer does NOT need to be properly aligned + */ +static inline void volk_gnsssdr_32f_s32f_convert_16i_u_sse2(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int eighthPoints = num_points / 8; + + const float* inputVectorPtr = (const float*)inputVector; + int16_t* outputVectorPtr = outputVector; + + float min_val = -32768; + float max_val = 32767; + float r; + + __m128 vScalar = _mm_set_ps1(scalar); + __m128 inputVal1, inputVal2; + __m128i intInputVal1, intInputVal2; + __m128 ret1, ret2; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); + + for(;number < eighthPoints; number++){ + inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; + inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; + + // Scale and clip + ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); + ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val); + + intInputVal1 = _mm_cvtps_epi32(ret1); + intInputVal2 = _mm_cvtps_epi32(ret2); + + intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); + + _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + r = inputVector[number] * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + outputVector[number] = (int16_t)rintf(r); + } +} +#endif /* LV_HAVE_SSE2 */ + +#ifdef LV_HAVE_SSE +#include + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 16 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + \note Input buffer does NOT need to be properly aligned + */ +static inline void volk_gnsssdr_32f_s32f_convert_16i_u_sse(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const float* inputVectorPtr = (const float*)inputVector; + int16_t* outputVectorPtr = outputVector; + + float min_val = -32768; + float max_val = 32767; + float r; + + __m128 vScalar = _mm_set_ps1(scalar); + __m128 ret; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); + + __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; + + for(;number < quarterPoints; number++){ + ret = _mm_loadu_ps(inputVectorPtr); + inputVectorPtr += 4; + + // Scale and clip + ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); + + _mm_store_ps(outputFloatBuffer, ret); + *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]); + *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]); + *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]); + *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]); + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + r = inputVector[number] * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + outputVector[number] = (int16_t)rintf(r); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 16 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + \note Input buffer does NOT need to be properly aligned + */ +static inline void volk_gnsssdr_32f_s32f_convert_16i_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + int16_t* outputVectorPtr = outputVector; + const float* inputVectorPtr = inputVector; + unsigned int number = 0; + float min_val = -32768; + float max_val = 32767; + float r; + + for(number = 0; number < num_points; number++){ + r = *inputVectorPtr++ * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + *outputVectorPtr++ = (int16_t)rintf(r); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_gnsssdr_32f_s32f_convert_16i_u_H */ +#ifndef INCLUDED_volk_gnsssdr_32f_s32f_convert_16i_a_H +#define INCLUDED_volk_gnsssdr_32f_s32f_convert_16i_a_H + +#include +#include +#include +#include + +#ifdef LV_HAVE_SSE2 +#include + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 16 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_gnsssdr_32f_s32f_convert_16i_a_sse2(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int eighthPoints = num_points / 8; + + const float* inputVectorPtr = (const float*)inputVector; + int16_t* outputVectorPtr = outputVector; + + float min_val = -32768; + float max_val = 32767; + float r; + + __m128 vScalar = _mm_set_ps1(scalar); + __m128 inputVal1, inputVal2; + __m128i intInputVal1, intInputVal2; + __m128 ret1, ret2; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); + + for(;number < eighthPoints; number++){ + inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; + inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; + + // Scale and clip + ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); + ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val); + + intInputVal1 = _mm_cvtps_epi32(ret1); + intInputVal2 = _mm_cvtps_epi32(ret2); + + intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); + + _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + r = inputVector[number] * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + outputVector[number] = (int16_t)rintf(r); + } +} +#endif /* LV_HAVE_SSE2 */ + +#ifdef LV_HAVE_SSE +#include + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 16 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_gnsssdr_32f_s32f_convert_16i_a_sse(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const float* inputVectorPtr = (const float*)inputVector; + int16_t* outputVectorPtr = outputVector; + + float min_val = -32768; + float max_val = 32767; + float r; + + __m128 vScalar = _mm_set_ps1(scalar); + __m128 ret; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); + + __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; + + for(;number < quarterPoints; number++){ + ret = _mm_load_ps(inputVectorPtr); + inputVectorPtr += 4; + + // Scale and clip + ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); + + _mm_store_ps(outputFloatBuffer, ret); + *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]); + *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]); + *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]); + *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]); + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + r = inputVector[number] * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + outputVector[number] = (int16_t)rintf(r); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 16 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_gnsssdr_32f_s32f_convert_16i_a_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + int16_t* outputVectorPtr = outputVector; + const float* inputVectorPtr = inputVector; + unsigned int number = 0; + float min_val = -32768; + float max_val = 32767; + float r; + + for(number = 0; number < num_points; number++){ + r = *inputVectorPtr++ * scalar; + if(r < min_val) + r = min_val; + else if(r > max_val) + r = max_val; + *outputVectorPtr++ = (int16_t)rintf(r); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_gnsssdr_32f_s32f_convert_16i_a_H */ diff --git a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_16ic.h b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_16ic.h new file mode 100644 index 000000000..74e5e736e --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_16ic.h @@ -0,0 +1,297 @@ +/*! + * \file volk_gnsssdr_32fc_convert_16ic.h + * \brief Volk protokernel: converts float32 complex values to 16 integer complex values taking care of overflow + * \authors
    + *
  • Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com + *
+ * + * Volk protokernel that implements an accumulator of char values + * + * ------------------------------------------------------------------------- + * + * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) + * + * GNSS-SDR is a software defined Global Navigation + * Satellite Systems receiver + * + * This file is part of GNSS-SDR. + * + * GNSS-SDR is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * at your option) any later version. + * + * GNSS-SDR is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNSS-SDR. If not, see . + * + * ------------------------------------------------------------------------- + */ + +#ifndef INCLUDED_volk_gnsssdr_32fc_convert_16ic_u_H +#define INCLUDED_volk_gnsssdr_32fc_convert_16ic_u_H + +#include +#include +#include + +#ifdef LV_HAVE_SSE2 +#include +/*! + \brief Converts a float vector of 64 bits (32 bits each part) into a 32 integer vector (16 bits each part) + \param inputVector The floating point input data buffer + \param outputVector The 16 bit output data buffer + \param num_points The number of data values to be converted + */ +static inline void volk_gnsssdr_32fc_convert_16ic_u_sse2(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){ + const unsigned int sse_iters = num_points/4; + + float* inputVectorPtr = (float*)inputVector; + int16_t* outputVectorPtr = (int16_t*)outputVector; + + float min_val = -32768; + float max_val = 32767; + + __m128 inputVal1, inputVal2; + __m128i intInputVal1, intInputVal2; + __m128 ret1, ret2; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); + + for(unsigned int i = 0;i < sse_iters; i++){ + inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; + inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; + + // Clip + ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val); + ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val); + + intInputVal1 = _mm_cvtps_epi32(ret1); + intInputVal2 = _mm_cvtps_epi32(ret2); + + intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); + + _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 8; + } + + for(unsigned int i = 0; i < (num_points%4)*2; i++){ + if(inputVectorPtr[i] > max_val) + inputVectorPtr[i] = max_val; + else if(inputVectorPtr[i] < min_val) + inputVectorPtr[i] = min_val; + outputVectorPtr[i] = (int16_t)rintf(inputVectorPtr[i]); + } +} +#endif /* LV_HAVE_SSE2 */ + +#ifdef LV_HAVE_SSE +#include +/*! + \brief Converts a float vector of 64 bits (32 bits each part) into a 32 integer vector (16 bits each part) + \param inputVector The floating point input data buffer + \param outputVector The 16 bit output data buffer + \param num_points The number of data values to be converted + */ +static inline void volk_gnsssdr_32fc_convert_16ic_u_sse(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){ + const unsigned int sse_iters = num_points/4; + + float* inputVectorPtr = (float*)inputVector; + int16_t* outputVectorPtr = (int16_t*)outputVector; + + float min_val = -32768; + float max_val = 32767; + + __m128 inputVal1, inputVal2; + __m128i intInputVal1, intInputVal2; + __m128 ret1, ret2; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); + + for(unsigned int i = 0;i < sse_iters; i++){ + inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; + inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; + + // Clip + ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val); + ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val); + + intInputVal1 = _mm_cvtps_epi32(ret1); + intInputVal2 = _mm_cvtps_epi32(ret2); + + intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); + + _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 8; + } + + for(unsigned int i = 0; i < (num_points%4)*2; i++){ + if(inputVectorPtr[i] > max_val) + inputVectorPtr[i] = max_val; + else if(inputVectorPtr[i] < min_val) + inputVectorPtr[i] = min_val; + outputVectorPtr[i] = (int16_t)rintf(inputVectorPtr[i]); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Converts a float vector of 64 bits (32 bits each part) into a 32 integer vector (16 bits each part) + \param inputVector The floating point input data buffer + \param outputVector The 16 bit output data buffer + \param num_points The number of data values to be converted + */ +static inline void volk_gnsssdr_32fc_convert_16ic_generic(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){ + float* inputVectorPtr = (float*)inputVector; + int16_t* outputVectorPtr = (int16_t*)outputVector; + float min_val = -32768; + float max_val = 32767; + + for(unsigned int i = 0; i < num_points*2; i++){ + if(inputVectorPtr[i] > max_val) + inputVectorPtr[i] = max_val; + else if(inputVectorPtr[i] < min_val) + inputVectorPtr[i] = min_val; + outputVectorPtr[i] = (int16_t)rintf(inputVectorPtr[i]); + } +} +#endif /* LV_HAVE_GENERIC */ +#endif /* INCLUDED_volk_gnsssdr_32fc_convert_16ic_u_H */ + + +#ifndef INCLUDED_volk_gnsssdr_32fc_convert_16ic_a_H +#define INCLUDED_volk_gnsssdr_32fc_convert_16ic_a_H + +#include +#include +#include +#include + +#ifdef LV_HAVE_SSE2 +#include +/*! + \brief Converts a float vector of 64 bits (32 bits each part) into a 32 integer vector (16 bits each part) + \param inputVector The floating point input data buffer + \param outputVector The 16 bit output data buffer + \param num_points The number of data values to be converted + */ +static inline void volk_gnsssdr_32fc_convert_16ic_a_sse2(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){ + const unsigned int sse_iters = num_points/4; + + float* inputVectorPtr = (float*)inputVector; + int16_t* outputVectorPtr = (int16_t*)outputVector; + + float min_val = -32768; + float max_val = 32767; + + __m128 inputVal1, inputVal2; + __m128i intInputVal1, intInputVal2; + __m128 ret1, ret2; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); + + for(unsigned int i = 0;i < sse_iters; i++){ + inputVal1 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; + inputVal2 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; + + // Clip + ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val); + ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val); + + intInputVal1 = _mm_cvtps_epi32(ret1); + intInputVal2 = _mm_cvtps_epi32(ret2); + + intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); + + _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 8; + } + + for(unsigned int i = 0; i < (num_points%4)*2; i++){ + if(inputVectorPtr[i] > max_val) + inputVectorPtr[i] = max_val; + else if(inputVectorPtr[i] < min_val) + inputVectorPtr[i] = min_val; + outputVectorPtr[i] = (int16_t)rintf(inputVectorPtr[i]); + } +} +#endif /* LV_HAVE_SSE2 */ + +#ifdef LV_HAVE_SSE +#include +/*! + \brief Converts a float vector of 64 bits (32 bits each part) into a 32 integer vector (16 bits each part) + \param inputVector The floating point input data buffer + \param outputVector The 16 bit output data buffer + \param num_points The number of data values to be converted + */ +static inline void volk_gnsssdr_32fc_convert_16ic_a_sse(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){ + const unsigned int sse_iters = num_points/4; + + float* inputVectorPtr = (float*)inputVector; + int16_t* outputVectorPtr = (int16_t*)outputVector; + + float min_val = -32768; + float max_val = 32767; + + __m128 inputVal1, inputVal2; + __m128i intInputVal1, intInputVal2; + __m128 ret1, ret2; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); + + for(unsigned int i = 0;i < sse_iters; i++){ + inputVal1 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; + inputVal2 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; + + // Clip + ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val); + ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val); + + intInputVal1 = _mm_cvtps_epi32(ret1); + intInputVal2 = _mm_cvtps_epi32(ret2); + + intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); + + _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 8; + } + + for(unsigned int i = 0; i < (num_points%4)*2; i++){ + if(inputVectorPtr[i] > max_val) + inputVectorPtr[i] = max_val; + else if(inputVectorPtr[i] < min_val) + inputVectorPtr[i] = min_val; + outputVectorPtr[i] = (int16_t)rintf(inputVectorPtr[i]); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Converts a float vector of 64 bits (32 bits each part) into a 32 integer vector (16 bits each part) + \param inputVector The floating point input data buffer + \param outputVector The 16 bit output data buffer + \param num_points The number of data values to be converted + */ +static inline void volk_gnsssdr_32fc_convert_16ic_a_generic(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){ + float* inputVectorPtr = (float*)inputVector; + int16_t* outputVectorPtr = (int16_t*)outputVector; + float min_val = -32768; + float max_val = 32767; + + for(unsigned int i = 0; i < num_points*2; i++){ + if(inputVectorPtr[i] > max_val) + inputVectorPtr[i] = max_val; + else if(inputVectorPtr[i] < min_val) + inputVectorPtr[i] = min_val; + outputVectorPtr[i] = (int16_t)rintf(inputVectorPtr[i]); + } +} +#endif /* LV_HAVE_GENERIC */ +#endif /* INCLUDED_volk_gnsssdr_32fc_convert_16ic_a_H */ diff --git a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5.h b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5.h index 378cea204..7b66b6491 100644 --- a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5.h +++ b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5.h @@ -57,13 +57,180 @@ #include #include +#ifdef LV_HAVE_AVX +#include /*! - * TODO: Code the SSE4 version and benchmark it + \brief Performs the carrier wipe-off mixing and the VE, Early, Prompt, Late and VL correlation + \param input The input signal input + \param carrier The carrier signal input + \param VE_code VE PRN code replica input + \param E_code Early PRN code replica input + \param P_code Early PRN code replica input + \param L_code Early PRN code replica input + \param VL_code VL PRN code replica input + \param VE_out VE correlation output + \param E_out Early correlation output + \param P_out Early correlation output + \param L_out Early correlation output + \param VL_out VL correlation output + \param num_points The number of complex values in vectors */ +static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_u_avx(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* VE_code, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, const lv_32fc_t* VL_code, unsigned int num_points) +{ + unsigned int number = 0; + const unsigned int halfPoints = num_points / 4; + + lv_32fc_t dotProduct_VE; + lv_32fc_t dotProduct_E; + lv_32fc_t dotProduct_P; + lv_32fc_t dotProduct_L; + lv_32fc_t dotProduct_VL; + + // Aux vars + __m256 x, y, yl, yh, z, tmp1, tmp2, z_VE, z_E, z_P, z_L, z_VL; + __m256 bb_signal_sample, bb_signal_sample_shuffled; + + z_VE = _mm256_setzero_ps(); + z_E = _mm256_setzero_ps(); + z_P = _mm256_setzero_ps(); + z_L = _mm256_setzero_ps(); + z_VL = _mm256_setzero_ps(); + + //input and output vectors + const lv_32fc_t* _input = input; + const lv_32fc_t* _carrier = carrier; + const lv_32fc_t* _VE_code = VE_code; + const lv_32fc_t* _E_code = E_code; + const lv_32fc_t* _P_code = P_code; + const lv_32fc_t* _L_code = L_code; + const lv_32fc_t* _VL_code = VL_code; + + for(;number < halfPoints; number++) + { + // carrier wipe-off (vector point-to-point product) + x = _mm256_loadu_ps((float*)_input); // Load the ar + ai, br + bi as ar,ai,br,bi + y = _mm256_loadu_ps((float*)_carrier); // Load the cr + ci, dr + di as cr,ci,dr,di + + yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + + x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + + tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + bb_signal_sample = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + bb_signal_sample_shuffled = _mm256_shuffle_ps(bb_signal_sample,bb_signal_sample,0xB1); // Re-arrange bb_signal_sample to be ai,ar,bi,br + + // correlation VE,E,P,L,VL (5x vector scalar product) + // VE + y = _mm256_loadu_ps((float*)_VE_code); // Load the cr + ci, dr + di as cr,ci,dr,di + + yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + z_VE = _mm256_add_ps(z_VE, z); // Add the complex multiplication results together + + // Early + y = _mm256_loadu_ps((float*)_E_code); // Load the cr + ci, dr + di as cr,ci,dr,di + + yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + z_E = _mm256_add_ps(z_E, z); // Add the complex multiplication results together + + // Prompt + y = _mm256_loadu_ps((float*)_P_code); // Load the cr + ci, dr + di as cr,ci,dr,di + + yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + z_P = _mm256_add_ps(z_P, z); // Add the complex multiplication results together + + // Late + y = _mm256_loadu_ps((float*)_L_code); // Load the cr + ci, dr + di as cr,ci,dr,di + + yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + z_L = _mm256_add_ps(z_L, z); // Add the complex multiplication results together + + // VL + y = _mm256_loadu_ps((float*)_VL_code); // Load the cr + ci, dr + di as cr,ci,dr,di + + yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + z_VL = _mm256_add_ps(z_VL, z); // Add the complex multiplication results together + + /*pointer increment*/ + _carrier += 4; + _input += 4; + _VE_code += 4; + _E_code += 4; + _P_code += 4; + _L_code += 4; + _VL_code += 4; + } + + __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_VE[4]; + __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_E[4]; + __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_P[4]; + __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_L[4]; + __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_VL[4]; + + _mm256_storeu_ps((float*)dotProductVector_VE,z_VE); // Store the results back into the dot product vector + _mm256_storeu_ps((float*)dotProductVector_E,z_E); // Store the results back into the dot product vector + _mm256_storeu_ps((float*)dotProductVector_P,z_P); // Store the results back into the dot product vector + _mm256_storeu_ps((float*)dotProductVector_L,z_L); // Store the results back into the dot product vector + _mm256_storeu_ps((float*)dotProductVector_VL,z_VL); // Store the results back into the dot product vector + + dotProduct_VE = ( dotProductVector_VE[0] + dotProductVector_VE[1] + dotProductVector_VE[2] + dotProductVector_VE[3] ); + dotProduct_E = ( dotProductVector_E[0] + dotProductVector_E[1] + dotProductVector_E[2] + dotProductVector_E[3] ); + dotProduct_P = ( dotProductVector_P[0] + dotProductVector_P[1] + dotProductVector_P[2] + dotProductVector_P[3] ); + dotProduct_L = ( dotProductVector_L[0] + dotProductVector_L[1] + dotProductVector_L[2] + dotProductVector_L[3] ); + dotProduct_VL = ( dotProductVector_VL[0] + dotProductVector_VL[1] + dotProductVector_VL[2] + dotProductVector_VL[3] ); + + for (int i = 0; i<(num_points % 4); ++i) + { + dotProduct_VE += (*_input) * (*_VE_code++) * (*_carrier); + dotProduct_E += (*_input) * (*_E_code++) * (*_carrier); + dotProduct_P += (*_input) * (*_P_code++) * (*_carrier); + dotProduct_L += (*_input) * (*_L_code++) * (*_carrier); + dotProduct_VL += (*_input++) * (*_VL_code++) * (*_carrier++); + } + + *VE_out = dotProduct_VE; + *E_out = dotProduct_E; + *P_out = dotProduct_P; + *L_out = dotProduct_L; + *VL_out = dotProduct_VL; +} +#endif /* LV_HAVE_AVX */ + #ifdef LV_HAVE_SSE3 #include - - /*! \brief Performs the carrier wipe-off mixing and the VE, Early, Prompt, Late and VL correlation \param input The input signal input @@ -86,15 +253,10 @@ static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_u_sse3(lv_32fc_t* V const unsigned int halfPoints = num_points / 2; lv_32fc_t dotProduct_VE; - memset(&dotProduct_VE, 0x0, 2*sizeof(float)); lv_32fc_t dotProduct_E; - memset(&dotProduct_E, 0x0, 2*sizeof(float)); lv_32fc_t dotProduct_P; - memset(&dotProduct_P, 0x0, 2*sizeof(float)); lv_32fc_t dotProduct_L; - memset(&dotProduct_L, 0x0, 2*sizeof(float)); lv_32fc_t dotProduct_VL; - memset(&dotProduct_VL, 0x0, 2*sizeof(float)); // Aux vars __m128 x, y, yl, yh, z, tmp1, tmp2, z_VE, z_E, z_P, z_L, z_VL; @@ -217,11 +379,11 @@ static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_u_sse3(lv_32fc_t* V _mm_storeu_ps((float*)dotProductVector_L,z_L); // Store the results back into the dot product vector _mm_storeu_ps((float*)dotProductVector_VL,z_VL); // Store the results back into the dot product vector - dotProduct_VE += ( dotProductVector_VE[0] + dotProductVector_VE[1] ); - dotProduct_E += ( dotProductVector_E[0] + dotProductVector_E[1] ); - dotProduct_P += ( dotProductVector_P[0] + dotProductVector_P[1] ); - dotProduct_L += ( dotProductVector_L[0] + dotProductVector_L[1] ); - dotProduct_VL += ( dotProductVector_VL[0] + dotProductVector_VL[1] ); + dotProduct_VE = ( dotProductVector_VE[0] + dotProductVector_VE[1] ); + dotProduct_E = ( dotProductVector_E[0] + dotProductVector_E[1] ); + dotProduct_P = ( dotProductVector_P[0] + dotProductVector_P[1] ); + dotProduct_L = ( dotProductVector_L[0] + dotProductVector_L[1] ); + dotProduct_VL = ( dotProductVector_VL[0] + dotProductVector_VL[1] ); if((num_points % 2) != 0) { @@ -238,7 +400,6 @@ static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_u_sse3(lv_32fc_t* V *L_out = dotProduct_L; *VL_out = dotProduct_VL; } - #endif /* LV_HAVE_SSE3 */ #ifdef LV_HAVE_GENERIC @@ -297,6 +458,178 @@ static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_generic(lv_32fc_t* #include #include +#ifdef LV_HAVE_AVX +#include +/*! + \brief Performs the carrier wipe-off mixing and the VE, Early, Prompt, Late and VL correlation + \param input The input signal input + \param carrier The carrier signal input + \param VE_code VE PRN code replica input + \param E_code Early PRN code replica input + \param P_code Early PRN code replica input + \param L_code Early PRN code replica input + \param VL_code VL PRN code replica input + \param VE_out VE correlation output + \param E_out Early correlation output + \param P_out Early correlation output + \param L_out Early correlation output + \param VL_out VL correlation output + \param num_points The number of complex values in vectors + */ +static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_a_avx(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* VE_code, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, const lv_32fc_t* VL_code, unsigned int num_points) +{ + unsigned int number = 0; + const unsigned int halfPoints = num_points / 4; + + lv_32fc_t dotProduct_VE; + lv_32fc_t dotProduct_E; + lv_32fc_t dotProduct_P; + lv_32fc_t dotProduct_L; + lv_32fc_t dotProduct_VL; + + // Aux vars + __m256 x, y, yl, yh, z, tmp1, tmp2, z_VE, z_E, z_P, z_L, z_VL; + __m256 bb_signal_sample, bb_signal_sample_shuffled; + + z_VE = _mm256_setzero_ps(); + z_E = _mm256_setzero_ps(); + z_P = _mm256_setzero_ps(); + z_L = _mm256_setzero_ps(); + z_VL = _mm256_setzero_ps(); + + //input and output vectors + const lv_32fc_t* _input = input; + const lv_32fc_t* _carrier = carrier; + const lv_32fc_t* _VE_code = VE_code; + const lv_32fc_t* _E_code = E_code; + const lv_32fc_t* _P_code = P_code; + const lv_32fc_t* _L_code = L_code; + const lv_32fc_t* _VL_code = VL_code; + + for(;number < halfPoints; number++) + { + // carrier wipe-off (vector point-to-point product) + x = _mm256_load_ps((float*)_input); // Load the ar + ai, br + bi as ar,ai,br,bi + y = _mm256_load_ps((float*)_carrier); // Load the cr + ci, dr + di as cr,ci,dr,di + + yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + + x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + + tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + bb_signal_sample = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + bb_signal_sample_shuffled = _mm256_shuffle_ps(bb_signal_sample,bb_signal_sample,0xB1); // Re-arrange bb_signal_sample to be ai,ar,bi,br + + // correlation VE,E,P,L,VL (5x vector scalar product) + // VE + y = _mm256_load_ps((float*)_VE_code); // Load the cr + ci, dr + di as cr,ci,dr,di + + yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + z_VE = _mm256_add_ps(z_VE, z); // Add the complex multiplication results together + + // Early + y = _mm256_load_ps((float*)_E_code); // Load the cr + ci, dr + di as cr,ci,dr,di + + yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + z_E = _mm256_add_ps(z_E, z); // Add the complex multiplication results together + + // Prompt + y = _mm256_load_ps((float*)_P_code); // Load the cr + ci, dr + di as cr,ci,dr,di + + yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + z_P = _mm256_add_ps(z_P, z); // Add the complex multiplication results together + + // Late + y = _mm256_load_ps((float*)_L_code); // Load the cr + ci, dr + di as cr,ci,dr,di + + yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + z_L = _mm256_add_ps(z_L, z); // Add the complex multiplication results together + + // VL + y = _mm256_load_ps((float*)_VL_code); // Load the cr + ci, dr + di as cr,ci,dr,di + + yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + z_VL = _mm256_add_ps(z_VL, z); // Add the complex multiplication results together + + /*pointer increment*/ + _carrier += 4; + _input += 4; + _VE_code += 4; + _E_code += 4; + _P_code += 4; + _L_code += 4; + _VL_code += 4; + } + + __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_VE[4]; + __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_E[4]; + __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_P[4]; + __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_L[4]; + __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_VL[4]; + + _mm256_store_ps((float*)dotProductVector_VE,z_VE); // Store the results back into the dot product vector + _mm256_store_ps((float*)dotProductVector_E,z_E); // Store the results back into the dot product vector + _mm256_store_ps((float*)dotProductVector_P,z_P); // Store the results back into the dot product vector + _mm256_store_ps((float*)dotProductVector_L,z_L); // Store the results back into the dot product vector + _mm256_store_ps((float*)dotProductVector_VL,z_VL); // Store the results back into the dot product vector + + dotProduct_VE = ( dotProductVector_VE[0] + dotProductVector_VE[1] + dotProductVector_VE[2] + dotProductVector_VE[3] ); + dotProduct_E = ( dotProductVector_E[0] + dotProductVector_E[1] + dotProductVector_E[2] + dotProductVector_E[3] ); + dotProduct_P = ( dotProductVector_P[0] + dotProductVector_P[1] + dotProductVector_P[2] + dotProductVector_P[3] ); + dotProduct_L = ( dotProductVector_L[0] + dotProductVector_L[1] + dotProductVector_L[2] + dotProductVector_L[3] ); + dotProduct_VL = ( dotProductVector_VL[0] + dotProductVector_VL[1] + dotProductVector_VL[2] + dotProductVector_VL[3] ); + + for (int i = 0; i<(num_points % 4); ++i) + { + dotProduct_VE += (*_input) * (*_VE_code++) * (*_carrier); + dotProduct_E += (*_input) * (*_E_code++) * (*_carrier); + dotProduct_P += (*_input) * (*_P_code++) * (*_carrier); + dotProduct_L += (*_input) * (*_L_code++) * (*_carrier); + dotProduct_VL += (*_input++) * (*_VL_code++) * (*_carrier++); + } + + *VE_out = dotProduct_VE; + *E_out = dotProduct_E; + *P_out = dotProduct_P; + *L_out = dotProduct_L; + *VL_out = dotProduct_VL; +} +#endif /* LV_HAVE_AVX */ + #ifdef LV_HAVE_SSE3 #include /*! @@ -321,15 +654,10 @@ static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_a_sse3(lv_32fc_t* V const unsigned int halfPoints = num_points / 2; lv_32fc_t dotProduct_VE; - memset(&dotProduct_VE, 0x0, 2*sizeof(float)); lv_32fc_t dotProduct_E; - memset(&dotProduct_E, 0x0, 2*sizeof(float)); lv_32fc_t dotProduct_P; - memset(&dotProduct_P, 0x0, 2*sizeof(float)); lv_32fc_t dotProduct_L; - memset(&dotProduct_L, 0x0, 2*sizeof(float)); lv_32fc_t dotProduct_VL; - memset(&dotProduct_VL, 0x0, 2*sizeof(float)); // Aux vars __m128 x, y, yl, yh, z, tmp1, tmp2, z_VE, z_E, z_P, z_L, z_VL; @@ -452,11 +780,11 @@ static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_a_sse3(lv_32fc_t* V _mm_store_ps((float*)dotProductVector_L,z_L); // Store the results back into the dot product vector _mm_store_ps((float*)dotProductVector_VL,z_VL); // Store the results back into the dot product vector - dotProduct_VE += ( dotProductVector_VE[0] + dotProductVector_VE[1] ); - dotProduct_E += ( dotProductVector_E[0] + dotProductVector_E[1] ); - dotProduct_P += ( dotProductVector_P[0] + dotProductVector_P[1] ); - dotProduct_L += ( dotProductVector_L[0] + dotProductVector_L[1] ); - dotProduct_VL += ( dotProductVector_VL[0] + dotProductVector_VL[1] ); + dotProduct_VE = ( dotProductVector_VE[0] + dotProductVector_VE[1] ); + dotProduct_E = ( dotProductVector_E[0] + dotProductVector_E[1] ); + dotProduct_P = ( dotProductVector_P[0] + dotProductVector_P[1] ); + dotProduct_L = ( dotProductVector_L[0] + dotProductVector_L[1] ); + dotProduct_VL = ( dotProductVector_VL[0] + dotProductVector_VL[1] ); if((num_points % 2) != 0) { @@ -472,9 +800,7 @@ static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_a_sse3(lv_32fc_t* V *P_out = dotProduct_P; *L_out = dotProduct_L; *VL_out = dotProduct_VL; - } - #endif /* LV_HAVE_SSE3 */ #ifdef LV_HAVE_GENERIC @@ -518,7 +844,5 @@ static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_a_generic(lv_32fc_t *VL_out += bb_signal_sample * VL_code[i]; } } - #endif /* LV_HAVE_GENERIC */ - #endif /* INCLUDED_gnsssdr_volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_a_H */ diff --git a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_64f_accumulator_64f.h b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_64f_accumulator_64f.h new file mode 100644 index 000000000..6acb25390 --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_64f_accumulator_64f.h @@ -0,0 +1,243 @@ +/*! + * \file volk_gnsssdr_64f_accumulator_64f.h + * \brief Volk protokernel: 64 bits (double) scalar accumulator + * \authors
    + *
  • Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com + *
+ * + * Volk protokernel that implements an accumulator of char values + * + * ------------------------------------------------------------------------- + * + * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) + * + * GNSS-SDR is a software defined Global Navigation + * Satellite Systems receiver + * + * This file is part of GNSS-SDR. + * + * GNSS-SDR is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * at your option) any later version. + * + * GNSS-SDR is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNSS-SDR. If not, see . + * + * ------------------------------------------------------------------------- + */ + +#ifndef INCLUDED_volk_gnsssdr_64f_accumulator_64f_u_H +#define INCLUDED_volk_gnsssdr_64f_accumulator_64f_u_H + +#include +#include +#include + +#ifdef LV_HAVE_AVX +#include +/*! + \brief Accumulates the values in the input buffer + \param result The accumulated result + \param inputBuffer The buffer of data to be accumulated + \param num_points The number of values in inputBuffer to be accumulated + */ +static inline void volk_gnsssdr_64f_accumulator_64f_u_avx(double* result,const double* inputBuffer, unsigned int num_points){ + double returnValue = 0; + const unsigned int sse_iters = num_points / 4; + + const double* aPtr = inputBuffer; + + __VOLK_ATTR_ALIGNED(32) double tempBuffer[4]; + __m256d accumulator = _mm256_setzero_pd(); + __m256d aVal = _mm256_setzero_pd(); + + for(unsigned int number = 0; number < sse_iters; number++) + { + aVal = _mm256_loadu_pd(aPtr); + accumulator = _mm256_add_pd(accumulator, aVal); + aPtr += 4; + } + + _mm256_storeu_pd((double*)tempBuffer,accumulator); + + for(int i = 0; i<4; ++i){ + returnValue += tempBuffer[i]; + } + + for(int i = 0; i<(num_points % 4); ++i){ + returnValue += (*aPtr++); + } + + *result = returnValue; +} +#endif /* LV_HAVE_AVX */ + +#ifdef LV_HAVE_SSE3 +#include +/*! + \brief Accumulates the values in the input buffer + \param result The accumulated result + \param inputBuffer The buffer of data to be accumulated + \param num_points The number of values in inputBuffer to be accumulated + */ +static inline void volk_gnsssdr_64f_accumulator_64f_u_sse3(double* result,const double* inputBuffer, unsigned int num_points){ + double returnValue = 0; + const unsigned int sse_iters = num_points / 2; + + const double* aPtr = inputBuffer; + + __VOLK_ATTR_ALIGNED(16) double tempBuffer[2]; + __m128d accumulator = _mm_setzero_pd(); + __m128d aVal = _mm_setzero_pd(); + + for(unsigned int number = 0; number < sse_iters; number++) + { + aVal = _mm_loadu_pd(aPtr); + accumulator = _mm_add_pd(accumulator, aVal); + aPtr += 2; + } + + _mm_storeu_pd((double*)tempBuffer,accumulator); + + for(int i = 0; i<2; ++i){ + returnValue += tempBuffer[i]; + } + + for(int i = 0; i<(num_points % 2); ++i){ + returnValue += (*aPtr++); + } + + *result = returnValue; +} +#endif /* LV_HAVE_SSE3 */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Accumulates the values in the input buffer + \param result The accumulated result + \param inputBuffer The buffer of data to be accumulated + \param num_points The number of values in inputBuffer to be accumulated + */ +static inline void volk_gnsssdr_64f_accumulator_64f_generic(double* result,const double* inputBuffer, unsigned int num_points){ + const double* aPtr = inputBuffer; + double returnValue = 0; + + for(unsigned int number = 0;number < num_points; number++){ + returnValue += (*aPtr++); + } + *result = returnValue; +} +#endif /* LV_HAVE_GENERIC */ + +#endif /* INCLUDED_volk_gnsssdr_64f_accumulator_64f_u_H */ + + +#ifndef INCLUDED_volk_gnsssdr_64f_accumulator_64f_a_H +#define INCLUDED_volk_gnsssdr_64f_accumulator_64f_a_H + +#include +#include +#include + +#ifdef LV_HAVE_AVX +#include +/*! + \brief Accumulates the values in the input buffer + \param result The accumulated result + \param inputBuffer The buffer of data to be accumulated + \param num_points The number of values in inputBuffer to be accumulated + */ +static inline void volk_gnsssdr_64f_accumulator_64f_a_avx(double* result,const double* inputBuffer, unsigned int num_points){ + double returnValue = 0; + const unsigned int sse_iters = num_points / 4; + + const double* aPtr = inputBuffer; + + __VOLK_ATTR_ALIGNED(32) double tempBuffer[4]; + __m256d accumulator = _mm256_setzero_pd(); + __m256d aVal = _mm256_setzero_pd(); + + for(unsigned int number = 0; number < sse_iters; number++) + { + aVal = _mm256_load_pd(aPtr); + accumulator = _mm256_add_pd(accumulator, aVal); + aPtr += 4; + } + + _mm256_store_pd((double*)tempBuffer,accumulator); + + for(int i = 0; i<4; ++i){ + returnValue += tempBuffer[i]; + } + + for(int i = 0; i<(num_points % 4); ++i){ + returnValue += (*aPtr++); + } + + *result = returnValue; +} +#endif /* LV_HAVE_AVX */ + +#ifdef LV_HAVE_SSE3 +#include +/*! + \brief Accumulates the values in the input buffer + \param result The accumulated result + \param inputBuffer The buffer of data to be accumulated + \param num_points The number of values in inputBuffer to be accumulated + */ +static inline void volk_gnsssdr_64f_accumulator_64f_a_sse3(double* result,const double* inputBuffer, unsigned int num_points){ + double returnValue = 0; + const unsigned int sse_iters = num_points / 2; + + const double* aPtr = inputBuffer; + + __VOLK_ATTR_ALIGNED(16) double tempBuffer[2]; + __m128d accumulator = _mm_setzero_pd(); + __m128d aVal = _mm_setzero_pd(); + + for(unsigned int number = 0; number < sse_iters; number++) + { + aVal = _mm_load_pd(aPtr); + accumulator = _mm_add_pd(accumulator, aVal); + aPtr += 2; + } + + _mm_store_pd((double*)tempBuffer,accumulator); + + for(int i = 0; i<2; ++i){ + returnValue += tempBuffer[i]; + } + + for(int i = 0; i<(num_points % 2); ++i){ + returnValue += (*aPtr++); + } + + *result = returnValue; +} +#endif /* LV_HAVE_SSE3 */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Accumulates the values in the input buffer + \param result The accumulated result + \param inputBuffer The buffer of data to be accumulated + \param num_points The number of values in inputBuffer to be accumulated + */ +static inline void volk_gnsssdr_64f_accumulator_64f_a_generic(double* result,const double* inputBuffer, unsigned int num_points){ + const double* aPtr = inputBuffer; + double returnValue = 0; + + for(unsigned int number = 0;number < num_points; number++){ + returnValue += (*aPtr++); + } + *result = returnValue; +} +#endif /* LV_HAVE_GENERIC */ +#endif /* INCLUDED_volk_gnsssdr_64f_accumulator_64f_a_H */ \ No newline at end of file diff --git a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3.h b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3.h index fcbebdde3..61579e2bb 100644 --- a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3.h +++ b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3.h @@ -1,22 +1,23 @@ /*! * \file volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3.h - * \brief Volk protokernel: performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation with 16 bits vectors + * \brief Volk protokernel: performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation with 16 bits vectors, and accumulates the results into float32. * \authors
    *
  • Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com *
* - * Volk protokernel that performs the carrier wipe-off mixing and the - * Early, Prompt, and Late correlation with 16 bits vectors (8 bits the - * real part and 8 bits the imaginary part): - * - The carrier wipe-off is done by multiplying the input signal by the - * carrier (multiplication of 16 bits vectors) It returns the input + * Volk protokernel that performs the carrier wipe-off mixing and the + * Early, Prompt, and Late correlation with 16 bits vectors (8 bits the + * real part and 8 bits the imaginary part), and accumulates the result + * in 32 bits single point values, returning float32 values: + * - The carrier wipe-off is done by multiplying the input signal by the + * carrier (multiplication of 16 bits vectors) It returns the input * signal in base band (BB) * - Early values are calculated by multiplying the input signal in BB by the - * early code (multiplication of 16 bits vectors), accumulating the results + * early code (multiplication of 16 bits vectors), accumulating the results into float32 values * - Prompt values are calculated by multiplying the input signal in BB by the - * prompt code (multiplication of 16 bits vectors), accumulating the results + * prompt code (multiplication of 16 bits vectors), accumulating the results into float32 values * - Late values are calculated by multiplying the input signal in BB by the - * late code (multiplication of 16 bits vectors), accumulating the results + * late code (multiplication of 16 bits vectors), accumulating the results into float32 values * * ------------------------------------------------------------------------- * @@ -54,34 +55,38 @@ #ifdef LV_HAVE_SSE4_1 #include "smmintrin.h" - /*! - \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation - \param input The input signal input - \param carrier The carrier signal input - \param E_code Early PRN code replica input - \param P_code Early PRN code replica input - \param L_code Early PRN code replica input - \param E_out Early correlation output - \param P_out Early correlation output - \param L_out Early correlation output - \param num_points The number of complex values in vectors - */ +/*! + \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation + \param input The input signal input + \param carrier The carrier signal input + \param E_code Early PRN code replica input + \param P_code Early PRN code replica input + \param L_code Early PRN code replica input + \param E_out Early correlation output + \param P_out Early correlation output + \param L_out Early correlation output + \param num_points The number of complex values in vectors + */ static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_u_sse4_1(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points) { const unsigned int sse_iters = num_points / 8; - __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample, real_E_code_acc, imag_E_code_acc, real_L_code_acc, imag_L_code_acc, real_P_code_acc, imag_P_code_acc; + __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample; __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output; - + + __m128 E_code_acc, P_code_acc, L_code_acc; + __m128i output_i_1, output_i_2, output_i_3, output_i_4; + __m128 output_ps_1, output_ps_2, output_ps_3, output_ps_4; + const lv_8sc_t* input_ptr = input; const lv_8sc_t* carrier_ptr = carrier; const lv_8sc_t* E_code_ptr = E_code; - lv_8sc_t* E_out_ptr = E_out; + lv_32fc_t* E_out_ptr = E_out; const lv_8sc_t* L_code_ptr = L_code; - lv_8sc_t* L_out_ptr = L_out; + lv_32fc_t* L_out_ptr = L_out; const lv_8sc_t* P_code_ptr = P_code; - lv_8sc_t* P_out_ptr = P_out; + lv_32fc_t* P_out_ptr = P_out; *E_out_ptr = 0; *P_out_ptr = 0; @@ -89,13 +94,10 @@ static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_u_sse4_1(lv_32fc_t* E mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); - real_E_code_acc = _mm_setzero_si128(); - imag_E_code_acc = _mm_setzero_si128(); - real_L_code_acc = _mm_setzero_si128(); - imag_L_code_acc = _mm_setzero_si128(); - real_P_code_acc = _mm_setzero_si128(); - imag_P_code_acc = _mm_setzero_si128(); - + E_code_acc = _mm_setzero_ps(); + L_code_acc = _mm_setzero_ps(); + P_code_acc = _mm_setzero_ps(); + if (sse_iters>0) { for(int number = 0;number < sse_iters; number++){ @@ -135,26 +137,25 @@ static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_u_sse4_1(lv_32fc_t* E real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); - real_E_code_acc = _mm_add_epi16 (real_E_code_acc, real_output); - imag_E_code_acc = _mm_add_epi16 (imag_E_code_acc, imag_output); + imag_output = _mm_slli_si128 (imag_output, 1); + output = _mm_blendv_epi8 (imag_output, real_output, mult1); - //Get late values - y = _mm_lddqu_si128((__m128i*)L_code_ptr); + output_i_1 = _mm_cvtepi8_epi32(output); + output_ps_1 = _mm_cvtepi32_ps(output_i_1); + output = _mm_srli_si128 (output, 4); + output_i_2 = _mm_cvtepi8_epi32(output); + output_ps_2 = _mm_cvtepi32_ps(output_i_2); + output = _mm_srli_si128 (output, 4); + output_i_3 = _mm_cvtepi8_epi32(output); + output_ps_3 = _mm_cvtepi32_ps(output_i_3); + output = _mm_srli_si128 (output, 4); + output_i_4 = _mm_cvtepi8_epi32(output); + output_ps_4 = _mm_cvtepi32_ps(output_i_4); - imagy = _mm_srli_si128 (y, 1); - imagy = _mm_and_si128 (imagy, mult1); - realy = _mm_and_si128 (y, mult1); - - realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); - imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); - realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); - imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); - - real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); - imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); - - real_L_code_acc = _mm_add_epi16 (real_L_code_acc, real_output); - imag_L_code_acc = _mm_add_epi16 (imag_L_code_acc, imag_output); + E_code_acc = _mm_add_ps (E_code_acc, output_ps_1); + E_code_acc = _mm_add_ps (E_code_acc, output_ps_2); + E_code_acc = _mm_add_ps (E_code_acc, output_ps_3); + E_code_acc = _mm_add_ps (E_code_acc, output_ps_4); //Get prompt values y = _mm_lddqu_si128((__m128i*)P_code_ptr); @@ -171,37 +172,81 @@ static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_u_sse4_1(lv_32fc_t* E real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); - real_P_code_acc = _mm_add_epi16 (real_P_code_acc, real_output); - imag_P_code_acc = _mm_add_epi16 (imag_P_code_acc, imag_output); + imag_output = _mm_slli_si128 (imag_output, 1); + output = _mm_blendv_epi8 (imag_output, real_output, mult1); + + output_i_1 = _mm_cvtepi8_epi32(output); + output_ps_1 = _mm_cvtepi32_ps(output_i_1); + output = _mm_srli_si128 (output, 4); + output_i_2 = _mm_cvtepi8_epi32(output); + output_ps_2 = _mm_cvtepi32_ps(output_i_2); + output = _mm_srli_si128 (output, 4); + output_i_3 = _mm_cvtepi8_epi32(output); + output_ps_3 = _mm_cvtepi32_ps(output_i_3); + output = _mm_srli_si128 (output, 4); + output_i_4 = _mm_cvtepi8_epi32(output); + output_ps_4 = _mm_cvtepi32_ps(output_i_4); + + P_code_acc = _mm_add_ps (P_code_acc, output_ps_1); + P_code_acc = _mm_add_ps (P_code_acc, output_ps_2); + P_code_acc = _mm_add_ps (P_code_acc, output_ps_3); + P_code_acc = _mm_add_ps (P_code_acc, output_ps_4); + + //Get late values + y = _mm_lddqu_si128((__m128i*)L_code_ptr); + + imagy = _mm_srli_si128 (y, 1); + imagy = _mm_and_si128 (imagy, mult1); + realy = _mm_and_si128 (y, mult1); + + realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); + realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); + + real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + imag_output = _mm_slli_si128 (imag_output, 1); + output = _mm_blendv_epi8 (imag_output, real_output, mult1); + + output_i_1 = _mm_cvtepi8_epi32(output); + output_ps_1 = _mm_cvtepi32_ps(output_i_1); + output = _mm_srli_si128 (output, 4); + output_i_2 = _mm_cvtepi8_epi32(output); + output_ps_2 = _mm_cvtepi32_ps(output_i_2); + output = _mm_srli_si128 (output, 4); + output_i_3 = _mm_cvtepi8_epi32(output); + output_ps_3 = _mm_cvtepi32_ps(output_i_3); + output = _mm_srli_si128 (output, 4); + output_i_4 = _mm_cvtepi8_epi32(output); + output_ps_4 = _mm_cvtepi32_ps(output_i_4); + + L_code_acc = _mm_add_ps (L_code_acc, output_ps_1); + L_code_acc = _mm_add_ps (L_code_acc, output_ps_2); + L_code_acc = _mm_add_ps (L_code_acc, output_ps_3); + L_code_acc = _mm_add_ps (L_code_acc, output_ps_4); input_ptr += 8; carrier_ptr += 8; E_code_ptr += 8; - L_code_ptr += 8; P_code_ptr += 8; + L_code_ptr += 8; } - __VOLK_ATTR_ALIGNED(16) lv_16sc_t E_dotProductVector[8]; - __VOLK_ATTR_ALIGNED(16) lv_16sc_t L_dotProductVector[8]; - __VOLK_ATTR_ALIGNED(16) lv_16sc_t P_dotProductVector[8]; + __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2]; + __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2]; + __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2]; - imag_E_code_acc = _mm_slli_si128 (imag_E_code_acc, 1); - output = _mm_blendv_epi8 (imag_E_code_acc, real_E_code_acc, mult1); - _mm_storeu_si128((__m128i*)E_dotProductVector, output); + _mm_storeu_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector - imag_L_code_acc = _mm_slli_si128 (imag_L_code_acc, 1); - output = _mm_blendv_epi8 (imag_L_code_acc, real_L_code_acc, mult1); - _mm_storeu_si128((__m128i*)L_dotProductVector, output); - - imag_P_code_acc = _mm_slli_si128 (imag_P_code_acc, 1); - output = _mm_blendv_epi8 (imag_P_code_acc, real_P_code_acc, mult1); - _mm_storeu_si128((__m128i*)P_dotProductVector, output); - - for (int i = 0; i<8; ++i) + for (int i = 0; i<2; ++i) { *E_out_ptr += E_dotProductVector[i]; - *L_out_ptr += L_dotProductVector[i]; *P_out_ptr += P_dotProductVector[i]; + *L_out_ptr += L_dotProductVector[i]; } } @@ -211,186 +256,244 @@ static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_u_sse4_1(lv_32fc_t* E //Perform the carrier wipe-off bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); // Now get early, late, and prompt values for each - *E_out_ptr += bb_signal_sample * (*E_code_ptr++); - *P_out_ptr += bb_signal_sample * (*P_code_ptr++); - *L_out_ptr += bb_signal_sample * (*L_code_ptr++); + *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); + *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); + *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); } } - #endif /* LV_HAVE_SSE4_1 */ -//#ifdef LV_HAVE_SSE2 -//#include "emmintrin.h" -///*! -// \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation -// \param input The input signal input -// \param carrier The carrier signal input -// \param E_code Early PRN code replica input -// \param P_code Early PRN code replica input -// \param L_code Early PRN code replica input -// \param E_out Early correlation output -// \param P_out Early correlation output -// \param L_out Early correlation output -// \param num_points The number of complex values in vectors -// */ -//static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_u_sse2(lv_8sc_t* E_out, lv_8sc_t* P_out, lv_8sc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points) -//{ -// const unsigned int sse_iters = num_points / 8; -// -// __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample, real_E_code_acc, imag_E_code_acc, real_L_code_acc, imag_L_code_acc, real_P_code_acc, imag_P_code_acc; -// __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output; -// -// const lv_8sc_t* input_ptr = input; -// const lv_8sc_t* carrier_ptr = carrier; -// -// const lv_8sc_t* E_code_ptr = E_code; -// lv_8sc_t* E_out_ptr = E_out; -// const lv_8sc_t* L_code_ptr = L_code; -// lv_8sc_t* L_out_ptr = L_out; -// const lv_8sc_t* P_code_ptr = P_code; -// lv_8sc_t* P_out_ptr = P_out; -// -// *E_out_ptr = 0; -// *P_out_ptr = 0; -// *L_out_ptr = 0; -// -// mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); -// -// real_E_code_acc = _mm_setzero_si128(); -// imag_E_code_acc = _mm_setzero_si128(); -// real_L_code_acc = _mm_setzero_si128(); -// imag_L_code_acc = _mm_setzero_si128(); -// real_P_code_acc = _mm_setzero_si128(); -// imag_P_code_acc = _mm_setzero_si128(); -// -// if (sse_iters>0) -// { -// for(int number = 0;number < sse_iters; number++){ -// -// //Perform the carrier wipe-off -// x = _mm_lddqu_si128((__m128i*)input_ptr); -// y = _mm_lddqu_si128((__m128i*)carrier_ptr); -// -// imagx = _mm_srli_si128 (x, 1); -// imagx = _mm_and_si128 (imagx, mult1); -// realx = _mm_and_si128 (x, mult1); -// -// imagy = _mm_srli_si128 (y, 1); -// imagy = _mm_and_si128 (imagy, mult1); -// realy = _mm_and_si128 (y, mult1); -// -// realx_mult_realy = _mm_mullo_epi16 (realx, realy); -// imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); -// realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); -// imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); -// -// real_bb_signal_sample = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); -// imag_bb_signal_sample = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); -// -// //Get early values -// y = _mm_lddqu_si128((__m128i*)E_code_ptr); -// -// imagy = _mm_srli_si128 (y, 1); -// imagy = _mm_and_si128 (imagy, mult1); -// realy = _mm_and_si128 (y, mult1); -// -// realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); -// imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); -// realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); -// imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); -// -// real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); -// imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); -// -// real_E_code_acc = _mm_add_epi16 (real_E_code_acc, real_output); -// imag_E_code_acc = _mm_add_epi16 (imag_E_code_acc, imag_output); -// -// //Get late values -// y = _mm_lddqu_si128((__m128i*)L_code_ptr); -// -// imagy = _mm_srli_si128 (y, 1); -// imagy = _mm_and_si128 (imagy, mult1); -// realy = _mm_and_si128 (y, mult1); -// -// realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); -// imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); -// realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); -// imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); -// -// real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); -// imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); -// -// real_L_code_acc = _mm_add_epi16 (real_L_code_acc, real_output); -// imag_L_code_acc = _mm_add_epi16 (imag_L_code_acc, imag_output); -// -// //Get prompt values -// y = _mm_lddqu_si128((__m128i*)P_code_ptr); -// -// imagy = _mm_srli_si128 (y, 1); -// imagy = _mm_and_si128 (imagy, mult1); -// realy = _mm_and_si128 (y, mult1); -// -// realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); -// imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); -// realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); -// imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); -// -// real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); -// imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); -// -// real_P_code_acc = _mm_add_epi16 (real_P_code_acc, real_output); -// imag_P_code_acc = _mm_add_epi16 (imag_P_code_acc, imag_output); -// -// input_ptr += 8; -// carrier_ptr += 8; -// E_code_ptr += 8; -// L_code_ptr += 8; -// P_code_ptr += 8; -// } -// -// __VOLK_ATTR_ALIGNED(16) lv_8sc_t E_dotProductVector[8]; -// __VOLK_ATTR_ALIGNED(16) lv_8sc_t L_dotProductVector[8]; -// __VOLK_ATTR_ALIGNED(16) lv_8sc_t P_dotProductVector[8]; -// -// real_E_code_acc = _mm_and_si128 (real_E_code_acc, mult1); -// imag_E_code_acc = _mm_and_si128 (imag_E_code_acc, mult1); -// imag_E_code_acc = _mm_slli_si128 (imag_E_code_acc, 1); -// output = _mm_or_si128 (real_E_code_acc, imag_E_code_acc); -// _mm_storeu_si128((__m128i*)E_dotProductVector, output); -// -// real_L_code_acc = _mm_and_si128 (real_L_code_acc, mult1); -// imag_L_code_acc = _mm_and_si128 (imag_L_code_acc, mult1); -// imag_L_code_acc = _mm_slli_si128 (imag_L_code_acc, 1); -// output = _mm_or_si128 (real_L_code_acc, imag_L_code_acc); -// _mm_storeu_si128((__m128i*)L_dotProductVector, output); -// -// real_P_code_acc = _mm_and_si128 (real_P_code_acc, mult1); -// imag_P_code_acc = _mm_and_si128 (imag_P_code_acc, mult1); -// imag_P_code_acc = _mm_slli_si128 (imag_P_code_acc, 1); -// output = _mm_or_si128 (real_P_code_acc, imag_P_code_acc); -// _mm_storeu_si128((__m128i*)P_dotProductVector, output); -// -// for (int i = 0; i<8; ++i) -// { -// *E_out_ptr += E_dotProductVector[i]; -// *L_out_ptr += L_dotProductVector[i]; -// *P_out_ptr += P_dotProductVector[i]; -// } -// } -// -// lv_8sc_t bb_signal_sample; -// for(int i=0; i < num_points%8; ++i) -// { -// //Perform the carrier wipe-off -// bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); -// // Now get early, late, and prompt values for each -// *E_out_ptr += bb_signal_sample * (*E_code_ptr++); -// *P_out_ptr += bb_signal_sample * (*P_code_ptr++); -// *L_out_ptr += bb_signal_sample * (*L_code_ptr++); -// } -//} -// -//#endif /* LV_HAVE_SSE2 */ +#ifdef LV_HAVE_SSE2 +#include "emmintrin.h" +/*! + \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation + \param input The input signal input + \param carrier The carrier signal input + \param E_code Early PRN code replica input + \param P_code Early PRN code replica input + \param L_code Early PRN code replica input + \param E_out Early correlation output + \param P_out Early correlation output + \param L_out Early correlation output + \param num_points The number of complex values in vectors + */ +static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_u_sse2(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points) +{ + const unsigned int sse_iters = num_points / 8; + + __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample; + __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output; + + __m128 E_code_acc, P_code_acc, L_code_acc; + __m128i output_i_aux_1, output_i_aux_2, output_i_1, output_i_2, output_i_3, output_i_4; + __m128 output_ps_1, output_ps_2, output_ps_3, output_ps_4; + + const lv_8sc_t* input_ptr = input; + const lv_8sc_t* carrier_ptr = carrier; + + const lv_8sc_t* E_code_ptr = E_code; + lv_32fc_t* E_out_ptr = E_out; + const lv_8sc_t* L_code_ptr = L_code; + lv_32fc_t* L_out_ptr = L_out; + const lv_8sc_t* P_code_ptr = P_code; + lv_32fc_t* P_out_ptr = P_out; + + *E_out_ptr = 0; + *P_out_ptr = 0; + *L_out_ptr = 0; + + mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); + + E_code_acc = _mm_setzero_ps(); + L_code_acc = _mm_setzero_ps(); + P_code_acc = _mm_setzero_ps(); + + if (sse_iters>0) + { + for(int number = 0;number < sse_iters; number++){ + + //Perform the carrier wipe-off + x = _mm_lddqu_si128((__m128i*)input_ptr); + y = _mm_lddqu_si128((__m128i*)carrier_ptr); + + imagx = _mm_srli_si128 (x, 1); + imagx = _mm_and_si128 (imagx, mult1); + realx = _mm_and_si128 (x, mult1); + + imagy = _mm_srli_si128 (y, 1); + imagy = _mm_and_si128 (imagy, mult1); + realy = _mm_and_si128 (y, mult1); + + realx_mult_realy = _mm_mullo_epi16 (realx, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); + realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); + + real_bb_signal_sample = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_bb_signal_sample = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + //Get early values + y = _mm_lddqu_si128((__m128i*)E_code_ptr); + + imagy = _mm_srli_si128 (y, 1); + imagy = _mm_and_si128 (imagy, mult1); + realy = _mm_and_si128 (y, mult1); + + realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); + realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); + + real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + real_output = _mm_and_si128 (real_output, mult1); + imag_output = _mm_and_si128 (imag_output, mult1); + imag_output = _mm_slli_si128 (imag_output, 1); + output = _mm_or_si128 (real_output, imag_output); + + output_i_aux_1 = _mm_unpacklo_epi8(_mm_setzero_si128(), output); + output_i_aux_2 = _mm_unpackhi_epi8(_mm_setzero_si128(), output); + output_i_1 = _mm_unpacklo_epi16(_mm_setzero_si128(), output_i_aux_1); + output_i_2 = _mm_unpackhi_epi16(_mm_setzero_si128(), output_i_aux_1); + output_i_3 = _mm_unpacklo_epi16(_mm_setzero_si128(), output_i_aux_2); + output_i_4 = _mm_unpackhi_epi16(_mm_setzero_si128(), output_i_aux_2); + + //Shift, MAINTAINING THE SIGN!!! + output_i_1 = _mm_srai_epi32(output_i_1, 24); + output_i_2 = _mm_srai_epi32(output_i_2, 24); + output_i_3 = _mm_srai_epi32(output_i_3, 24); + output_i_4 = _mm_srai_epi32(output_i_4, 24); + + output_ps_1 = _mm_cvtepi32_ps(output_i_1); + output_ps_2 = _mm_cvtepi32_ps(output_i_2); + output_ps_3 = _mm_cvtepi32_ps(output_i_3); + output_ps_4 = _mm_cvtepi32_ps(output_i_4); + + E_code_acc = _mm_add_ps (E_code_acc, output_ps_1); + E_code_acc = _mm_add_ps (E_code_acc, output_ps_2); + E_code_acc = _mm_add_ps (E_code_acc, output_ps_3); + E_code_acc = _mm_add_ps (E_code_acc, output_ps_4); + + //Get prompt values + y = _mm_lddqu_si128((__m128i*)P_code_ptr); + + imagy = _mm_srli_si128 (y, 1); + imagy = _mm_and_si128 (imagy, mult1); + realy = _mm_and_si128 (y, mult1); + + realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); + realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); + + real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + real_output = _mm_and_si128 (real_output, mult1); + imag_output = _mm_and_si128 (imag_output, mult1); + imag_output = _mm_slli_si128 (imag_output, 1); + output = _mm_or_si128 (real_output, imag_output); + + output_i_aux_1 = _mm_unpacklo_epi8(_mm_setzero_si128(), output); + output_i_aux_2 = _mm_unpackhi_epi8(_mm_setzero_si128(), output); + output_i_1 = _mm_unpacklo_epi16(_mm_setzero_si128(), output_i_aux_1); + output_i_2 = _mm_unpackhi_epi16(_mm_setzero_si128(), output_i_aux_1); + output_i_3 = _mm_unpacklo_epi16(_mm_setzero_si128(), output_i_aux_2); + output_i_4 = _mm_unpackhi_epi16(_mm_setzero_si128(), output_i_aux_2); + + output_i_1 = _mm_srai_epi32(output_i_1, 24); + output_i_2 = _mm_srai_epi32(output_i_2, 24); + output_i_3 = _mm_srai_epi32(output_i_3, 24); + output_i_4 = _mm_srai_epi32(output_i_4, 24); + + output_ps_1 = _mm_cvtepi32_ps(output_i_1); + output_ps_2 = _mm_cvtepi32_ps(output_i_2); + output_ps_3 = _mm_cvtepi32_ps(output_i_3); + output_ps_4 = _mm_cvtepi32_ps(output_i_4); + + P_code_acc = _mm_add_ps (P_code_acc, output_ps_1); + P_code_acc = _mm_add_ps (P_code_acc, output_ps_2); + P_code_acc = _mm_add_ps (P_code_acc, output_ps_3); + P_code_acc = _mm_add_ps (P_code_acc, output_ps_4); + + //Get late values + y = _mm_lddqu_si128((__m128i*)L_code_ptr); + + imagy = _mm_srli_si128 (y, 1); + imagy = _mm_and_si128 (imagy, mult1); + realy = _mm_and_si128 (y, mult1); + + realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); + realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); + + real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + real_output = _mm_and_si128 (real_output, mult1); + imag_output = _mm_and_si128 (imag_output, mult1); + imag_output = _mm_slli_si128 (imag_output, 1); + output = _mm_or_si128 (real_output, imag_output); + + output_i_aux_1 = _mm_unpacklo_epi8(_mm_setzero_si128(), output); + output_i_aux_2 = _mm_unpackhi_epi8(_mm_setzero_si128(), output); + output_i_1 = _mm_unpacklo_epi16(_mm_setzero_si128(), output_i_aux_1); + output_i_2 = _mm_unpackhi_epi16(_mm_setzero_si128(), output_i_aux_1); + output_i_3 = _mm_unpacklo_epi16(_mm_setzero_si128(), output_i_aux_2); + output_i_4 = _mm_unpackhi_epi16(_mm_setzero_si128(), output_i_aux_2); + + output_i_1 = _mm_srai_epi32(output_i_1, 24); + output_i_2 = _mm_srai_epi32(output_i_2, 24); + output_i_3 = _mm_srai_epi32(output_i_3, 24); + output_i_4 = _mm_srai_epi32(output_i_4, 24); + + output_ps_1 = _mm_cvtepi32_ps(output_i_1); + output_ps_2 = _mm_cvtepi32_ps(output_i_2); + output_ps_3 = _mm_cvtepi32_ps(output_i_3); + output_ps_4 = _mm_cvtepi32_ps(output_i_4); + + L_code_acc = _mm_add_ps (L_code_acc, output_ps_1); + L_code_acc = _mm_add_ps (L_code_acc, output_ps_2); + L_code_acc = _mm_add_ps (L_code_acc, output_ps_3); + L_code_acc = _mm_add_ps (L_code_acc, output_ps_4); + + input_ptr += 8; + carrier_ptr += 8; + E_code_ptr += 8; + P_code_ptr += 8; + L_code_ptr += 8; + } + + __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2]; + __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2]; + __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2]; + + _mm_storeu_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector + + for (int i = 0; i<2; ++i) + { + *E_out_ptr += E_dotProductVector[i]; + *P_out_ptr += P_dotProductVector[i]; + *L_out_ptr += L_dotProductVector[i]; + } + } + + lv_8sc_t bb_signal_sample; + for(int i=0; i < num_points%8; ++i) + { + //Perform the carrier wipe-off + bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); + // Now get early, late, and prompt values for each + *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); + *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); + *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); + } +} +#endif /* LV_HAVE_SSE2 */ #ifdef LV_HAVE_GENERIC /*! @@ -408,9 +511,6 @@ static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_u_sse4_1(lv_32fc_t* E static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_generic(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points) { lv_8sc_t bb_signal_sample; - lv_16sc_t tmp1; - lv_16sc_t tmp2; - lv_16sc_t tmp3; bb_signal_sample = lv_cmake(0, 0); @@ -422,15 +522,10 @@ static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_generic(lv_32fc_t* E_ { //Perform the carrier wipe-off bb_signal_sample = input[i] * carrier[i]; - - tmp1 = bb_signal_sample * E_code[i]; - tmp2 = bb_signal_sample * P_code[i]; - tmp3 = bb_signal_sample * L_code[i]; - // Now get early, late, and prompt values for each - *E_out += tmp1; - *P_out += tmp2; - *L_out += tmp3; + *E_out += (lv_32fc_t) (bb_signal_sample * E_code[i]); + *P_out += (lv_32fc_t) (bb_signal_sample * P_code[i]); + *L_out += (lv_32fc_t) (bb_signal_sample * L_code[i]); } } @@ -439,444 +534,489 @@ static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_generic(lv_32fc_t* E_ #endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_u_H */ -//#ifndef INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_a_H -//#define INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_a_H -// -//#include -//#include -//#include -//#include -//#include -// -//#ifdef LV_HAVE_SSE4_1 -//#include "smmintrin.h" -///*! -// \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation -// \param input The input signal input -// \param carrier The carrier signal input -// \param E_code Early PRN code replica input -// \param P_code Early PRN code replica input -// \param L_code Early PRN code replica input -// \param E_out Early correlation output -// \param P_out Early correlation output -// \param L_out Early correlation output -// \param num_points The number of complex values in vectors -// */ -//static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_a_sse4_1(lv_8sc_t* E_out, lv_8sc_t* P_out, lv_8sc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points) -//{ -// const unsigned int sse_iters = num_points / 8; -// -// __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample, real_E_code_acc, imag_E_code_acc, real_L_code_acc, imag_L_code_acc, real_P_code_acc, imag_P_code_acc; -// __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output; -// -// const lv_8sc_t* input_ptr = input; -// const lv_8sc_t* carrier_ptr = carrier; -// -// const lv_8sc_t* E_code_ptr = E_code; -// lv_8sc_t* E_out_ptr = E_out; -// const lv_8sc_t* L_code_ptr = L_code; -// lv_8sc_t* L_out_ptr = L_out; -// const lv_8sc_t* P_code_ptr = P_code; -// lv_8sc_t* P_out_ptr = P_out; -// -// *E_out_ptr = 0; -// *P_out_ptr = 0; -// *L_out_ptr = 0; -// -// mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); -// -// real_E_code_acc = _mm_setzero_si128(); -// imag_E_code_acc = _mm_setzero_si128(); -// real_L_code_acc = _mm_setzero_si128(); -// imag_L_code_acc = _mm_setzero_si128(); -// real_P_code_acc = _mm_setzero_si128(); -// imag_P_code_acc = _mm_setzero_si128(); -// -// if (sse_iters>0) -// { -// for(int number = 0;number < sse_iters; number++){ -// -// //Perform the carrier wipe-off -// x = _mm_load_si128((__m128i*)input_ptr); -// y = _mm_load_si128((__m128i*)carrier_ptr); -// -// imagx = _mm_srli_si128 (x, 1); -// imagx = _mm_and_si128 (imagx, mult1); -// realx = _mm_and_si128 (x, mult1); -// -// imagy = _mm_srli_si128 (y, 1); -// imagy = _mm_and_si128 (imagy, mult1); -// realy = _mm_and_si128 (y, mult1); -// -// realx_mult_realy = _mm_mullo_epi16 (realx, realy); -// imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); -// realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); -// imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); -// -// real_bb_signal_sample = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); -// imag_bb_signal_sample = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); -// -// //Get early values -// y = _mm_load_si128((__m128i*)E_code_ptr); -// -// imagy = _mm_srli_si128 (y, 1); -// imagy = _mm_and_si128 (imagy, mult1); -// realy = _mm_and_si128 (y, mult1); -// -// realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); -// imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); -// realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); -// imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); -// -// real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); -// imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); -// -// real_E_code_acc = _mm_add_epi16 (real_E_code_acc, real_output); -// imag_E_code_acc = _mm_add_epi16 (imag_E_code_acc, imag_output); -// -// //Get late values -// y = _mm_load_si128((__m128i*)L_code_ptr); -// -// imagy = _mm_srli_si128 (y, 1); -// imagy = _mm_and_si128 (imagy, mult1); -// realy = _mm_and_si128 (y, mult1); -// -// realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); -// imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); -// realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); -// imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); -// -// real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); -// imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); -// -// real_L_code_acc = _mm_add_epi16 (real_L_code_acc, real_output); -// imag_L_code_acc = _mm_add_epi16 (imag_L_code_acc, imag_output); -// -// //Get prompt values -// y = _mm_load_si128((__m128i*)P_code_ptr); -// -// imagy = _mm_srli_si128 (y, 1); -// imagy = _mm_and_si128 (imagy, mult1); -// realy = _mm_and_si128 (y, mult1); -// -// realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); -// imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); -// realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); -// imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); -// -// real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); -// imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); -// -// real_P_code_acc = _mm_add_epi16 (real_P_code_acc, real_output); -// imag_P_code_acc = _mm_add_epi16 (imag_P_code_acc, imag_output); -// -// input_ptr += 8; -// carrier_ptr += 8; -// E_code_ptr += 8; -// L_code_ptr += 8; -// P_code_ptr += 8; -// } -// -// __VOLK_ATTR_ALIGNED(16) lv_8sc_t E_dotProductVector[8]; -// __VOLK_ATTR_ALIGNED(16) lv_8sc_t L_dotProductVector[8]; -// __VOLK_ATTR_ALIGNED(16) lv_8sc_t P_dotProductVector[8]; -// -// imag_E_code_acc = _mm_slli_si128 (imag_E_code_acc, 1); -// output = _mm_blendv_epi8 (imag_E_code_acc, real_E_code_acc, mult1); -// _mm_store_si128((__m128i*)E_dotProductVector, output); -// -// imag_L_code_acc = _mm_slli_si128 (imag_L_code_acc, 1); -// output = _mm_blendv_epi8 (imag_L_code_acc, real_L_code_acc, mult1); -// _mm_store_si128((__m128i*)L_dotProductVector, output); -// -// imag_P_code_acc = _mm_slli_si128 (imag_P_code_acc, 1); -// output = _mm_blendv_epi8 (imag_P_code_acc, real_P_code_acc, mult1); -// _mm_store_si128((__m128i*)P_dotProductVector, output); -// -// for (int i = 0; i<8; ++i) -// { -// *E_out_ptr += E_dotProductVector[i]; -// *L_out_ptr += L_dotProductVector[i]; -// *P_out_ptr += P_dotProductVector[i]; -// } -// } -// -// lv_8sc_t bb_signal_sample; -// for(int i=0; i < num_points%8; ++i) -// { -// //Perform the carrier wipe-off -// bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); -// // Now get early, late, and prompt values for each -// *E_out_ptr += bb_signal_sample * (*E_code_ptr++); -// *P_out_ptr += bb_signal_sample * (*P_code_ptr++); -// *L_out_ptr += bb_signal_sample * (*L_code_ptr++); -// } -//} -// -//#endif /* LV_HAVE_SSE4_1 */ -// -//#ifdef LV_HAVE_SSE2 -//#include "emmintrin.h" -///*! -// \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation -// \param input The input signal input -// \param carrier The carrier signal input -// \param E_code Early PRN code replica input -// \param P_code Early PRN code replica input -// \param L_code Early PRN code replica input -// \param E_out Early correlation output -// \param P_out Early correlation output -// \param L_out Early correlation output -// \param num_points The number of complex values in vectors -// */ -//static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_a_sse2(lv_8sc_t* E_out, lv_8sc_t* P_out, lv_8sc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points) -//{ -// const unsigned int sse_iters = num_points / 8; -// -// __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample, real_E_code_acc, imag_E_code_acc, real_L_code_acc, imag_L_code_acc, real_P_code_acc, imag_P_code_acc; -// __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output; -// -// const lv_8sc_t* input_ptr = input; -// const lv_8sc_t* carrier_ptr = carrier; -// -// const lv_8sc_t* E_code_ptr = E_code; -// lv_8sc_t* E_out_ptr = E_out; -// const lv_8sc_t* L_code_ptr = L_code; -// lv_8sc_t* L_out_ptr = L_out; -// const lv_8sc_t* P_code_ptr = P_code; -// lv_8sc_t* P_out_ptr = P_out; -// -// *E_out_ptr = 0; -// *P_out_ptr = 0; -// *L_out_ptr = 0; -// -// mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); -// -// real_E_code_acc = _mm_setzero_si128(); -// imag_E_code_acc = _mm_setzero_si128(); -// real_L_code_acc = _mm_setzero_si128(); -// imag_L_code_acc = _mm_setzero_si128(); -// real_P_code_acc = _mm_setzero_si128(); -// imag_P_code_acc = _mm_setzero_si128(); -// -// if (sse_iters>0) -// { -// for(int number = 0;number < sse_iters; number++){ -// -// //Perform the carrier wipe-off -// x = _mm_load_si128((__m128i*)input_ptr); -// y = _mm_load_si128((__m128i*)carrier_ptr); -// -// imagx = _mm_srli_si128 (x, 1); -// imagx = _mm_and_si128 (imagx, mult1); -// realx = _mm_and_si128 (x, mult1); -// -// imagy = _mm_srli_si128 (y, 1); -// imagy = _mm_and_si128 (imagy, mult1); -// realy = _mm_and_si128 (y, mult1); -// -// realx_mult_realy = _mm_mullo_epi16 (realx, realy); -// imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); -// realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); -// imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); -// -// real_bb_signal_sample = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); -// imag_bb_signal_sample = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); -// -// //Get early values -// y = _mm_load_si128((__m128i*)E_code_ptr); -// -// imagy = _mm_srli_si128 (y, 1); -// imagy = _mm_and_si128 (imagy, mult1); -// realy = _mm_and_si128 (y, mult1); -// -// realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); -// imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); -// realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); -// imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); -// -// real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); -// imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); -// -// real_E_code_acc = _mm_add_epi16 (real_E_code_acc, real_output); -// imag_E_code_acc = _mm_add_epi16 (imag_E_code_acc, imag_output); -// -// //Get late values -// y = _mm_load_si128((__m128i*)L_code_ptr); -// -// imagy = _mm_srli_si128 (y, 1); -// imagy = _mm_and_si128 (imagy, mult1); -// realy = _mm_and_si128 (y, mult1); -// -// realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); -// imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); -// realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); -// imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); -// -// real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); -// imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); -// -// real_L_code_acc = _mm_add_epi16 (real_L_code_acc, real_output); -// imag_L_code_acc = _mm_add_epi16 (imag_L_code_acc, imag_output); -// -// //Get prompt values -// y = _mm_load_si128((__m128i*)P_code_ptr); -// -// imagy = _mm_srli_si128 (y, 1); -// imagy = _mm_and_si128 (imagy, mult1); -// realy = _mm_and_si128 (y, mult1); -// -// realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); -// imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); -// realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); -// imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); -// -// real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); -// imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); -// -// real_P_code_acc = _mm_add_epi16 (real_P_code_acc, real_output); -// imag_P_code_acc = _mm_add_epi16 (imag_P_code_acc, imag_output); -// -// input_ptr += 8; -// carrier_ptr += 8; -// E_code_ptr += 8; -// L_code_ptr += 8; -// P_code_ptr += 8; -// } -// -// __VOLK_ATTR_ALIGNED(16) lv_8sc_t E_dotProductVector[8]; -// __VOLK_ATTR_ALIGNED(16) lv_8sc_t L_dotProductVector[8]; -// __VOLK_ATTR_ALIGNED(16) lv_8sc_t P_dotProductVector[8]; -// -// real_E_code_acc = _mm_and_si128 (real_E_code_acc, mult1); -// imag_E_code_acc = _mm_and_si128 (imag_E_code_acc, mult1); -// imag_E_code_acc = _mm_slli_si128 (imag_E_code_acc, 1); -// output = _mm_or_si128 (real_E_code_acc, imag_E_code_acc); -// _mm_store_si128((__m128i*)E_dotProductVector, output); -// -// real_L_code_acc = _mm_and_si128 (real_L_code_acc, mult1); -// imag_L_code_acc = _mm_and_si128 (imag_L_code_acc, mult1); -// imag_L_code_acc = _mm_slli_si128 (imag_L_code_acc, 1); -// output = _mm_or_si128 (real_L_code_acc, imag_L_code_acc); -// _mm_store_si128((__m128i*)L_dotProductVector, output); -// -// real_P_code_acc = _mm_and_si128 (real_P_code_acc, mult1); -// imag_P_code_acc = _mm_and_si128 (imag_P_code_acc, mult1); -// imag_P_code_acc = _mm_slli_si128 (imag_P_code_acc, 1); -// output = _mm_or_si128 (real_P_code_acc, imag_P_code_acc); -// _mm_store_si128((__m128i*)P_dotProductVector, output); -// -// for (int i = 0; i<8; ++i) -// { -// *E_out_ptr += E_dotProductVector[i]; -// *L_out_ptr += L_dotProductVector[i]; -// *P_out_ptr += P_dotProductVector[i]; -// } -// } -// -// lv_8sc_t bb_signal_sample; -// for(int i=0; i < num_points%8; ++i) -// { -// //Perform the carrier wipe-off -// bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); -// // Now get early, late, and prompt values for each -// *E_out_ptr += bb_signal_sample * (*E_code_ptr++); -// *P_out_ptr += bb_signal_sample * (*P_code_ptr++); -// *L_out_ptr += bb_signal_sample * (*L_code_ptr++); -// } -//} -// -//#endif /* LV_HAVE_SSE2 */ -// -//#ifdef LV_HAVE_GENERIC -///*! -// \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation -// \param input The input signal input -// \param carrier The carrier signal input -// \param E_code Early PRN code replica input -// \param P_code Early PRN code replica input -// \param L_code Early PRN code replica input -// \param E_out Early correlation output -// \param P_out Early correlation output -// \param L_out Early correlation output -// \param num_points The number of complex values in vectors -// */ -//static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_a_generic(lv_8sc_t* E_out, lv_8sc_t* P_out, lv_8sc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points) -//{ -// lv_8sc_t bb_signal_sample; -// -// bb_signal_sample = lv_cmake(0, 0); -// -// *E_out = 0; -// *P_out = 0; -// *L_out = 0; -// // perform Early, Prompt and Late correlation -// for(int i=0; i < num_points; ++i) -// { -// //Perform the carrier wipe-off -// bb_signal_sample = input[i] * carrier[i]; -// // Now get early, late, and prompt values for each -// *E_out += bb_signal_sample * E_code[i]; -// *P_out += bb_signal_sample * P_code[i]; -// *L_out += bb_signal_sample * L_code[i]; -// } -//} -// -//#endif /* LV_HAVE_GENERIC */ -// -//#ifdef LV_HAVE_ORC -///*! -// \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation -// \param input The input signal input -// \param carrier The carrier signal input -// \param E_code Early PRN code replica input -// \param P_code Early PRN code replica input -// \param L_code Early PRN code replica input -// \param E_out Early correlation output -// \param P_out Early correlation output -// \param L_out Early correlation output -// \param num_points The number of complex values in vectors -// */ -// -//extern void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_first_a_orc_impl(short* E_out_real, short* E_out_imag, short* P_out_real, short* P_out_imag, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, unsigned int num_points); -//extern void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_second_a_orc_impl(short* L_out_real, short* L_out_imag, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* L_code, unsigned int num_points); -//static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_u_orc(lv_8sc_t* E_out, lv_8sc_t* P_out, lv_8sc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points){ -// -// short E_out_real = 0; -// short E_out_imag = 0; -// char* E_out_real_c = (char*)&E_out_real; -// E_out_real_c++; -// char* E_out_imag_c = (char*)&E_out_imag; -// E_out_imag_c++; -// -// short P_out_real = 0; -// short P_out_imag = 0; -// char* P_out_real_c = (char*)&P_out_real; -// P_out_real_c++; -// char* P_out_imag_c = (char*)&P_out_imag; -// P_out_imag_c++; -// -// short L_out_real = 0; -// short L_out_imag = 0; -// char* L_out_real_c = (char*)&L_out_real; -// L_out_real_c++; -// char* L_out_imag_c = (char*)&L_out_imag; -// L_out_imag_c++; -// -// volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_first_a_orc_impl( &E_out_real, &E_out_imag, &P_out_real, &P_out_imag, input, carrier, E_code, P_code, num_points); -// volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_second_a_orc_impl( &L_out_real, &L_out_imag, input, carrier, L_code, num_points); -// -// //ORC implementation of 8ic_x5_cw_epl_corr_32fc_x3 is done in two different functions because it seems that -// //in one function the length of the code gives memory problems (bad access, segmentation fault). -// //Also, the maximum number of accumulators that can be used is 4 (and we need 6). -// //The "carrier wipe-off" step is done two times: one in the first function and another one in the second. -// //Joining all the ORC code in one function would be quicker because the "carrier wipe-off" step would be done just -// //one time. -// -// *E_out = lv_cmake(*E_out_real_c, *E_out_imag_c); -// *P_out = lv_cmake(*P_out_real_c, *P_out_imag_c); -// *L_out = lv_cmake(*L_out_real_c, *L_out_imag_c); -//} -//#endif /* LV_HAVE_ORC */ -// -//#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_a_H */ +#ifndef INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_a_H +#define INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_a_H + +#include +#include +#include +#include +#include + +#ifdef LV_HAVE_SSE4_1 +#include "smmintrin.h" +/*! + \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation + \param input The input signal input + \param carrier The carrier signal input + \param E_code Early PRN code replica input + \param P_code Early PRN code replica input + \param L_code Early PRN code replica input + \param E_out Early correlation output + \param P_out Early correlation output + \param L_out Early correlation output + \param num_points The number of complex values in vectors + */ +static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_a_sse4_1(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points) +{ + const unsigned int sse_iters = num_points / 8; + + __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample; + __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output; + + __m128 E_code_acc, P_code_acc, L_code_acc; + __m128i output_i_1, output_i_2, output_i_3, output_i_4; + __m128 output_ps_1, output_ps_2, output_ps_3, output_ps_4; + + const lv_8sc_t* input_ptr = input; + const lv_8sc_t* carrier_ptr = carrier; + + const lv_8sc_t* E_code_ptr = E_code; + lv_32fc_t* E_out_ptr = E_out; + const lv_8sc_t* L_code_ptr = L_code; + lv_32fc_t* L_out_ptr = L_out; + const lv_8sc_t* P_code_ptr = P_code; + lv_32fc_t* P_out_ptr = P_out; + + *E_out_ptr = 0; + *P_out_ptr = 0; + *L_out_ptr = 0; + + mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); + + E_code_acc = _mm_setzero_ps(); + L_code_acc = _mm_setzero_ps(); + P_code_acc = _mm_setzero_ps(); + + if (sse_iters>0) + { + for(int number = 0;number < sse_iters; number++){ + + //Perform the carrier wipe-off + x = _mm_load_si128((__m128i*)input_ptr); + y = _mm_load_si128((__m128i*)carrier_ptr); + + imagx = _mm_srli_si128 (x, 1); + imagx = _mm_and_si128 (imagx, mult1); + realx = _mm_and_si128 (x, mult1); + + imagy = _mm_srli_si128 (y, 1); + imagy = _mm_and_si128 (imagy, mult1); + realy = _mm_and_si128 (y, mult1); + + realx_mult_realy = _mm_mullo_epi16 (realx, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); + realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); + + real_bb_signal_sample = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_bb_signal_sample = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + //Get early values + y = _mm_load_si128((__m128i*)E_code_ptr); + + imagy = _mm_srli_si128 (y, 1); + imagy = _mm_and_si128 (imagy, mult1); + realy = _mm_and_si128 (y, mult1); + + realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); + realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); + + real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + imag_output = _mm_slli_si128 (imag_output, 1); + output = _mm_blendv_epi8 (imag_output, real_output, mult1); + + output_i_1 = _mm_cvtepi8_epi32(output); + output_ps_1 = _mm_cvtepi32_ps(output_i_1); + output = _mm_srli_si128 (output, 4); + output_i_2 = _mm_cvtepi8_epi32(output); + output_ps_2 = _mm_cvtepi32_ps(output_i_2); + output = _mm_srli_si128 (output, 4); + output_i_3 = _mm_cvtepi8_epi32(output); + output_ps_3 = _mm_cvtepi32_ps(output_i_3); + output = _mm_srli_si128 (output, 4); + output_i_4 = _mm_cvtepi8_epi32(output); + output_ps_4 = _mm_cvtepi32_ps(output_i_4); + + E_code_acc = _mm_add_ps (E_code_acc, output_ps_1); + E_code_acc = _mm_add_ps (E_code_acc, output_ps_2); + E_code_acc = _mm_add_ps (E_code_acc, output_ps_3); + E_code_acc = _mm_add_ps (E_code_acc, output_ps_4); + + //Get prompt values + y = _mm_load_si128((__m128i*)P_code_ptr); + + imagy = _mm_srli_si128 (y, 1); + imagy = _mm_and_si128 (imagy, mult1); + realy = _mm_and_si128 (y, mult1); + + realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); + realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); + + real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + imag_output = _mm_slli_si128 (imag_output, 1); + output = _mm_blendv_epi8 (imag_output, real_output, mult1); + + output_i_1 = _mm_cvtepi8_epi32(output); + output_ps_1 = _mm_cvtepi32_ps(output_i_1); + output = _mm_srli_si128 (output, 4); + output_i_2 = _mm_cvtepi8_epi32(output); + output_ps_2 = _mm_cvtepi32_ps(output_i_2); + output = _mm_srli_si128 (output, 4); + output_i_3 = _mm_cvtepi8_epi32(output); + output_ps_3 = _mm_cvtepi32_ps(output_i_3); + output = _mm_srli_si128 (output, 4); + output_i_4 = _mm_cvtepi8_epi32(output); + output_ps_4 = _mm_cvtepi32_ps(output_i_4); + + P_code_acc = _mm_add_ps (P_code_acc, output_ps_1); + P_code_acc = _mm_add_ps (P_code_acc, output_ps_2); + P_code_acc = _mm_add_ps (P_code_acc, output_ps_3); + P_code_acc = _mm_add_ps (P_code_acc, output_ps_4); + + //Get late values + y = _mm_load_si128((__m128i*)L_code_ptr); + + imagy = _mm_srli_si128 (y, 1); + imagy = _mm_and_si128 (imagy, mult1); + realy = _mm_and_si128 (y, mult1); + + realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); + realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); + + real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + imag_output = _mm_slli_si128 (imag_output, 1); + output = _mm_blendv_epi8 (imag_output, real_output, mult1); + + output_i_1 = _mm_cvtepi8_epi32(output); + output_ps_1 = _mm_cvtepi32_ps(output_i_1); + output = _mm_srli_si128 (output, 4); + output_i_2 = _mm_cvtepi8_epi32(output); + output_ps_2 = _mm_cvtepi32_ps(output_i_2); + output = _mm_srli_si128 (output, 4); + output_i_3 = _mm_cvtepi8_epi32(output); + output_ps_3 = _mm_cvtepi32_ps(output_i_3); + output = _mm_srli_si128 (output, 4); + output_i_4 = _mm_cvtepi8_epi32(output); + output_ps_4 = _mm_cvtepi32_ps(output_i_4); + + L_code_acc = _mm_add_ps (L_code_acc, output_ps_1); + L_code_acc = _mm_add_ps (L_code_acc, output_ps_2); + L_code_acc = _mm_add_ps (L_code_acc, output_ps_3); + L_code_acc = _mm_add_ps (L_code_acc, output_ps_4); + + input_ptr += 8; + carrier_ptr += 8; + E_code_ptr += 8; + P_code_ptr += 8; + L_code_ptr += 8; + } + + __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2]; + __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2]; + __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2]; + + _mm_store_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector + _mm_store_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector + _mm_store_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector + + for (int i = 0; i<2; ++i) + { + *E_out_ptr += E_dotProductVector[i]; + *P_out_ptr += P_dotProductVector[i]; + *L_out_ptr += L_dotProductVector[i]; + } + } + + lv_8sc_t bb_signal_sample; + for(int i=0; i < num_points%8; ++i) + { + //Perform the carrier wipe-off + bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); + // Now get early, late, and prompt values for each + *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); + *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); + *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#ifdef LV_HAVE_SSE2 +#include "emmintrin.h" +/*! + \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation + \param input The input signal input + \param carrier The carrier signal input + \param E_code Early PRN code replica input + \param P_code Early PRN code replica input + \param L_code Early PRN code replica input + \param E_out Early correlation output + \param P_out Early correlation output + \param L_out Early correlation output + \param num_points The number of complex values in vectors + */ +static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_a_sse2(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points) +{ + const unsigned int sse_iters = num_points / 8; + + __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample; + __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output; + + __m128 E_code_acc, P_code_acc, L_code_acc; + __m128i output_i_aux_1, output_i_aux_2, output_i_1, output_i_2, output_i_3, output_i_4; + __m128 output_ps_1, output_ps_2, output_ps_3, output_ps_4; + + const lv_8sc_t* input_ptr = input; + const lv_8sc_t* carrier_ptr = carrier; + + const lv_8sc_t* E_code_ptr = E_code; + lv_32fc_t* E_out_ptr = E_out; + const lv_8sc_t* L_code_ptr = L_code; + lv_32fc_t* L_out_ptr = L_out; + const lv_8sc_t* P_code_ptr = P_code; + lv_32fc_t* P_out_ptr = P_out; + + *E_out_ptr = 0; + *P_out_ptr = 0; + *L_out_ptr = 0; + + mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); + + E_code_acc = _mm_setzero_ps(); + L_code_acc = _mm_setzero_ps(); + P_code_acc = _mm_setzero_ps(); + + if (sse_iters>0) + { + for(int number = 0;number < sse_iters; number++){ + + //Perform the carrier wipe-off + x = _mm_load_si128((__m128i*)input_ptr); + y = _mm_load_si128((__m128i*)carrier_ptr); + + imagx = _mm_srli_si128 (x, 1); + imagx = _mm_and_si128 (imagx, mult1); + realx = _mm_and_si128 (x, mult1); + + imagy = _mm_srli_si128 (y, 1); + imagy = _mm_and_si128 (imagy, mult1); + realy = _mm_and_si128 (y, mult1); + + realx_mult_realy = _mm_mullo_epi16 (realx, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); + realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); + + real_bb_signal_sample = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_bb_signal_sample = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + //Get early values + y = _mm_load_si128((__m128i*)E_code_ptr); + + imagy = _mm_srli_si128 (y, 1); + imagy = _mm_and_si128 (imagy, mult1); + realy = _mm_and_si128 (y, mult1); + + realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); + realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); + + real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + real_output = _mm_and_si128 (real_output, mult1); + imag_output = _mm_and_si128 (imag_output, mult1); + imag_output = _mm_slli_si128 (imag_output, 1); + output = _mm_or_si128 (real_output, imag_output); + + output_i_aux_1 = _mm_unpacklo_epi8(_mm_setzero_si128(), output); + output_i_aux_2 = _mm_unpackhi_epi8(_mm_setzero_si128(), output); + output_i_1 = _mm_unpacklo_epi16(_mm_setzero_si128(), output_i_aux_1); + output_i_2 = _mm_unpackhi_epi16(_mm_setzero_si128(), output_i_aux_1); + output_i_3 = _mm_unpacklo_epi16(_mm_setzero_si128(), output_i_aux_2); + output_i_4 = _mm_unpackhi_epi16(_mm_setzero_si128(), output_i_aux_2); + + output_i_1 = _mm_srai_epi32(output_i_1, 24); + output_i_2 = _mm_srai_epi32(output_i_2, 24); + output_i_3 = _mm_srai_epi32(output_i_3, 24); + output_i_4 = _mm_srai_epi32(output_i_4, 24); + + output_ps_1 = _mm_cvtepi32_ps(output_i_1); + output_ps_2 = _mm_cvtepi32_ps(output_i_2); + output_ps_3 = _mm_cvtepi32_ps(output_i_3); + output_ps_4 = _mm_cvtepi32_ps(output_i_4); + + E_code_acc = _mm_add_ps (E_code_acc, output_ps_1); + E_code_acc = _mm_add_ps (E_code_acc, output_ps_2); + E_code_acc = _mm_add_ps (E_code_acc, output_ps_3); + E_code_acc = _mm_add_ps (E_code_acc, output_ps_4); + + //Get prompt values + y = _mm_load_si128((__m128i*)P_code_ptr); + + imagy = _mm_srli_si128 (y, 1); + imagy = _mm_and_si128 (imagy, mult1); + realy = _mm_and_si128 (y, mult1); + + realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); + realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); + + real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + real_output = _mm_and_si128 (real_output, mult1); + imag_output = _mm_and_si128 (imag_output, mult1); + imag_output = _mm_slli_si128 (imag_output, 1); + output = _mm_or_si128 (real_output, imag_output); + + output_i_aux_1 = _mm_unpacklo_epi8(_mm_setzero_si128(), output); + output_i_aux_2 = _mm_unpackhi_epi8(_mm_setzero_si128(), output); + output_i_1 = _mm_unpacklo_epi16(_mm_setzero_si128(), output_i_aux_1); + output_i_2 = _mm_unpackhi_epi16(_mm_setzero_si128(), output_i_aux_1); + output_i_3 = _mm_unpacklo_epi16(_mm_setzero_si128(), output_i_aux_2); + output_i_4 = _mm_unpackhi_epi16(_mm_setzero_si128(), output_i_aux_2); + + output_i_1 = _mm_srai_epi32(output_i_1, 24); + output_i_2 = _mm_srai_epi32(output_i_2, 24); + output_i_3 = _mm_srai_epi32(output_i_3, 24); + output_i_4 = _mm_srai_epi32(output_i_4, 24); + + output_ps_1 = _mm_cvtepi32_ps(output_i_1); + output_ps_2 = _mm_cvtepi32_ps(output_i_2); + output_ps_3 = _mm_cvtepi32_ps(output_i_3); + output_ps_4 = _mm_cvtepi32_ps(output_i_4); + + P_code_acc = _mm_add_ps (P_code_acc, output_ps_1); + P_code_acc = _mm_add_ps (P_code_acc, output_ps_2); + P_code_acc = _mm_add_ps (P_code_acc, output_ps_3); + P_code_acc = _mm_add_ps (P_code_acc, output_ps_4); + + //Get late values + y = _mm_load_si128((__m128i*)L_code_ptr); + + imagy = _mm_srli_si128 (y, 1); + imagy = _mm_and_si128 (imagy, mult1); + realy = _mm_and_si128 (y, mult1); + + realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); + realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); + + real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + real_output = _mm_and_si128 (real_output, mult1); + imag_output = _mm_and_si128 (imag_output, mult1); + imag_output = _mm_slli_si128 (imag_output, 1); + output = _mm_or_si128 (real_output, imag_output); + + output_i_aux_1 = _mm_unpacklo_epi8(_mm_setzero_si128(), output); + output_i_aux_2 = _mm_unpackhi_epi8(_mm_setzero_si128(), output); + output_i_1 = _mm_unpacklo_epi16(_mm_setzero_si128(), output_i_aux_1); + output_i_2 = _mm_unpackhi_epi16(_mm_setzero_si128(), output_i_aux_1); + output_i_3 = _mm_unpacklo_epi16(_mm_setzero_si128(), output_i_aux_2); + output_i_4 = _mm_unpackhi_epi16(_mm_setzero_si128(), output_i_aux_2); + + output_i_1 = _mm_srai_epi32(output_i_1, 24); + output_i_2 = _mm_srai_epi32(output_i_2, 24); + output_i_3 = _mm_srai_epi32(output_i_3, 24); + output_i_4 = _mm_srai_epi32(output_i_4, 24); + + output_ps_1 = _mm_cvtepi32_ps(output_i_1); + output_ps_2 = _mm_cvtepi32_ps(output_i_2); + output_ps_3 = _mm_cvtepi32_ps(output_i_3); + output_ps_4 = _mm_cvtepi32_ps(output_i_4); + + L_code_acc = _mm_add_ps (L_code_acc, output_ps_1); + L_code_acc = _mm_add_ps (L_code_acc, output_ps_2); + L_code_acc = _mm_add_ps (L_code_acc, output_ps_3); + L_code_acc = _mm_add_ps (L_code_acc, output_ps_4); + + input_ptr += 8; + carrier_ptr += 8; + E_code_ptr += 8; + L_code_ptr += 8; + P_code_ptr += 8; + } + + __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2]; + __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2]; + __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2]; + + _mm_store_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector + _mm_store_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector + _mm_store_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector + + for (int i = 0; i<2; ++i) + { + *E_out_ptr += E_dotProductVector[i]; + *P_out_ptr += P_dotProductVector[i]; + *L_out_ptr += L_dotProductVector[i]; + } + } + + lv_8sc_t bb_signal_sample; + for(int i=0; i < num_points%8; ++i) + { + //Perform the carrier wipe-off + bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); + // Now get early, late, and prompt values for each + *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); + *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); + *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); + } +} +#endif /* LV_HAVE_SSE2 */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation + \param input The input signal input + \param carrier The carrier signal input + \param E_code Early PRN code replica input + \param P_code Early PRN code replica input + \param L_code Early PRN code replica input + \param E_out Early correlation output + \param P_out Early correlation output + \param L_out Early correlation output + \param num_points The number of complex values in vectors + */ +static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_a_generic(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points) +{ + lv_8sc_t bb_signal_sample; + + bb_signal_sample = lv_cmake(0, 0); + + *E_out = 0; + *P_out = 0; + *L_out = 0; + // perform Early, Prompt and Late correlation + for(int i=0; i < num_points; ++i) + { + //Perform the carrier wipe-off + bb_signal_sample = input[i] * carrier[i]; + // Now get early, late, and prompt values for each + *E_out += (lv_32fc_t) (bb_signal_sample * E_code[i]); + *P_out += (lv_32fc_t) (bb_signal_sample * P_code[i]); + *L_out += (lv_32fc_t) (bb_signal_sample * L_code[i]); + } +} + +#endif /* LV_HAVE_GENERIC */ +#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_a_H */ \ No newline at end of file diff --git a/src/algorithms/libs/volk_gnsssdr/lib/CMakeLists.txt b/src/algorithms/libs/volk_gnsssdr/lib/CMakeLists.txt index 197b5ed45..286038cca 100644 --- a/src/algorithms/libs/volk_gnsssdr/lib/CMakeLists.txt +++ b/src/algorithms/libs/volk_gnsssdr/lib/CMakeLists.txt @@ -515,11 +515,13 @@ endif() #MODIFICATIONS BY GNSS-SDR file(GLOB orc ${CMAKE_SOURCE_DIR}/orc/*.orc) +file(GLOB CommonMacros ${CMAKE_SOURCE_DIR}/kernels/CommonMacros/*.h ${CMAKE_SOURCE_DIR}/kernels/CommonMacros/README.txt) #add_library(volk_gnsssdr SHARED ${volk_gnsssdr_sources}) -add_library(volk_gnsssdr SHARED ${volk_gnsssdr_sources} ${h_files} ${orc}) +add_library(volk_gnsssdr SHARED ${volk_gnsssdr_sources} ${h_files} ${CommonMacros} ${orc}) source_group("Kernels" FILES ${h_files}) +source_group("Common Macros" FILES ${CommonMacros}) source_group("ORC Files" FILES ${orc}) #END OF MODIFICATIONS diff --git a/src/algorithms/libs/volk_gnsssdr/lib/testqa.cc b/src/algorithms/libs/volk_gnsssdr/lib/testqa.cc index d5241ec1b..080bed581 100644 --- a/src/algorithms/libs/volk_gnsssdr/lib/testqa.cc +++ b/src/algorithms/libs/volk_gnsssdr/lib/testqa.cc @@ -24,34 +24,46 @@ #include #include -//GNSS-SDR PROTO-KERNELS +//VOLK PROTOKERNELS OBTAINED FROM THE GNURADIO BASE VOLK_RUN_TESTS(volk_gnsssdr_32fc_x2_multiply_32fc, 1e-4, 0, 20462, 1); +VOLK_RUN_TESTS(volk_gnsssdr_32fc_x2_dot_prod_32fc, 1e-4, 0, 204603, 1); +VOLK_RUN_TESTS(volk_gnsssdr_32fc_s32fc_multiply_32fc, 1e-4, 0, 20462, 1); +VOLK_RUN_TESTS(volk_gnsssdr_32fc_conjugate_32fc, 1e-4, 0, 20462, 1); +VOLK_RUN_TESTS(volk_gnsssdr_32f_x2_add_32f, 1e-4, 0, 20462, 1); +VOLK_RUN_TESTS(volk_gnsssdr_32f_index_max_16u, 3, 0, 20462, 1); +VOLK_RUN_TESTS(volk_gnsssdr_32f_accumulator_s32f, 1e-4, 0, 20462, 1); +VOLK_RUN_TESTS(volk_gnsssdr_32fc_magnitude_squared_32f, 1e-4, 0, 20462, 1); +VOLK_RUN_TESTS(volk_gnsssdr_32f_s32f_convert_16i, 3, 0, 20462, 1); + +//GNSS-SDR PROTO-KERNELS VOLK_RUN_TESTS(volk_gnsssdr_8ic_x2_multiply_8ic, 1e-4, 0, 20462, 1); VOLK_RUN_TESTS(volk_gnsssdr_8u_x2_multiply_8u, 1e-4, 0, 20462, 1); -VOLK_RUN_TESTS(volk_gnsssdr_32fc_x2_dot_prod_32fc, 1e-4, 0, 204603, 1); VOLK_RUN_TESTS(volk_gnsssdr_8ic_x2_dot_prod_8ic, 1e-4, 0, 204603, 1); -VOLK_RUN_TESTS(volk_gnsssdr_32fc_s32fc_multiply_32fc, 1e-4, 0, 20462, 1); VOLK_RUN_TESTS(volk_gnsssdr_8ic_s8ic_multiply_8ic, 1e-4, 0, 20462, 1); -VOLK_RUN_TESTS(volk_gnsssdr_32fc_conjugate_32fc, 1e-4, 0, 20462, 1); VOLK_RUN_TESTS(volk_gnsssdr_8ic_conjugate_8ic, 1e-4, 0, 20462, 1); -VOLK_RUN_TESTS(volk_gnsssdr_32f_x2_add_32f, 1e-4, 0, 20462, 1); VOLK_RUN_TESTS(volk_gnsssdr_8i_x2_add_8i, 1e-4, 0, 20462, 1); -VOLK_RUN_TESTS(volk_gnsssdr_32f_index_max_16u, 3, 0, 20462, 1); VOLK_RUN_TESTS(volk_gnsssdr_8i_index_max_16u, 3, 0, 20462, 1); -VOLK_RUN_TESTS(volk_gnsssdr_32f_accumulator_s32f, 1e-4, 0, 20462, 1); VOLK_RUN_TESTS(volk_gnsssdr_8i_accumulator_s8i, 1e-4, 0, 20462, 1); -VOLK_RUN_TESTS(volk_gnsssdr_32fc_magnitude_squared_32f, 1e-4, 0, 20462, 1); VOLK_RUN_TESTS(volk_gnsssdr_8ic_magnitude_squared_8i, 1e-4, 0, 20462, 1); +VOLK_RUN_TESTS(volk_gnsssdr_8i_max_s8i, 3, 0, 20462, 1); +VOLK_RUN_TESTS(volk_gnsssdr_64f_accumulator_64f, 3, 0, 20462, 1); + +VOLK_RUN_TESTS(volk_gnsssdr_32fc_convert_16ic, 3, 0, 20462, 1); +VOLK_RUN_TESTS(volk_gnsssdr_16i_s32f_convert_32f, 3, 0, 20462, 1); + VOLK_RUN_TESTS(volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3, 1e-4, 0, 20462, 1); VOLK_RUN_TESTS(volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3, 1e-4, 0, 20462, 1); - -VOLK_RUN_TESTS(volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5, 1e-4, 0, 20462, 1); +VOLK_RUN_TESTS(volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3, 1e-4, 0, 20462, 1); +VOLK_RUN_TESTS(volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3, 1e-4, 0, 20462, 1); VOLK_RUN_TESTS(volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3, 1e-4, 0, 20462, 1); -VOLK_RUN_TESTS(volk_gnsssdr_16i_s32f_convert_32f, 1e-4, 32768.0, 20462, 1); +VOLK_RUN_TESTS(volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5, 1e-4, 0, 20462, 1); +VOLK_RUN_TESTS(volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5, 1e-4, 0, 20462, 1); + + + -VOLK_RUN_TESTS(volk_gnsssdr_8i_max_s8i, 3, 0, 20462, 1); //VOLK_RUN_TESTS(volk_gnsssdr_16i_x5_add_quad_16i_x4, 1e-4, 2046, 10000); //VOLK_RUN_TESTS(volk_gnsssdr_16i_branch_4_state_8, 1e-4, 2046, 10000);