1
0
mirror of https://github.com/gnss-sdr/gnss-sdr synced 2025-01-18 21:23:02 +00:00

Many changes to volk_gnsssdr

1) Added special protokernels for tracking.
2) Added macros to allow code reutilization inside protokernels.
This commit is contained in:
Andrés Cecilia Luque 2014-09-22 14:41:53 +02:00
parent e1b84e31f2
commit c12de026cf
14 changed files with 4914 additions and 742 deletions

View File

@ -105,12 +105,19 @@ int main(int argc, char *argv[]) {
//GNSS-SDR PROTO-KERNELS //GNSS-SDR PROTO-KERNELS
//lv_32fc_t sfv = lv_cmake((float)1, (float)2); //lv_32fc_t sfv = lv_cmake((float)1, (float)2);
//VOLK_PROFILE(volk_gnsssdr_8ic_s8ic_multiply_8ic, 1e-4, sfv, 204602, 1000, &results, benchmark_mode, kernel_regex); //example: VOLK_PROFILE(volk_gnsssdr_8ic_s8ic_multiply_8ic, 1e-4, sfv, 204602, 1000, &results, benchmark_mode, kernel_regex);
VOLK_PROFILE(volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3, 1e-4, 0, 204602, 250, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3, 1e-4, 0, 204602, 250, &results, benchmark_mode, kernel_regex);
VOLK_PROFILE(volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5, 1e-4, 0, 204602, 250, &results, benchmark_mode, kernel_regex);
VOLK_PROFILE(volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5, 1e-4, 0, 204602, 250, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5, 1e-4, 0, 204602, 250, &results, benchmark_mode, kernel_regex);
VOLK_PROFILE(volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3, 1e-4, 0, 204602, 250, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3, 1e-4, 0, 204602, 250, &results, benchmark_mode, kernel_regex);
VOLK_PROFILE(volk_gnsssdr_16i_s32f_convert_32f, 1e-4, 32768.0, 204602, 10000, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3, 1e-4, 0, 204602, 250, &results, benchmark_mode, kernel_regex);
VOLK_PROFILE(volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3, 1e-4, 0, 204602, 250, &results, benchmark_mode, kernel_regex);
VOLK_PROFILE(volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3, 1e-4, 0, 204602, 250, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3, 1e-4, 0, 204602, 250, &results, benchmark_mode, kernel_regex);
VOLK_PROFILE(volk_gnsssdr_32fc_convert_16ic, 1e-4, 0, 204602, 250, &results, benchmark_mode, kernel_regex);
VOLK_PROFILE(volk_gnsssdr_32f_accumulator_s32f, 1e-4, 0, 204602, 10000, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_gnsssdr_32f_accumulator_s32f, 1e-4, 0, 204602, 10000, &results, benchmark_mode, kernel_regex);
VOLK_PROFILE(volk_gnsssdr_8i_accumulator_s8i, 1e-4, 0, 204602, 10000, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_gnsssdr_8i_accumulator_s8i, 1e-4, 0, 204602, 10000, &results, benchmark_mode, kernel_regex);
VOLK_PROFILE(volk_gnsssdr_32f_index_max_16u, 3, 0, 204602, 5000, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_gnsssdr_32f_index_max_16u, 3, 0, 204602, 5000, &results, benchmark_mode, kernel_regex);
@ -129,6 +136,9 @@ int main(int argc, char *argv[]) {
VOLK_PROFILE(volk_gnsssdr_32fc_x2_multiply_32fc, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_gnsssdr_32fc_x2_multiply_32fc, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
VOLK_PROFILE(volk_gnsssdr_8ic_x2_multiply_8ic, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_gnsssdr_8ic_x2_multiply_8ic, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
VOLK_PROFILE(volk_gnsssdr_8u_x2_multiply_8u, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_gnsssdr_8u_x2_multiply_8u, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
VOLK_PROFILE(volk_gnsssdr_64f_accumulator_64f, 1e-4, 0, 16000, 1000, &results, benchmark_mode, kernel_regex);
VOLK_PROFILE(volk_gnsssdr_32f_s32f_convert_16i, 1e-4, 1, 204602, 250, &results, benchmark_mode, kernel_regex);
VOLK_PROFILE(volk_gnsssdr_16i_s32f_convert_32f, 1e-4, 1, 204602, 250, &results, benchmark_mode, kernel_regex);
if(store_results) { if(store_results) {
char path[1024]; char path[1024];
volk_gnsssdr_get_config_path(path); volk_gnsssdr_get_config_path(path);

View File

@ -0,0 +1,108 @@
/*!
* \file CommonMacros.h
* \brief Common macros used inside the volk protokernels.
* \authors <ul>
* <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
* </ul>
*
* -------------------------------------------------------------------------
*
* Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
*
* GNSS-SDR is a software defined Global Navigation
* Satellite Systems receiver
*
* This file is part of GNSS-SDR.
*
* GNSS-SDR is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* at your option) any later version.
*
* GNSS-SDR is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
*
* -------------------------------------------------------------------------
*/
#ifndef INCLUDED_gnsssdr_CommonMacros_u_H
#define INCLUDED_gnsssdr_CommonMacros_u_H
#ifdef LV_HAVE_SSE4_1
/*!
\brief Macros for U_SSE4_1
*/
#ifndef CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1
#define CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(input1, input2, real, imag)\
imag = _mm_srli_si128 (input1, 2);\
imag = _mm_blend_epi16 (input2, imag, 85);\
real = _mm_slli_si128 (input2, 2);\
real = _mm_blend_epi16 (real, input1, 85);
#endif /* CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1 */
#ifndef CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1
#define CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(input, input_i_1, input_i_2, output_i32, output_ps)\
input_i_1 = _mm_cvtepi16_epi32(input);\
input = _mm_srli_si128 (input, 8);\
input_i_2 = _mm_cvtepi16_epi32(input);\
output_i32 = _mm_add_epi32 (input_i_1, input_i_2);\
output_ps = _mm_cvtepi32_ps(output_i32);
#endif /* CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1 */
#endif /* LV_HAVE_SSE4_1 */
#ifdef LV_HAVE_SSE2
/*!
\brief Macros for U_SSE2
*/
#ifndef CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2
#define CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output)\
realx_mult_realy = _mm_mullo_epi16 (realx, realy);\
imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);\
realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);\
imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);\
real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);\
imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
#endif /* CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2 */
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_GENERIC
/*!
\brief Macros for U_GENERIC
*/
#endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_gnsssdr_CommonMacros_u_H */
#ifndef INCLUDED_gnsssdr_CommonMacros_a_H
#define INCLUDED_gnsssdr_CommonMacros_a_H
#ifdef LV_HAVE_SSE4_1
/*!
\brief Macros for A_SSE4_1
*/
#endif /* LV_HAVE_SSE4_1 */
#ifdef LV_HAVE_SSE2
/*!
\brief Macros for U_SSE2
*/
#endif /* LV_HAVE_SSE2 */
#ifdef LV_HAVE_GENERIC
/*!
\brief Macros for A_GENERIC
*/
#endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_gnsssdr_CommonMacros_a_H */

View File

@ -0,0 +1,76 @@
/*!
* \file CommonMacros_16ic_cw_corr_32fc.h
* \brief Common macros used inside the 16ic_cw_corr_32fc volk protokernels.
* \authors <ul>
* <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
* </ul>
*
* -------------------------------------------------------------------------
*
* Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
*
* GNSS-SDR is a software defined Global Navigation
* Satellite Systems receiver
*
* This file is part of GNSS-SDR.
*
* GNSS-SDR is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* at your option) any later version.
*
* GNSS-SDR is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
*
* -------------------------------------------------------------------------
*/
#ifndef INCLUDED_gnsssdr_CommonMacros_16ic_cw_corr_32fc_u_H
#define INCLUDED_gnsssdr_CommonMacros_16ic_cw_corr_32fc_u_H
#include "CommonMacros/CommonMacros.h"
#ifdef LV_HAVE_SSE4_1
/*!
\brief Macros for U_SSE4_1
*/
#ifndef CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1
#define CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)\
CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy)\
CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(real_bb_signal_sample, imag_bb_signal_sample, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output)\
CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(real_output, input_i_1, input_i_2, output_i32, real_output_ps)\
CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(imag_output, input_i_1, input_i_2, output_i32, imag_output_ps)
#endif /* CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1 */
#endif /* LV_HAVE_SSE4_1 */
#ifdef LV_HAVE_GENERIC
/*!
\brief Macros for U_GENERIC
*/
#endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_gnsssdr_CommonMacros_16ic_cw_corr_32fc_u_H */
#ifndef INCLUDED_gnsssdr_CommonMacros_16ic_cw_corr_32fc_a_H
#define INCLUDED_gnsssdr_CommonMacros_16ic_cw_corr_32fc_a_H
#ifdef LV_HAVE_SSE4_1
/*!
\brief Macros for A_SSE4_1
*/
#endif /* LV_HAVE_SSE4_1 */
#ifdef LV_HAVE_GENERIC
/*!
\brief Macros for A_GENERIC
*/
#endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_gnsssdr_CommonMacros_16ic_cw_corr_32fc_a_H */

View File

@ -0,0 +1,34 @@
####################################################################
Common Macros inside volk_gnsssdr module
####################################################################
First of all, sorry for making you need to read this: macros are evil, they can not be debugged, you do not know where the errors come from, syntax is annoying.. BUT this is the only way I found that allows to share one piece of code between various proto-kernels without performance penalties.
Inline functions have been tested, and they introduce a really small time penalty, but it becomes huge because of long loops, with thousands of samples.
####################################################################
Syntax
####################################################################
In order to allow better understanding of the code I created the macros with an specific syntax.
1) Inside CommonMacros.h you will find macros for common operations. I will explain the syntax with an example:
example: CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output)
First of all, you find the characters “CM”, which means CommonMacros. After that the type and the amount of inputs is placed: “_16IC_X4” (16 bits complex integers, four inputs). The syntax for type is the same as the one used with volk protokernels, refer to GNURadio documentation for more help. The it comes the name of the macro (“_SCALAR_PRODUCT”), and after that the type and the amount of outputs (“_16IC_X2”). Finally it is placed the SSE minimum version needed to run (“_U_SSE2”). In the arguments you will find (from left to right) the inputs (four inputs: realx, imagx, realy, imagy), some variables that the macro needs to work (realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy) and finally the outputs (two outputs: real_output, imag_output).
The variables that the macro needs are specified when calling it in order to avoid after-compile problems: if you want to use a macro you will need to declare all the variables it needs before, or you will not be able to compile.
2) Inside all the other headers, CommonMacros_XXXXXX.h you will find macros for a specific group of proto-kernels. The syntax is the same as the CommonMacros.h
####################################################################
Workflow
####################################################################
In order to use the macros easily, I usually test the code without macros inside a test proto-kernel, where you are able to test it, debug it and use breakpoints.
When it works I place it inside a macro an test it again.
####################################################################
Why macros
####################################################################
1) They are the only way I could find for sharing code between proto-kernels without performance penalty.
2) It is true that they are really difficult to debug, but if you work with them responsibly it is not so hard. Volk_gnsssdr checks all the SSE proto-kernels implementations results against the generic implementation results, so if your macro is not working you will appreciate it after profiling it.

View File

@ -0,0 +1,461 @@
/*!
* \file volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3.h
* \brief Volk protokernel: performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation with 32 bits vectors
* \authors <ul>
* <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
* </ul>
*
* Volk protokernel that performs the carrier wipe-off mixing and the
* Early, Prompt, and Late correlation with 32 bits vectors (16 bits the
* real part and 16 bits the imaginary part):
* - The carrier wipe-off is done by multiplying the input signal by the
* carrier (multiplication of 32 bits vectors) It returns the input
* signal in base band (BB)
* - Early values are calculated by multiplying the input signal in BB by the
* early code (multiplication of 32 bits vectors), accumulating the results
* - Prompt values are calculated by multiplying the input signal in BB by the
* prompt code (multiplication of 32 bits vectors), accumulating the results
* - Late values are calculated by multiplying the input signal in BB by the
* late code (multiplication of 32 bits vectors), accumulating the results
*
* -------------------------------------------------------------------------
*
* Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
*
* GNSS-SDR is a software defined Global Navigation
* Satellite Systems receiver
*
* This file is part of GNSS-SDR.
*
* GNSS-SDR is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* at your option) any later version.
*
* GNSS-SDR is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
*
* -------------------------------------------------------------------------
*/
#ifndef INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_u_H
#define INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_u_H
#include <inttypes.h>
#include <stdio.h>
#include <volk_gnsssdr/volk_gnsssdr_complex.h>
#include <float.h>
#include <string.h>
#ifdef LV_HAVE_SSE4_1
#include "smmintrin.h"
#include "CommonMacros/CommonMacros_16ic_cw_epl_corr_32fc.h"
#include "CommonMacros/CommonMacros.h"
/*!
\brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation
\param input The input signal input
\param carrier The carrier signal input
\param E_code Early PRN code replica input
\param P_code Early PRN code replica input
\param L_code Early PRN code replica input
\param E_out Early correlation output
\param P_out Early correlation output
\param L_out Early correlation output
\param num_points The number of complex values in vectors
*/
static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_u_sse4_1(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points)
{
const unsigned int sse_iters = num_points / 8;
__m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample;
__m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output;
__m128 real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc;
__m128i input_i_1, input_i_2, output_i32;
__m128 real_output_ps, imag_output_ps;
float E_out_real = 0;
float E_out_imag = 0;
float P_out_real = 0;
float P_out_imag = 0;
float L_out_real = 0;
float L_out_imag = 0;
const lv_16sc_t* input_ptr = input;
const lv_16sc_t* carrier_ptr = carrier;
const lv_16sc_t* E_code_ptr = E_code;
lv_32fc_t* E_out_ptr = E_out;
const lv_16sc_t* L_code_ptr = L_code;
lv_32fc_t* L_out_ptr = L_out;
const lv_16sc_t* P_code_ptr = P_code;
lv_32fc_t* P_out_ptr = P_out;
*E_out_ptr = 0;
*P_out_ptr = 0;
*L_out_ptr = 0;
mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
real_E_code_acc = _mm_setzero_ps();
imag_E_code_acc = _mm_setzero_ps();
real_P_code_acc = _mm_setzero_ps();
imag_P_code_acc = _mm_setzero_ps();
real_L_code_acc = _mm_setzero_ps();
imag_L_code_acc = _mm_setzero_ps();
if (sse_iters>0)
{
for(int number = 0;number < sse_iters; number++){
//Perform the carrier wipe-off
x1 = _mm_lddqu_si128((__m128i*)input_ptr);
input_ptr += 4;
x2 = _mm_lddqu_si128((__m128i*)input_ptr);
y1 = _mm_lddqu_si128((__m128i*)carrier_ptr);
carrier_ptr += 4;
y2 = _mm_lddqu_si128((__m128i*)carrier_ptr);
CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(x1, x2, realx, imagx)
CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy)
CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
//Get early values
y1 = _mm_lddqu_si128((__m128i*)E_code_ptr);
E_code_ptr += 4;
y2 = _mm_lddqu_si128((__m128i*)E_code_ptr);
CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
//Adds the float 32 results
real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps);
imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps);
//Get prompt values
y1 = _mm_lddqu_si128((__m128i*)P_code_ptr);
P_code_ptr += 4;
y2 = _mm_lddqu_si128((__m128i*)P_code_ptr);
CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps);
imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps);
//Get late values
y1 = _mm_lddqu_si128((__m128i*)L_code_ptr);
L_code_ptr += 4;
y2 = _mm_lddqu_si128((__m128i*)L_code_ptr);
CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps);
imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps);
input_ptr += 4;
carrier_ptr += 4;
E_code_ptr += 4;
P_code_ptr += 4;
L_code_ptr += 4;
}
__VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4];
_mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector
for (int i = 0; i<4; ++i)
{
E_out_real += real_E_dotProductVector[i];
E_out_imag += imag_E_dotProductVector[i];
P_out_real += real_P_dotProductVector[i];
P_out_imag += imag_P_dotProductVector[i];
L_out_real += real_L_dotProductVector[i];
L_out_imag += imag_L_dotProductVector[i];
}
*E_out_ptr = lv_cmake(E_out_real, E_out_imag);
*P_out_ptr = lv_cmake(P_out_real, P_out_imag);
*L_out_ptr = lv_cmake(L_out_real, L_out_imag);
}
lv_16sc_t bb_signal_sample;
for(int i=0; i < num_points%8; ++i)
{
//Perform the carrier wipe-off
bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
// Now get early, late, and prompt values for each
*E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
*P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
*L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
}
}
#endif /* LV_HAVE_SSE4_1 */
#ifdef LV_HAVE_GENERIC
/*!
\brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation
\param input The input signal input
\param carrier The carrier signal input
\param E_code Early PRN code replica input
\param P_code Early PRN code replica input
\param L_code Early PRN code replica input
\param E_out Early correlation output
\param P_out Early correlation output
\param L_out Early correlation output
\param num_points The number of complex values in vectors
*/
static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_generic(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points)
{
lv_16sc_t bb_signal_sample;
lv_16sc_t tmp1;
lv_16sc_t tmp2;
lv_16sc_t tmp3;
bb_signal_sample = lv_cmake(0, 0);
*E_out = 0;
*P_out = 0;
*L_out = 0;
// perform Early, Prompt and Late correlation
for(int i=0; i < num_points; ++i)
{
//Perform the carrier wipe-off
bb_signal_sample = input[i] * carrier[i];
tmp1 = bb_signal_sample * E_code[i];
tmp2 = bb_signal_sample * P_code[i];
tmp3 = bb_signal_sample * L_code[i];
// Now get early, late, and prompt values for each
*E_out += (lv_32fc_t)tmp1;
*P_out += (lv_32fc_t)tmp2;
*L_out += (lv_32fc_t)tmp3;
}
}
#endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_u_H */
#ifndef INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_a_H
#define INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_a_H
#include <inttypes.h>
#include <stdio.h>
#include <volk_gnsssdr/volk_gnsssdr_complex.h>
#include <float.h>
#include <string.h>
#ifdef LV_HAVE_SSE4_1
#include "smmintrin.h"
#include "CommonMacros/CommonMacros_16ic_cw_epl_corr_32fc.h"
#include "CommonMacros/CommonMacros.h"
/*!
\brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation
\param input The input signal input
\param carrier The carrier signal input
\param E_code Early PRN code replica input
\param P_code Early PRN code replica input
\param L_code Early PRN code replica input
\param E_out Early correlation output
\param P_out Early correlation output
\param L_out Early correlation output
\param num_points The number of complex values in vectors
*/
static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_a_sse4_1(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points)
{
const unsigned int sse_iters = num_points / 8;
__m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample;
__m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output;
__m128 real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc;
__m128i input_i_1, input_i_2, output_i32;
__m128 real_output_ps, imag_output_ps;
float E_out_real = 0;
float E_out_imag = 0;
float P_out_real = 0;
float P_out_imag = 0;
float L_out_real = 0;
float L_out_imag = 0;
const lv_16sc_t* input_ptr = input;
const lv_16sc_t* carrier_ptr = carrier;
const lv_16sc_t* E_code_ptr = E_code;
lv_32fc_t* E_out_ptr = E_out;
const lv_16sc_t* L_code_ptr = L_code;
lv_32fc_t* L_out_ptr = L_out;
const lv_16sc_t* P_code_ptr = P_code;
lv_32fc_t* P_out_ptr = P_out;
*E_out_ptr = 0;
*P_out_ptr = 0;
*L_out_ptr = 0;
mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
real_E_code_acc = _mm_setzero_ps();
imag_E_code_acc = _mm_setzero_ps();
real_P_code_acc = _mm_setzero_ps();
imag_P_code_acc = _mm_setzero_ps();
real_L_code_acc = _mm_setzero_ps();
imag_L_code_acc = _mm_setzero_ps();
if (sse_iters>0)
{
for(int number = 0;number < sse_iters; number++){
//Perform the carrier wipe-off
x1 = _mm_load_si128((__m128i*)input_ptr);
input_ptr += 4;
x2 = _mm_load_si128((__m128i*)input_ptr);
y1 = _mm_load_si128((__m128i*)carrier_ptr);
carrier_ptr += 4;
y2 = _mm_load_si128((__m128i*)carrier_ptr);
CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(x1, x2, realx, imagx)
CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy)
CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
//Get early values
y1 = _mm_load_si128((__m128i*)E_code_ptr);
E_code_ptr += 4;
y2 = _mm_load_si128((__m128i*)E_code_ptr);
CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
//Adds the float 32 results
real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps);
imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps);
//Get prompt values
y1 = _mm_load_si128((__m128i*)P_code_ptr);
P_code_ptr += 4;
y2 = _mm_load_si128((__m128i*)P_code_ptr);
CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps);
imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps);
//Get late values
y1 = _mm_load_si128((__m128i*)L_code_ptr);
L_code_ptr += 4;
y2 = _mm_load_si128((__m128i*)L_code_ptr);
CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps);
imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps);
input_ptr += 4;
carrier_ptr += 4;
E_code_ptr += 4;
P_code_ptr += 4;
L_code_ptr += 4;
}
__VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4];
_mm_store_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector
for (int i = 0; i<4; ++i)
{
E_out_real += real_E_dotProductVector[i];
E_out_imag += imag_E_dotProductVector[i];
P_out_real += real_P_dotProductVector[i];
P_out_imag += imag_P_dotProductVector[i];
L_out_real += real_L_dotProductVector[i];
L_out_imag += imag_L_dotProductVector[i];
}
*E_out_ptr = lv_cmake(E_out_real, E_out_imag);
*P_out_ptr = lv_cmake(P_out_real, P_out_imag);
*L_out_ptr = lv_cmake(L_out_real, L_out_imag);
}
lv_16sc_t bb_signal_sample;
for(int i=0; i < num_points%8; ++i)
{
//Perform the carrier wipe-off
bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
// Now get early, late, and prompt values for each
*E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
*P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
*L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
}
}
#endif /* LV_HAVE_SSE4_1 */
#ifdef LV_HAVE_GENERIC
/*!
\brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation
\param input The input signal input
\param carrier The carrier signal input
\param E_code Early PRN code replica input
\param P_code Early PRN code replica input
\param L_code Early PRN code replica input
\param E_out Early correlation output
\param P_out Early correlation output
\param L_out Early correlation output
\param num_points The number of complex values in vectors
*/
static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_a_generic(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points)
{
lv_16sc_t bb_signal_sample;
lv_16sc_t tmp1;
lv_16sc_t tmp2;
lv_16sc_t tmp3;
bb_signal_sample = lv_cmake(0, 0);
*E_out = 0;
*P_out = 0;
*L_out = 0;
// perform Early, Prompt and Late correlation
for(int i=0; i < num_points; ++i)
{
//Perform the carrier wipe-off
bb_signal_sample = input[i] * carrier[i];
tmp1 = bb_signal_sample * E_code[i];
tmp2 = bb_signal_sample * P_code[i];
tmp3 = bb_signal_sample * L_code[i];
// Now get early, late, and prompt values for each
*E_out += (lv_32fc_t)tmp1;
*P_out += (lv_32fc_t)tmp2;
*L_out += (lv_32fc_t)tmp3;
}
}
#endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_a_H */

View File

@ -0,0 +1,595 @@
/*!
* \file volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5.h
* \brief Volk protokernel: performs the carrier wipe-off mixing and the Very early, Early, Prompt, Late and very late correlation with 32 bits vectors and returns float32 values.
* \authors <ul>
* <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
* </ul>
*
* Volk protokernel that performs the carrier wipe-off mixing and the
* Very Early, Early, Prompt, Late and Very Late correlation with 32 bits vectors (16 bits the
* real part and 16 bits the imaginary part) and accumulates into float32 values, returning them:
* - The carrier wipe-off is done by multiplying the input signal by the
* carrier (multiplication of 32 bits vectors) It returns the input
* signal in base band (BB)
* - Very Early values are calculated by multiplying the input signal in BB by the
* very early code (multiplication of 32 bits vectors), converting that to float32 and accumulating the results
* - Early values are calculated by multiplying the input signal in BB by the
* early code (multiplication of 32 bits vectors), converting that to float32 and accumulating the results
* - Prompt values are calculated by multiplying the input signal in BB by the
* prompt code (multiplication of 32 bits vectors), converting that to float32 and accumulating the results
* - Late values are calculated by multiplying the input signal in BB by the
* late code (multiplication of 32 bits vectors), converting that to float32 and accumulating the results
* - Very Late values are calculated by multiplying the input signal in BB by the
* very late code (multiplication of 32 bits vectors), converting that to float32 and accumulating the results
*
* -------------------------------------------------------------------------
*
* Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
*
* GNSS-SDR is a software defined Global Navigation
* Satellite Systems receiver
*
* This file is part of GNSS-SDR.
*
* GNSS-SDR is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* at your option) any later version.
*
* GNSS-SDR is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
*
* -------------------------------------------------------------------------
*/
#ifndef INCLUDED_gnsssdr_volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_u_H
#define INCLUDED_gnsssdr_volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_u_H
#include <inttypes.h>
#include <stdio.h>
#include <volk_gnsssdr/volk_gnsssdr_complex.h>
#include <float.h>
#include <string.h>
#ifdef LV_HAVE_SSE4_1
#include "smmintrin.h"
#include "CommonMacros/CommonMacros_16ic_cw_epl_corr_32fc.h"
#include "CommonMacros/CommonMacros.h"
/*!
\brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Vate correlation
\param input The input signal input
\param carrier The carrier signal input
\param VE_code Very Early PRN code replica input
\param E_code Early PRN code replica input
\param P_code Prompt PRN code replica input
\param L_code Late PRN code replica input
\param VL_code Very Late PRN code replica input
\param VE_out Very Early correlation output
\param E_out Early correlation output
\param P_out Prompt correlation output
\param L_out Late correlation output
\param VL_out Very Late correlation output
\param num_points The number of complex values in vectors
*/
static inline void volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_u_sse4_1(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* VE_code, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, const lv_16sc_t* VL_code, unsigned int num_points)
{
const unsigned int sse_iters = num_points / 8;
__m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample;
__m128i realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output;
__m128 real_VE_code_acc, imag_VE_code_acc, real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc, real_VL_code_acc, imag_VL_code_acc;
__m128i input_i_1, input_i_2, output_i32;
__m128 real_output_ps, imag_output_ps;
float VE_out_real = 0;
float VE_out_imag = 0;
float E_out_real = 0;
float E_out_imag = 0;
float P_out_real = 0;
float P_out_imag = 0;
float L_out_real = 0;
float L_out_imag = 0;
float VL_out_real = 0;
float VL_out_imag = 0;
const lv_16sc_t* input_ptr = input;
const lv_16sc_t* carrier_ptr = carrier;
const lv_16sc_t* VE_code_ptr = VE_code;
lv_32fc_t* VE_out_ptr = VE_out;
const lv_16sc_t* E_code_ptr = E_code;
lv_32fc_t* E_out_ptr = E_out;
const lv_16sc_t* L_code_ptr = L_code;
lv_32fc_t* L_out_ptr = L_out;
const lv_16sc_t* P_code_ptr = P_code;
lv_32fc_t* P_out_ptr = P_out;
const lv_16sc_t* VL_code_ptr = VL_code;
lv_32fc_t* VL_out_ptr = VL_out;
*VE_out_ptr = 0;
*E_out_ptr = 0;
*P_out_ptr = 0;
*L_out_ptr = 0;
*VL_out_ptr = 0;
real_VE_code_acc = _mm_setzero_ps();
imag_VE_code_acc = _mm_setzero_ps();
real_E_code_acc = _mm_setzero_ps();
imag_E_code_acc = _mm_setzero_ps();
real_P_code_acc = _mm_setzero_ps();
imag_P_code_acc = _mm_setzero_ps();
real_L_code_acc = _mm_setzero_ps();
imag_L_code_acc = _mm_setzero_ps();
real_VL_code_acc = _mm_setzero_ps();
imag_VL_code_acc = _mm_setzero_ps();
if (sse_iters>0)
{
for(int number = 0;number < sse_iters; number++){
//Perform the carrier wipe-off
x1 = _mm_lddqu_si128((__m128i*)input_ptr);
input_ptr += 4;
x2 = _mm_lddqu_si128((__m128i*)input_ptr);
y1 = _mm_lddqu_si128((__m128i*)carrier_ptr);
carrier_ptr += 4;
y2 = _mm_lddqu_si128((__m128i*)carrier_ptr);
CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(x1, x2, realx, imagx)
CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy)
CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
//Get very early values
y1 = _mm_lddqu_si128((__m128i*)VE_code_ptr);
VE_code_ptr += 4;
y2 = _mm_lddqu_si128((__m128i*)VE_code_ptr);
CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
real_VE_code_acc = _mm_add_ps (real_VE_code_acc, real_output_ps);
imag_VE_code_acc = _mm_add_ps (imag_VE_code_acc, imag_output_ps);
//Get early values
y1 = _mm_lddqu_si128((__m128i*)E_code_ptr);
E_code_ptr += 4;
y2 = _mm_lddqu_si128((__m128i*)E_code_ptr);
CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps);
imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps);
//Get prompt values
y1 = _mm_lddqu_si128((__m128i*)P_code_ptr);
P_code_ptr += 4;
y2 = _mm_lddqu_si128((__m128i*)P_code_ptr);
CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps);
imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps);
//Get late values
y1 = _mm_lddqu_si128((__m128i*)L_code_ptr);
L_code_ptr += 4;
y2 = _mm_lddqu_si128((__m128i*)L_code_ptr);
CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps);
imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps);
//Get very late values
y1 = _mm_lddqu_si128((__m128i*)VL_code_ptr);
VL_code_ptr += 4;
y2 = _mm_lddqu_si128((__m128i*)VL_code_ptr);
CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
real_VL_code_acc = _mm_add_ps (real_VL_code_acc, real_output_ps);
imag_VL_code_acc = _mm_add_ps (imag_VL_code_acc, imag_output_ps);
input_ptr += 4;
carrier_ptr += 4;
VE_code_ptr += 4;
E_code_ptr += 4;
P_code_ptr += 4;
L_code_ptr += 4;
VL_code_ptr += 4;
}
__VOLK_ATTR_ALIGNED(16) float real_VE_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_VE_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float real_VL_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_VL_dotProductVector[4];
_mm_storeu_ps((float*)real_VE_dotProductVector,real_VE_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)imag_VE_dotProductVector,imag_VE_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)real_VL_dotProductVector,real_VL_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)imag_VL_dotProductVector,imag_VL_code_acc); // Store the results back into the dot product vector
for (int i = 0; i<4; ++i)
{
VE_out_real += real_VE_dotProductVector[i];
VE_out_imag += imag_VE_dotProductVector[i];
E_out_real += real_E_dotProductVector[i];
E_out_imag += imag_E_dotProductVector[i];
P_out_real += real_P_dotProductVector[i];
P_out_imag += imag_P_dotProductVector[i];
L_out_real += real_L_dotProductVector[i];
L_out_imag += imag_L_dotProductVector[i];
VL_out_real += real_VL_dotProductVector[i];
VL_out_imag += imag_VL_dotProductVector[i];
}
*VE_out_ptr = lv_cmake(VE_out_real, VE_out_imag);
*E_out_ptr = lv_cmake(E_out_real, E_out_imag);
*P_out_ptr = lv_cmake(P_out_real, P_out_imag);
*L_out_ptr = lv_cmake(L_out_real, L_out_imag);
*VL_out_ptr = lv_cmake(VL_out_real, VL_out_imag);
}
lv_16sc_t bb_signal_sample;
for(int i=0; i < num_points%8; ++i)
{
//Perform the carrier wipe-off
bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
// Now get early, late, and prompt values for each
*VE_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VE_code_ptr++));
*E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
*P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
*L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
*VL_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VL_code_ptr++));
}
}
#endif /* LV_HAVE_SSE4_1 */
#ifdef LV_HAVE_GENERIC
/*!
\brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Vate correlation
\param input The input signal input
\param carrier The carrier signal input
\param VE_code Very Early PRN code replica input
\param E_code Early PRN code replica input
\param P_code Prompt PRN code replica input
\param L_code Late PRN code replica input
\param VL_code Very Late PRN code replica input
\param VE_out Very Early correlation output
\param E_out Early correlation output
\param P_out Prompt correlation output
\param L_out Late correlation output
\param VL_out Very Late correlation output
\param num_points The number of complex values in vectors
*/
static inline void volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* VE_code, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, const lv_16sc_t* VL_code, unsigned int num_points)
{
lv_16sc_t bb_signal_sample;
lv_16sc_t tmp1;
lv_16sc_t tmp2;
lv_16sc_t tmp3;
lv_16sc_t tmp4;
lv_16sc_t tmp5;
bb_signal_sample = lv_cmake(0, 0);
*VE_out = 0;
*E_out = 0;
*P_out = 0;
*L_out = 0;
*VL_out = 0;
// perform Early, Prompt and Late correlation
for(int i=0; i < num_points; ++i)
{
//Perform the carrier wipe-off
bb_signal_sample = input[i] * carrier[i];
tmp1 = bb_signal_sample * VE_code[i];
tmp2 = bb_signal_sample * E_code[i];
tmp3 = bb_signal_sample * P_code[i];
tmp4 = bb_signal_sample * L_code[i];
tmp5 = bb_signal_sample * VL_code[i];
// Now get early, late, and prompt values for each
*VE_out += (lv_32fc_t)tmp1;
*E_out += (lv_32fc_t)tmp2;
*P_out += (lv_32fc_t)tmp3;
*L_out += (lv_32fc_t)tmp4;
*VL_out += (lv_32fc_t)tmp5;
}
}
#endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_gnsssdr_volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_u_H */
#ifndef INCLUDED_gnsssdr_volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_a_H
#define INCLUDED_gnsssdr_volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_a_H
#include <inttypes.h>
#include <stdio.h>
#include <volk_gnsssdr/volk_gnsssdr_complex.h>
#include <float.h>
#include <string.h>
#ifdef LV_HAVE_SSE4_1
#include "smmintrin.h"
#include "CommonMacros/CommonMacros_16ic_cw_epl_corr_32fc.h"
#include "CommonMacros/CommonMacros.h"
/*!
\brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Vate correlation
\param input The input signal input
\param carrier The carrier signal input
\param VE_code Very Early PRN code replica input
\param E_code Early PRN code replica input
\param P_code Prompt PRN code replica input
\param L_code Late PRN code replica input
\param VL_code Very Late PRN code replica input
\param VE_out Very Early correlation output
\param E_out Early correlation output
\param P_out Prompt correlation output
\param L_out Late correlation output
\param VL_out Very Late correlation output
\param num_points The number of complex values in vectors
*/
static inline void volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_a_sse4_1(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* VE_code, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, const lv_16sc_t* VL_code, unsigned int num_points)
{
const unsigned int sse_iters = num_points / 8;
__m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample;
__m128i realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output;
__m128 real_VE_code_acc, imag_VE_code_acc, real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc, real_VL_code_acc, imag_VL_code_acc;
__m128i input_i_1, input_i_2, output_i32;
__m128 real_output_ps, imag_output_ps;
float VE_out_real = 0;
float VE_out_imag = 0;
float E_out_real = 0;
float E_out_imag = 0;
float P_out_real = 0;
float P_out_imag = 0;
float L_out_real = 0;
float L_out_imag = 0;
float VL_out_real = 0;
float VL_out_imag = 0;
const lv_16sc_t* input_ptr = input;
const lv_16sc_t* carrier_ptr = carrier;
const lv_16sc_t* VE_code_ptr = VE_code;
lv_32fc_t* VE_out_ptr = VE_out;
const lv_16sc_t* E_code_ptr = E_code;
lv_32fc_t* E_out_ptr = E_out;
const lv_16sc_t* L_code_ptr = L_code;
lv_32fc_t* L_out_ptr = L_out;
const lv_16sc_t* P_code_ptr = P_code;
lv_32fc_t* P_out_ptr = P_out;
const lv_16sc_t* VL_code_ptr = VL_code;
lv_32fc_t* VL_out_ptr = VL_out;
*VE_out_ptr = 0;
*E_out_ptr = 0;
*P_out_ptr = 0;
*L_out_ptr = 0;
*VL_out_ptr = 0;
real_VE_code_acc = _mm_setzero_ps();
imag_VE_code_acc = _mm_setzero_ps();
real_E_code_acc = _mm_setzero_ps();
imag_E_code_acc = _mm_setzero_ps();
real_P_code_acc = _mm_setzero_ps();
imag_P_code_acc = _mm_setzero_ps();
real_L_code_acc = _mm_setzero_ps();
imag_L_code_acc = _mm_setzero_ps();
real_VL_code_acc = _mm_setzero_ps();
imag_VL_code_acc = _mm_setzero_ps();
if (sse_iters>0)
{
for(int number = 0;number < sse_iters; number++){
//Perform the carrier wipe-off
x1 = _mm_load_si128((__m128i*)input_ptr);
input_ptr += 4;
x2 = _mm_load_si128((__m128i*)input_ptr);
y1 = _mm_load_si128((__m128i*)carrier_ptr);
carrier_ptr += 4;
y2 = _mm_load_si128((__m128i*)carrier_ptr);
CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(x1, x2, realx, imagx)
CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy)
CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
//Get very early values
y1 = _mm_load_si128((__m128i*)VE_code_ptr);
VE_code_ptr += 4;
y2 = _mm_load_si128((__m128i*)VE_code_ptr);
CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
real_VE_code_acc = _mm_add_ps (real_VE_code_acc, real_output_ps);
imag_VE_code_acc = _mm_add_ps (imag_VE_code_acc, imag_output_ps);
//Get early values
y1 = _mm_load_si128((__m128i*)E_code_ptr);
E_code_ptr += 4;
y2 = _mm_load_si128((__m128i*)E_code_ptr);
CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps);
imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps);
//Get prompt values
y1 = _mm_load_si128((__m128i*)P_code_ptr);
P_code_ptr += 4;
y2 = _mm_load_si128((__m128i*)P_code_ptr);
CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps);
imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps);
//Get late values
y1 = _mm_load_si128((__m128i*)L_code_ptr);
L_code_ptr += 4;
y2 = _mm_load_si128((__m128i*)L_code_ptr);
CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps);
imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps);
//Get very late values
y1 = _mm_load_si128((__m128i*)VL_code_ptr);
VL_code_ptr += 4;
y2 = _mm_load_si128((__m128i*)VL_code_ptr);
CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
real_VL_code_acc = _mm_add_ps (real_VL_code_acc, real_output_ps);
imag_VL_code_acc = _mm_add_ps (imag_VL_code_acc, imag_output_ps);
input_ptr += 4;
carrier_ptr += 4;
VE_code_ptr += 4;
E_code_ptr += 4;
P_code_ptr += 4;
L_code_ptr += 4;
VL_code_ptr += 4;
}
__VOLK_ATTR_ALIGNED(16) float real_VE_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_VE_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float real_VL_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_VL_dotProductVector[4];
_mm_store_ps((float*)real_VE_dotProductVector,real_VE_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)imag_VE_dotProductVector,imag_VE_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)real_VL_dotProductVector,real_VL_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)imag_VL_dotProductVector,imag_VL_code_acc); // Store the results back into the dot product vector
for (int i = 0; i<4; ++i)
{
VE_out_real += real_VE_dotProductVector[i];
VE_out_imag += imag_VE_dotProductVector[i];
E_out_real += real_E_dotProductVector[i];
E_out_imag += imag_E_dotProductVector[i];
P_out_real += real_P_dotProductVector[i];
P_out_imag += imag_P_dotProductVector[i];
L_out_real += real_L_dotProductVector[i];
L_out_imag += imag_L_dotProductVector[i];
VL_out_real += real_VL_dotProductVector[i];
VL_out_imag += imag_VL_dotProductVector[i];
}
*VE_out_ptr = lv_cmake(VE_out_real, VE_out_imag);
*E_out_ptr = lv_cmake(E_out_real, E_out_imag);
*P_out_ptr = lv_cmake(P_out_real, P_out_imag);
*L_out_ptr = lv_cmake(L_out_real, L_out_imag);
*VL_out_ptr = lv_cmake(VL_out_real, VL_out_imag);
}
lv_16sc_t bb_signal_sample;
for(int i=0; i < num_points%8; ++i)
{
//Perform the carrier wipe-off
bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
// Now get early, late, and prompt values for each
*VE_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VE_code_ptr++));
*E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
*P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
*L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
*VL_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VL_code_ptr++));
}
}
#endif /* LV_HAVE_SSE4_1 */
#ifdef LV_HAVE_GENERIC
/*!
\brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Vate correlation
\param input The input signal input
\param carrier The carrier signal input
\param VE_code Very Early PRN code replica input
\param E_code Early PRN code replica input
\param P_code Prompt PRN code replica input
\param L_code Late PRN code replica input
\param VL_code Very Late PRN code replica input
\param VE_out Very Early correlation output
\param E_out Early correlation output
\param P_out Prompt correlation output
\param L_out Late correlation output
\param VL_out Very Late correlation output
\param num_points The number of complex values in vectors
*/
static inline void volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_a_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* VE_code, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, const lv_16sc_t* VL_code, unsigned int num_points)
{
lv_16sc_t bb_signal_sample;
lv_16sc_t tmp1;
lv_16sc_t tmp2;
lv_16sc_t tmp3;
lv_16sc_t tmp4;
lv_16sc_t tmp5;
bb_signal_sample = lv_cmake(0, 0);
*VE_out = 0;
*E_out = 0;
*P_out = 0;
*L_out = 0;
*VL_out = 0;
// perform Early, Prompt and Late correlation
for(int i=0; i < num_points; ++i)
{
//Perform the carrier wipe-off
bb_signal_sample = input[i] * carrier[i];
tmp1 = bb_signal_sample * VE_code[i];
tmp2 = bb_signal_sample * E_code[i];
tmp3 = bb_signal_sample * P_code[i];
tmp4 = bb_signal_sample * L_code[i];
tmp5 = bb_signal_sample * VL_code[i];
// Now get early, late, and prompt values for each
*VE_out += (lv_32fc_t)tmp1;
*E_out += (lv_32fc_t)tmp2;
*P_out += (lv_32fc_t)tmp3;
*L_out += (lv_32fc_t)tmp4;
*VL_out += (lv_32fc_t)tmp5;
}
}
#endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_gnsssdr_volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_a_H */

View File

@ -0,0 +1,302 @@
#ifndef INCLUDED_volk_gnsssdr_32f_s32f_convert_16i_u_H
#define INCLUDED_volk_gnsssdr_32f_s32f_convert_16i_u_H
#include <inttypes.h>
#include <stdio.h>
#include <math.h>
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>
/*!
\brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value
\param inputVector The floating point input data buffer
\param outputVector The 16 bit output data buffer
\param scalar The value multiplied against each point in the input buffer
\param num_points The number of data values to be converted
\note Input buffer does NOT need to be properly aligned
*/
static inline void volk_gnsssdr_32f_s32f_convert_16i_u_sse2(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
unsigned int number = 0;
const unsigned int eighthPoints = num_points / 8;
const float* inputVectorPtr = (const float*)inputVector;
int16_t* outputVectorPtr = outputVector;
float min_val = -32768;
float max_val = 32767;
float r;
__m128 vScalar = _mm_set_ps1(scalar);
__m128 inputVal1, inputVal2;
__m128i intInputVal1, intInputVal2;
__m128 ret1, ret2;
__m128 vmin_val = _mm_set_ps1(min_val);
__m128 vmax_val = _mm_set_ps1(max_val);
for(;number < eighthPoints; number++){
inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
// Scale and clip
ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
intInputVal1 = _mm_cvtps_epi32(ret1);
intInputVal2 = _mm_cvtps_epi32(ret2);
intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
_mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
outputVectorPtr += 8;
}
number = eighthPoints * 8;
for(; number < num_points; number++){
r = inputVector[number] * scalar;
if(r > max_val)
r = max_val;
else if(r < min_val)
r = min_val;
outputVector[number] = (int16_t)rintf(r);
}
}
#endif /* LV_HAVE_SSE2 */
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>
/*!
\brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value
\param inputVector The floating point input data buffer
\param outputVector The 16 bit output data buffer
\param scalar The value multiplied against each point in the input buffer
\param num_points The number of data values to be converted
\note Input buffer does NOT need to be properly aligned
*/
static inline void volk_gnsssdr_32f_s32f_convert_16i_u_sse(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
unsigned int number = 0;
const unsigned int quarterPoints = num_points / 4;
const float* inputVectorPtr = (const float*)inputVector;
int16_t* outputVectorPtr = outputVector;
float min_val = -32768;
float max_val = 32767;
float r;
__m128 vScalar = _mm_set_ps1(scalar);
__m128 ret;
__m128 vmin_val = _mm_set_ps1(min_val);
__m128 vmax_val = _mm_set_ps1(max_val);
__VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
for(;number < quarterPoints; number++){
ret = _mm_loadu_ps(inputVectorPtr);
inputVectorPtr += 4;
// Scale and clip
ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
_mm_store_ps(outputFloatBuffer, ret);
*outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]);
*outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]);
*outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]);
*outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]);
}
number = quarterPoints * 4;
for(; number < num_points; number++){
r = inputVector[number] * scalar;
if(r > max_val)
r = max_val;
else if(r < min_val)
r = min_val;
outputVector[number] = (int16_t)rintf(r);
}
}
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_GENERIC
/*!
\brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value
\param inputVector The floating point input data buffer
\param outputVector The 16 bit output data buffer
\param scalar The value multiplied against each point in the input buffer
\param num_points The number of data values to be converted
\note Input buffer does NOT need to be properly aligned
*/
static inline void volk_gnsssdr_32f_s32f_convert_16i_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
int16_t* outputVectorPtr = outputVector;
const float* inputVectorPtr = inputVector;
unsigned int number = 0;
float min_val = -32768;
float max_val = 32767;
float r;
for(number = 0; number < num_points; number++){
r = *inputVectorPtr++ * scalar;
if(r > max_val)
r = max_val;
else if(r < min_val)
r = min_val;
*outputVectorPtr++ = (int16_t)rintf(r);
}
}
#endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_volk_gnsssdr_32f_s32f_convert_16i_u_H */
#ifndef INCLUDED_volk_gnsssdr_32f_s32f_convert_16i_a_H
#define INCLUDED_volk_gnsssdr_32f_s32f_convert_16i_a_H
#include <volk/volk_common.h>
#include <inttypes.h>
#include <stdio.h>
#include <math.h>
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>
/*!
\brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value
\param inputVector The floating point input data buffer
\param outputVector The 16 bit output data buffer
\param scalar The value multiplied against each point in the input buffer
\param num_points The number of data values to be converted
*/
static inline void volk_gnsssdr_32f_s32f_convert_16i_a_sse2(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
unsigned int number = 0;
const unsigned int eighthPoints = num_points / 8;
const float* inputVectorPtr = (const float*)inputVector;
int16_t* outputVectorPtr = outputVector;
float min_val = -32768;
float max_val = 32767;
float r;
__m128 vScalar = _mm_set_ps1(scalar);
__m128 inputVal1, inputVal2;
__m128i intInputVal1, intInputVal2;
__m128 ret1, ret2;
__m128 vmin_val = _mm_set_ps1(min_val);
__m128 vmax_val = _mm_set_ps1(max_val);
for(;number < eighthPoints; number++){
inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
// Scale and clip
ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
intInputVal1 = _mm_cvtps_epi32(ret1);
intInputVal2 = _mm_cvtps_epi32(ret2);
intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
_mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
outputVectorPtr += 8;
}
number = eighthPoints * 8;
for(; number < num_points; number++){
r = inputVector[number] * scalar;
if(r > max_val)
r = max_val;
else if(r < min_val)
r = min_val;
outputVector[number] = (int16_t)rintf(r);
}
}
#endif /* LV_HAVE_SSE2 */
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>
/*!
\brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value
\param inputVector The floating point input data buffer
\param outputVector The 16 bit output data buffer
\param scalar The value multiplied against each point in the input buffer
\param num_points The number of data values to be converted
*/
static inline void volk_gnsssdr_32f_s32f_convert_16i_a_sse(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
unsigned int number = 0;
const unsigned int quarterPoints = num_points / 4;
const float* inputVectorPtr = (const float*)inputVector;
int16_t* outputVectorPtr = outputVector;
float min_val = -32768;
float max_val = 32767;
float r;
__m128 vScalar = _mm_set_ps1(scalar);
__m128 ret;
__m128 vmin_val = _mm_set_ps1(min_val);
__m128 vmax_val = _mm_set_ps1(max_val);
__VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
for(;number < quarterPoints; number++){
ret = _mm_load_ps(inputVectorPtr);
inputVectorPtr += 4;
// Scale and clip
ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
_mm_store_ps(outputFloatBuffer, ret);
*outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]);
*outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]);
*outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]);
*outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]);
}
number = quarterPoints * 4;
for(; number < num_points; number++){
r = inputVector[number] * scalar;
if(r > max_val)
r = max_val;
else if(r < min_val)
r = min_val;
outputVector[number] = (int16_t)rintf(r);
}
}
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_GENERIC
/*!
\brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value
\param inputVector The floating point input data buffer
\param outputVector The 16 bit output data buffer
\param scalar The value multiplied against each point in the input buffer
\param num_points The number of data values to be converted
*/
static inline void volk_gnsssdr_32f_s32f_convert_16i_a_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
int16_t* outputVectorPtr = outputVector;
const float* inputVectorPtr = inputVector;
unsigned int number = 0;
float min_val = -32768;
float max_val = 32767;
float r;
for(number = 0; number < num_points; number++){
r = *inputVectorPtr++ * scalar;
if(r < min_val)
r = min_val;
else if(r > max_val)
r = max_val;
*outputVectorPtr++ = (int16_t)rintf(r);
}
}
#endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_volk_gnsssdr_32f_s32f_convert_16i_a_H */

View File

@ -0,0 +1,297 @@
/*!
* \file volk_gnsssdr_32fc_convert_16ic.h
* \brief Volk protokernel: converts float32 complex values to 16 integer complex values taking care of overflow
* \authors <ul>
* <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
* </ul>
*
* Volk protokernel that implements an accumulator of char values
*
* -------------------------------------------------------------------------
*
* Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
*
* GNSS-SDR is a software defined Global Navigation
* Satellite Systems receiver
*
* This file is part of GNSS-SDR.
*
* GNSS-SDR is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* at your option) any later version.
*
* GNSS-SDR is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
*
* -------------------------------------------------------------------------
*/
#ifndef INCLUDED_volk_gnsssdr_32fc_convert_16ic_u_H
#define INCLUDED_volk_gnsssdr_32fc_convert_16ic_u_H
#include <inttypes.h>
#include <stdio.h>
#include <math.h>
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>
/*!
\brief Converts a float vector of 64 bits (32 bits each part) into a 32 integer vector (16 bits each part)
\param inputVector The floating point input data buffer
\param outputVector The 16 bit output data buffer
\param num_points The number of data values to be converted
*/
static inline void volk_gnsssdr_32fc_convert_16ic_u_sse2(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){
const unsigned int sse_iters = num_points/4;
float* inputVectorPtr = (float*)inputVector;
int16_t* outputVectorPtr = (int16_t*)outputVector;
float min_val = -32768;
float max_val = 32767;
__m128 inputVal1, inputVal2;
__m128i intInputVal1, intInputVal2;
__m128 ret1, ret2;
__m128 vmin_val = _mm_set_ps1(min_val);
__m128 vmax_val = _mm_set_ps1(max_val);
for(unsigned int i = 0;i < sse_iters; i++){
inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
// Clip
ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
intInputVal1 = _mm_cvtps_epi32(ret1);
intInputVal2 = _mm_cvtps_epi32(ret2);
intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
_mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
outputVectorPtr += 8;
}
for(unsigned int i = 0; i < (num_points%4)*2; i++){
if(inputVectorPtr[i] > max_val)
inputVectorPtr[i] = max_val;
else if(inputVectorPtr[i] < min_val)
inputVectorPtr[i] = min_val;
outputVectorPtr[i] = (int16_t)rintf(inputVectorPtr[i]);
}
}
#endif /* LV_HAVE_SSE2 */
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>
/*!
\brief Converts a float vector of 64 bits (32 bits each part) into a 32 integer vector (16 bits each part)
\param inputVector The floating point input data buffer
\param outputVector The 16 bit output data buffer
\param num_points The number of data values to be converted
*/
static inline void volk_gnsssdr_32fc_convert_16ic_u_sse(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){
const unsigned int sse_iters = num_points/4;
float* inputVectorPtr = (float*)inputVector;
int16_t* outputVectorPtr = (int16_t*)outputVector;
float min_val = -32768;
float max_val = 32767;
__m128 inputVal1, inputVal2;
__m128i intInputVal1, intInputVal2;
__m128 ret1, ret2;
__m128 vmin_val = _mm_set_ps1(min_val);
__m128 vmax_val = _mm_set_ps1(max_val);
for(unsigned int i = 0;i < sse_iters; i++){
inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
// Clip
ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
intInputVal1 = _mm_cvtps_epi32(ret1);
intInputVal2 = _mm_cvtps_epi32(ret2);
intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
_mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
outputVectorPtr += 8;
}
for(unsigned int i = 0; i < (num_points%4)*2; i++){
if(inputVectorPtr[i] > max_val)
inputVectorPtr[i] = max_val;
else if(inputVectorPtr[i] < min_val)
inputVectorPtr[i] = min_val;
outputVectorPtr[i] = (int16_t)rintf(inputVectorPtr[i]);
}
}
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_GENERIC
/*!
\brief Converts a float vector of 64 bits (32 bits each part) into a 32 integer vector (16 bits each part)
\param inputVector The floating point input data buffer
\param outputVector The 16 bit output data buffer
\param num_points The number of data values to be converted
*/
static inline void volk_gnsssdr_32fc_convert_16ic_generic(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){
float* inputVectorPtr = (float*)inputVector;
int16_t* outputVectorPtr = (int16_t*)outputVector;
float min_val = -32768;
float max_val = 32767;
for(unsigned int i = 0; i < num_points*2; i++){
if(inputVectorPtr[i] > max_val)
inputVectorPtr[i] = max_val;
else if(inputVectorPtr[i] < min_val)
inputVectorPtr[i] = min_val;
outputVectorPtr[i] = (int16_t)rintf(inputVectorPtr[i]);
}
}
#endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_volk_gnsssdr_32fc_convert_16ic_u_H */
#ifndef INCLUDED_volk_gnsssdr_32fc_convert_16ic_a_H
#define INCLUDED_volk_gnsssdr_32fc_convert_16ic_a_H
#include <volk/volk_common.h>
#include <inttypes.h>
#include <stdio.h>
#include <math.h>
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>
/*!
\brief Converts a float vector of 64 bits (32 bits each part) into a 32 integer vector (16 bits each part)
\param inputVector The floating point input data buffer
\param outputVector The 16 bit output data buffer
\param num_points The number of data values to be converted
*/
static inline void volk_gnsssdr_32fc_convert_16ic_a_sse2(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){
const unsigned int sse_iters = num_points/4;
float* inputVectorPtr = (float*)inputVector;
int16_t* outputVectorPtr = (int16_t*)outputVector;
float min_val = -32768;
float max_val = 32767;
__m128 inputVal1, inputVal2;
__m128i intInputVal1, intInputVal2;
__m128 ret1, ret2;
__m128 vmin_val = _mm_set_ps1(min_val);
__m128 vmax_val = _mm_set_ps1(max_val);
for(unsigned int i = 0;i < sse_iters; i++){
inputVal1 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
inputVal2 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
// Clip
ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
intInputVal1 = _mm_cvtps_epi32(ret1);
intInputVal2 = _mm_cvtps_epi32(ret2);
intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
_mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
outputVectorPtr += 8;
}
for(unsigned int i = 0; i < (num_points%4)*2; i++){
if(inputVectorPtr[i] > max_val)
inputVectorPtr[i] = max_val;
else if(inputVectorPtr[i] < min_val)
inputVectorPtr[i] = min_val;
outputVectorPtr[i] = (int16_t)rintf(inputVectorPtr[i]);
}
}
#endif /* LV_HAVE_SSE2 */
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>
/*!
\brief Converts a float vector of 64 bits (32 bits each part) into a 32 integer vector (16 bits each part)
\param inputVector The floating point input data buffer
\param outputVector The 16 bit output data buffer
\param num_points The number of data values to be converted
*/
static inline void volk_gnsssdr_32fc_convert_16ic_a_sse(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){
const unsigned int sse_iters = num_points/4;
float* inputVectorPtr = (float*)inputVector;
int16_t* outputVectorPtr = (int16_t*)outputVector;
float min_val = -32768;
float max_val = 32767;
__m128 inputVal1, inputVal2;
__m128i intInputVal1, intInputVal2;
__m128 ret1, ret2;
__m128 vmin_val = _mm_set_ps1(min_val);
__m128 vmax_val = _mm_set_ps1(max_val);
for(unsigned int i = 0;i < sse_iters; i++){
inputVal1 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
inputVal2 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
// Clip
ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
intInputVal1 = _mm_cvtps_epi32(ret1);
intInputVal2 = _mm_cvtps_epi32(ret2);
intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
_mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
outputVectorPtr += 8;
}
for(unsigned int i = 0; i < (num_points%4)*2; i++){
if(inputVectorPtr[i] > max_val)
inputVectorPtr[i] = max_val;
else if(inputVectorPtr[i] < min_val)
inputVectorPtr[i] = min_val;
outputVectorPtr[i] = (int16_t)rintf(inputVectorPtr[i]);
}
}
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_GENERIC
/*!
\brief Converts a float vector of 64 bits (32 bits each part) into a 32 integer vector (16 bits each part)
\param inputVector The floating point input data buffer
\param outputVector The 16 bit output data buffer
\param num_points The number of data values to be converted
*/
static inline void volk_gnsssdr_32fc_convert_16ic_a_generic(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){
float* inputVectorPtr = (float*)inputVector;
int16_t* outputVectorPtr = (int16_t*)outputVector;
float min_val = -32768;
float max_val = 32767;
for(unsigned int i = 0; i < num_points*2; i++){
if(inputVectorPtr[i] > max_val)
inputVectorPtr[i] = max_val;
else if(inputVectorPtr[i] < min_val)
inputVectorPtr[i] = min_val;
outputVectorPtr[i] = (int16_t)rintf(inputVectorPtr[i]);
}
}
#endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_volk_gnsssdr_32fc_convert_16ic_a_H */

View File

@ -57,13 +57,180 @@
#include <float.h> #include <float.h>
#include <string.h> #include <string.h>
#ifdef LV_HAVE_AVX
#include <immintrin.h>
/*! /*!
* TODO: Code the SSE4 version and benchmark it \brief Performs the carrier wipe-off mixing and the VE, Early, Prompt, Late and VL correlation
\param input The input signal input
\param carrier The carrier signal input
\param VE_code VE PRN code replica input
\param E_code Early PRN code replica input
\param P_code Early PRN code replica input
\param L_code Early PRN code replica input
\param VL_code VL PRN code replica input
\param VE_out VE correlation output
\param E_out Early correlation output
\param P_out Early correlation output
\param L_out Early correlation output
\param VL_out VL correlation output
\param num_points The number of complex values in vectors
*/ */
static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_u_avx(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* VE_code, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, const lv_32fc_t* VL_code, unsigned int num_points)
{
unsigned int number = 0;
const unsigned int halfPoints = num_points / 4;
lv_32fc_t dotProduct_VE;
lv_32fc_t dotProduct_E;
lv_32fc_t dotProduct_P;
lv_32fc_t dotProduct_L;
lv_32fc_t dotProduct_VL;
// Aux vars
__m256 x, y, yl, yh, z, tmp1, tmp2, z_VE, z_E, z_P, z_L, z_VL;
__m256 bb_signal_sample, bb_signal_sample_shuffled;
z_VE = _mm256_setzero_ps();
z_E = _mm256_setzero_ps();
z_P = _mm256_setzero_ps();
z_L = _mm256_setzero_ps();
z_VL = _mm256_setzero_ps();
//input and output vectors
const lv_32fc_t* _input = input;
const lv_32fc_t* _carrier = carrier;
const lv_32fc_t* _VE_code = VE_code;
const lv_32fc_t* _E_code = E_code;
const lv_32fc_t* _P_code = P_code;
const lv_32fc_t* _L_code = L_code;
const lv_32fc_t* _VL_code = VL_code;
for(;number < halfPoints; number++)
{
// carrier wipe-off (vector point-to-point product)
x = _mm256_loadu_ps((float*)_input); // Load the ar + ai, br + bi as ar,ai,br,bi
y = _mm256_loadu_ps((float*)_carrier); // Load the cr + ci, dr + di as cr,ci,dr,di
yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr
yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di
tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
bb_signal_sample = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
bb_signal_sample_shuffled = _mm256_shuffle_ps(bb_signal_sample,bb_signal_sample,0xB1); // Re-arrange bb_signal_sample to be ai,ar,bi,br
// correlation VE,E,P,L,VL (5x vector scalar product)
// VE
y = _mm256_loadu_ps((float*)_VE_code); // Load the cr + ci, dr + di as cr,ci,dr,di
yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr
yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di
tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
z_VE = _mm256_add_ps(z_VE, z); // Add the complex multiplication results together
// Early
y = _mm256_loadu_ps((float*)_E_code); // Load the cr + ci, dr + di as cr,ci,dr,di
yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr
yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di
tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
z_E = _mm256_add_ps(z_E, z); // Add the complex multiplication results together
// Prompt
y = _mm256_loadu_ps((float*)_P_code); // Load the cr + ci, dr + di as cr,ci,dr,di
yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr
yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di
tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
z_P = _mm256_add_ps(z_P, z); // Add the complex multiplication results together
// Late
y = _mm256_loadu_ps((float*)_L_code); // Load the cr + ci, dr + di as cr,ci,dr,di
yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr
yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di
tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
z_L = _mm256_add_ps(z_L, z); // Add the complex multiplication results together
// VL
y = _mm256_loadu_ps((float*)_VL_code); // Load the cr + ci, dr + di as cr,ci,dr,di
yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr
yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di
tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
z_VL = _mm256_add_ps(z_VL, z); // Add the complex multiplication results together
/*pointer increment*/
_carrier += 4;
_input += 4;
_VE_code += 4;
_E_code += 4;
_P_code += 4;
_L_code += 4;
_VL_code += 4;
}
__VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_VE[4];
__VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_E[4];
__VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_P[4];
__VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_L[4];
__VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_VL[4];
_mm256_storeu_ps((float*)dotProductVector_VE,z_VE); // Store the results back into the dot product vector
_mm256_storeu_ps((float*)dotProductVector_E,z_E); // Store the results back into the dot product vector
_mm256_storeu_ps((float*)dotProductVector_P,z_P); // Store the results back into the dot product vector
_mm256_storeu_ps((float*)dotProductVector_L,z_L); // Store the results back into the dot product vector
_mm256_storeu_ps((float*)dotProductVector_VL,z_VL); // Store the results back into the dot product vector
dotProduct_VE = ( dotProductVector_VE[0] + dotProductVector_VE[1] + dotProductVector_VE[2] + dotProductVector_VE[3] );
dotProduct_E = ( dotProductVector_E[0] + dotProductVector_E[1] + dotProductVector_E[2] + dotProductVector_E[3] );
dotProduct_P = ( dotProductVector_P[0] + dotProductVector_P[1] + dotProductVector_P[2] + dotProductVector_P[3] );
dotProduct_L = ( dotProductVector_L[0] + dotProductVector_L[1] + dotProductVector_L[2] + dotProductVector_L[3] );
dotProduct_VL = ( dotProductVector_VL[0] + dotProductVector_VL[1] + dotProductVector_VL[2] + dotProductVector_VL[3] );
for (int i = 0; i<(num_points % 4); ++i)
{
dotProduct_VE += (*_input) * (*_VE_code++) * (*_carrier);
dotProduct_E += (*_input) * (*_E_code++) * (*_carrier);
dotProduct_P += (*_input) * (*_P_code++) * (*_carrier);
dotProduct_L += (*_input) * (*_L_code++) * (*_carrier);
dotProduct_VL += (*_input++) * (*_VL_code++) * (*_carrier++);
}
*VE_out = dotProduct_VE;
*E_out = dotProduct_E;
*P_out = dotProduct_P;
*L_out = dotProduct_L;
*VL_out = dotProduct_VL;
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_SSE3 #ifdef LV_HAVE_SSE3
#include <pmmintrin.h> #include <pmmintrin.h>
/*! /*!
\brief Performs the carrier wipe-off mixing and the VE, Early, Prompt, Late and VL correlation \brief Performs the carrier wipe-off mixing and the VE, Early, Prompt, Late and VL correlation
\param input The input signal input \param input The input signal input
@ -86,15 +253,10 @@ static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_u_sse3(lv_32fc_t* V
const unsigned int halfPoints = num_points / 2; const unsigned int halfPoints = num_points / 2;
lv_32fc_t dotProduct_VE; lv_32fc_t dotProduct_VE;
memset(&dotProduct_VE, 0x0, 2*sizeof(float));
lv_32fc_t dotProduct_E; lv_32fc_t dotProduct_E;
memset(&dotProduct_E, 0x0, 2*sizeof(float));
lv_32fc_t dotProduct_P; lv_32fc_t dotProduct_P;
memset(&dotProduct_P, 0x0, 2*sizeof(float));
lv_32fc_t dotProduct_L; lv_32fc_t dotProduct_L;
memset(&dotProduct_L, 0x0, 2*sizeof(float));
lv_32fc_t dotProduct_VL; lv_32fc_t dotProduct_VL;
memset(&dotProduct_VL, 0x0, 2*sizeof(float));
// Aux vars // Aux vars
__m128 x, y, yl, yh, z, tmp1, tmp2, z_VE, z_E, z_P, z_L, z_VL; __m128 x, y, yl, yh, z, tmp1, tmp2, z_VE, z_E, z_P, z_L, z_VL;
@ -217,11 +379,11 @@ static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_u_sse3(lv_32fc_t* V
_mm_storeu_ps((float*)dotProductVector_L,z_L); // Store the results back into the dot product vector _mm_storeu_ps((float*)dotProductVector_L,z_L); // Store the results back into the dot product vector
_mm_storeu_ps((float*)dotProductVector_VL,z_VL); // Store the results back into the dot product vector _mm_storeu_ps((float*)dotProductVector_VL,z_VL); // Store the results back into the dot product vector
dotProduct_VE += ( dotProductVector_VE[0] + dotProductVector_VE[1] ); dotProduct_VE = ( dotProductVector_VE[0] + dotProductVector_VE[1] );
dotProduct_E += ( dotProductVector_E[0] + dotProductVector_E[1] ); dotProduct_E = ( dotProductVector_E[0] + dotProductVector_E[1] );
dotProduct_P += ( dotProductVector_P[0] + dotProductVector_P[1] ); dotProduct_P = ( dotProductVector_P[0] + dotProductVector_P[1] );
dotProduct_L += ( dotProductVector_L[0] + dotProductVector_L[1] ); dotProduct_L = ( dotProductVector_L[0] + dotProductVector_L[1] );
dotProduct_VL += ( dotProductVector_VL[0] + dotProductVector_VL[1] ); dotProduct_VL = ( dotProductVector_VL[0] + dotProductVector_VL[1] );
if((num_points % 2) != 0) if((num_points % 2) != 0)
{ {
@ -238,7 +400,6 @@ static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_u_sse3(lv_32fc_t* V
*L_out = dotProduct_L; *L_out = dotProduct_L;
*VL_out = dotProduct_VL; *VL_out = dotProduct_VL;
} }
#endif /* LV_HAVE_SSE3 */ #endif /* LV_HAVE_SSE3 */
#ifdef LV_HAVE_GENERIC #ifdef LV_HAVE_GENERIC
@ -297,6 +458,178 @@ static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_generic(lv_32fc_t*
#include <float.h> #include <float.h>
#include <string.h> #include <string.h>
#ifdef LV_HAVE_AVX
#include <immintrin.h>
/*!
\brief Performs the carrier wipe-off mixing and the VE, Early, Prompt, Late and VL correlation
\param input The input signal input
\param carrier The carrier signal input
\param VE_code VE PRN code replica input
\param E_code Early PRN code replica input
\param P_code Early PRN code replica input
\param L_code Early PRN code replica input
\param VL_code VL PRN code replica input
\param VE_out VE correlation output
\param E_out Early correlation output
\param P_out Early correlation output
\param L_out Early correlation output
\param VL_out VL correlation output
\param num_points The number of complex values in vectors
*/
static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_a_avx(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* VE_code, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, const lv_32fc_t* VL_code, unsigned int num_points)
{
unsigned int number = 0;
const unsigned int halfPoints = num_points / 4;
lv_32fc_t dotProduct_VE;
lv_32fc_t dotProduct_E;
lv_32fc_t dotProduct_P;
lv_32fc_t dotProduct_L;
lv_32fc_t dotProduct_VL;
// Aux vars
__m256 x, y, yl, yh, z, tmp1, tmp2, z_VE, z_E, z_P, z_L, z_VL;
__m256 bb_signal_sample, bb_signal_sample_shuffled;
z_VE = _mm256_setzero_ps();
z_E = _mm256_setzero_ps();
z_P = _mm256_setzero_ps();
z_L = _mm256_setzero_ps();
z_VL = _mm256_setzero_ps();
//input and output vectors
const lv_32fc_t* _input = input;
const lv_32fc_t* _carrier = carrier;
const lv_32fc_t* _VE_code = VE_code;
const lv_32fc_t* _E_code = E_code;
const lv_32fc_t* _P_code = P_code;
const lv_32fc_t* _L_code = L_code;
const lv_32fc_t* _VL_code = VL_code;
for(;number < halfPoints; number++)
{
// carrier wipe-off (vector point-to-point product)
x = _mm256_load_ps((float*)_input); // Load the ar + ai, br + bi as ar,ai,br,bi
y = _mm256_load_ps((float*)_carrier); // Load the cr + ci, dr + di as cr,ci,dr,di
yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr
yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di
tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
bb_signal_sample = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
bb_signal_sample_shuffled = _mm256_shuffle_ps(bb_signal_sample,bb_signal_sample,0xB1); // Re-arrange bb_signal_sample to be ai,ar,bi,br
// correlation VE,E,P,L,VL (5x vector scalar product)
// VE
y = _mm256_load_ps((float*)_VE_code); // Load the cr + ci, dr + di as cr,ci,dr,di
yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr
yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di
tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
z_VE = _mm256_add_ps(z_VE, z); // Add the complex multiplication results together
// Early
y = _mm256_load_ps((float*)_E_code); // Load the cr + ci, dr + di as cr,ci,dr,di
yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr
yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di
tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
z_E = _mm256_add_ps(z_E, z); // Add the complex multiplication results together
// Prompt
y = _mm256_load_ps((float*)_P_code); // Load the cr + ci, dr + di as cr,ci,dr,di
yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr
yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di
tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
z_P = _mm256_add_ps(z_P, z); // Add the complex multiplication results together
// Late
y = _mm256_load_ps((float*)_L_code); // Load the cr + ci, dr + di as cr,ci,dr,di
yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr
yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di
tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
z_L = _mm256_add_ps(z_L, z); // Add the complex multiplication results together
// VL
y = _mm256_load_ps((float*)_VL_code); // Load the cr + ci, dr + di as cr,ci,dr,di
yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr
yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di
tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
z_VL = _mm256_add_ps(z_VL, z); // Add the complex multiplication results together
/*pointer increment*/
_carrier += 4;
_input += 4;
_VE_code += 4;
_E_code += 4;
_P_code += 4;
_L_code += 4;
_VL_code += 4;
}
__VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_VE[4];
__VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_E[4];
__VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_P[4];
__VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_L[4];
__VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_VL[4];
_mm256_store_ps((float*)dotProductVector_VE,z_VE); // Store the results back into the dot product vector
_mm256_store_ps((float*)dotProductVector_E,z_E); // Store the results back into the dot product vector
_mm256_store_ps((float*)dotProductVector_P,z_P); // Store the results back into the dot product vector
_mm256_store_ps((float*)dotProductVector_L,z_L); // Store the results back into the dot product vector
_mm256_store_ps((float*)dotProductVector_VL,z_VL); // Store the results back into the dot product vector
dotProduct_VE = ( dotProductVector_VE[0] + dotProductVector_VE[1] + dotProductVector_VE[2] + dotProductVector_VE[3] );
dotProduct_E = ( dotProductVector_E[0] + dotProductVector_E[1] + dotProductVector_E[2] + dotProductVector_E[3] );
dotProduct_P = ( dotProductVector_P[0] + dotProductVector_P[1] + dotProductVector_P[2] + dotProductVector_P[3] );
dotProduct_L = ( dotProductVector_L[0] + dotProductVector_L[1] + dotProductVector_L[2] + dotProductVector_L[3] );
dotProduct_VL = ( dotProductVector_VL[0] + dotProductVector_VL[1] + dotProductVector_VL[2] + dotProductVector_VL[3] );
for (int i = 0; i<(num_points % 4); ++i)
{
dotProduct_VE += (*_input) * (*_VE_code++) * (*_carrier);
dotProduct_E += (*_input) * (*_E_code++) * (*_carrier);
dotProduct_P += (*_input) * (*_P_code++) * (*_carrier);
dotProduct_L += (*_input) * (*_L_code++) * (*_carrier);
dotProduct_VL += (*_input++) * (*_VL_code++) * (*_carrier++);
}
*VE_out = dotProduct_VE;
*E_out = dotProduct_E;
*P_out = dotProduct_P;
*L_out = dotProduct_L;
*VL_out = dotProduct_VL;
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_SSE3 #ifdef LV_HAVE_SSE3
#include <pmmintrin.h> #include <pmmintrin.h>
/*! /*!
@ -321,15 +654,10 @@ static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_a_sse3(lv_32fc_t* V
const unsigned int halfPoints = num_points / 2; const unsigned int halfPoints = num_points / 2;
lv_32fc_t dotProduct_VE; lv_32fc_t dotProduct_VE;
memset(&dotProduct_VE, 0x0, 2*sizeof(float));
lv_32fc_t dotProduct_E; lv_32fc_t dotProduct_E;
memset(&dotProduct_E, 0x0, 2*sizeof(float));
lv_32fc_t dotProduct_P; lv_32fc_t dotProduct_P;
memset(&dotProduct_P, 0x0, 2*sizeof(float));
lv_32fc_t dotProduct_L; lv_32fc_t dotProduct_L;
memset(&dotProduct_L, 0x0, 2*sizeof(float));
lv_32fc_t dotProduct_VL; lv_32fc_t dotProduct_VL;
memset(&dotProduct_VL, 0x0, 2*sizeof(float));
// Aux vars // Aux vars
__m128 x, y, yl, yh, z, tmp1, tmp2, z_VE, z_E, z_P, z_L, z_VL; __m128 x, y, yl, yh, z, tmp1, tmp2, z_VE, z_E, z_P, z_L, z_VL;
@ -452,11 +780,11 @@ static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_a_sse3(lv_32fc_t* V
_mm_store_ps((float*)dotProductVector_L,z_L); // Store the results back into the dot product vector _mm_store_ps((float*)dotProductVector_L,z_L); // Store the results back into the dot product vector
_mm_store_ps((float*)dotProductVector_VL,z_VL); // Store the results back into the dot product vector _mm_store_ps((float*)dotProductVector_VL,z_VL); // Store the results back into the dot product vector
dotProduct_VE += ( dotProductVector_VE[0] + dotProductVector_VE[1] ); dotProduct_VE = ( dotProductVector_VE[0] + dotProductVector_VE[1] );
dotProduct_E += ( dotProductVector_E[0] + dotProductVector_E[1] ); dotProduct_E = ( dotProductVector_E[0] + dotProductVector_E[1] );
dotProduct_P += ( dotProductVector_P[0] + dotProductVector_P[1] ); dotProduct_P = ( dotProductVector_P[0] + dotProductVector_P[1] );
dotProduct_L += ( dotProductVector_L[0] + dotProductVector_L[1] ); dotProduct_L = ( dotProductVector_L[0] + dotProductVector_L[1] );
dotProduct_VL += ( dotProductVector_VL[0] + dotProductVector_VL[1] ); dotProduct_VL = ( dotProductVector_VL[0] + dotProductVector_VL[1] );
if((num_points % 2) != 0) if((num_points % 2) != 0)
{ {
@ -472,9 +800,7 @@ static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_a_sse3(lv_32fc_t* V
*P_out = dotProduct_P; *P_out = dotProduct_P;
*L_out = dotProduct_L; *L_out = dotProduct_L;
*VL_out = dotProduct_VL; *VL_out = dotProduct_VL;
} }
#endif /* LV_HAVE_SSE3 */ #endif /* LV_HAVE_SSE3 */
#ifdef LV_HAVE_GENERIC #ifdef LV_HAVE_GENERIC
@ -518,7 +844,5 @@ static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_a_generic(lv_32fc_t
*VL_out += bb_signal_sample * VL_code[i]; *VL_out += bb_signal_sample * VL_code[i];
} }
} }
#endif /* LV_HAVE_GENERIC */ #endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_gnsssdr_volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_a_H */ #endif /* INCLUDED_gnsssdr_volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_a_H */

View File

@ -0,0 +1,243 @@
/*!
* \file volk_gnsssdr_64f_accumulator_64f.h
* \brief Volk protokernel: 64 bits (double) scalar accumulator
* \authors <ul>
* <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
* </ul>
*
* Volk protokernel that implements an accumulator of char values
*
* -------------------------------------------------------------------------
*
* Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
*
* GNSS-SDR is a software defined Global Navigation
* Satellite Systems receiver
*
* This file is part of GNSS-SDR.
*
* GNSS-SDR is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* at your option) any later version.
*
* GNSS-SDR is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
*
* -------------------------------------------------------------------------
*/
#ifndef INCLUDED_volk_gnsssdr_64f_accumulator_64f_u_H
#define INCLUDED_volk_gnsssdr_64f_accumulator_64f_u_H
#include <volk_gnsssdr/volk_gnsssdr_common.h>
#include <inttypes.h>
#include <stdio.h>
#ifdef LV_HAVE_AVX
#include <immintrin.h>
/*!
\brief Accumulates the values in the input buffer
\param result The accumulated result
\param inputBuffer The buffer of data to be accumulated
\param num_points The number of values in inputBuffer to be accumulated
*/
static inline void volk_gnsssdr_64f_accumulator_64f_u_avx(double* result,const double* inputBuffer, unsigned int num_points){
double returnValue = 0;
const unsigned int sse_iters = num_points / 4;
const double* aPtr = inputBuffer;
__VOLK_ATTR_ALIGNED(32) double tempBuffer[4];
__m256d accumulator = _mm256_setzero_pd();
__m256d aVal = _mm256_setzero_pd();
for(unsigned int number = 0; number < sse_iters; number++)
{
aVal = _mm256_loadu_pd(aPtr);
accumulator = _mm256_add_pd(accumulator, aVal);
aPtr += 4;
}
_mm256_storeu_pd((double*)tempBuffer,accumulator);
for(int i = 0; i<4; ++i){
returnValue += tempBuffer[i];
}
for(int i = 0; i<(num_points % 4); ++i){
returnValue += (*aPtr++);
}
*result = returnValue;
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_SSE3
#include <xmmintrin.h>
/*!
\brief Accumulates the values in the input buffer
\param result The accumulated result
\param inputBuffer The buffer of data to be accumulated
\param num_points The number of values in inputBuffer to be accumulated
*/
static inline void volk_gnsssdr_64f_accumulator_64f_u_sse3(double* result,const double* inputBuffer, unsigned int num_points){
double returnValue = 0;
const unsigned int sse_iters = num_points / 2;
const double* aPtr = inputBuffer;
__VOLK_ATTR_ALIGNED(16) double tempBuffer[2];
__m128d accumulator = _mm_setzero_pd();
__m128d aVal = _mm_setzero_pd();
for(unsigned int number = 0; number < sse_iters; number++)
{
aVal = _mm_loadu_pd(aPtr);
accumulator = _mm_add_pd(accumulator, aVal);
aPtr += 2;
}
_mm_storeu_pd((double*)tempBuffer,accumulator);
for(int i = 0; i<2; ++i){
returnValue += tempBuffer[i];
}
for(int i = 0; i<(num_points % 2); ++i){
returnValue += (*aPtr++);
}
*result = returnValue;
}
#endif /* LV_HAVE_SSE3 */
#ifdef LV_HAVE_GENERIC
/*!
\brief Accumulates the values in the input buffer
\param result The accumulated result
\param inputBuffer The buffer of data to be accumulated
\param num_points The number of values in inputBuffer to be accumulated
*/
static inline void volk_gnsssdr_64f_accumulator_64f_generic(double* result,const double* inputBuffer, unsigned int num_points){
const double* aPtr = inputBuffer;
double returnValue = 0;
for(unsigned int number = 0;number < num_points; number++){
returnValue += (*aPtr++);
}
*result = returnValue;
}
#endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_volk_gnsssdr_64f_accumulator_64f_u_H */
#ifndef INCLUDED_volk_gnsssdr_64f_accumulator_64f_a_H
#define INCLUDED_volk_gnsssdr_64f_accumulator_64f_a_H
#include <volk_gnsssdr/volk_gnsssdr_common.h>
#include <inttypes.h>
#include <stdio.h>
#ifdef LV_HAVE_AVX
#include <immintrin.h>
/*!
\brief Accumulates the values in the input buffer
\param result The accumulated result
\param inputBuffer The buffer of data to be accumulated
\param num_points The number of values in inputBuffer to be accumulated
*/
static inline void volk_gnsssdr_64f_accumulator_64f_a_avx(double* result,const double* inputBuffer, unsigned int num_points){
double returnValue = 0;
const unsigned int sse_iters = num_points / 4;
const double* aPtr = inputBuffer;
__VOLK_ATTR_ALIGNED(32) double tempBuffer[4];
__m256d accumulator = _mm256_setzero_pd();
__m256d aVal = _mm256_setzero_pd();
for(unsigned int number = 0; number < sse_iters; number++)
{
aVal = _mm256_load_pd(aPtr);
accumulator = _mm256_add_pd(accumulator, aVal);
aPtr += 4;
}
_mm256_store_pd((double*)tempBuffer,accumulator);
for(int i = 0; i<4; ++i){
returnValue += tempBuffer[i];
}
for(int i = 0; i<(num_points % 4); ++i){
returnValue += (*aPtr++);
}
*result = returnValue;
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_SSE3
#include <xmmintrin.h>
/*!
\brief Accumulates the values in the input buffer
\param result The accumulated result
\param inputBuffer The buffer of data to be accumulated
\param num_points The number of values in inputBuffer to be accumulated
*/
static inline void volk_gnsssdr_64f_accumulator_64f_a_sse3(double* result,const double* inputBuffer, unsigned int num_points){
double returnValue = 0;
const unsigned int sse_iters = num_points / 2;
const double* aPtr = inputBuffer;
__VOLK_ATTR_ALIGNED(16) double tempBuffer[2];
__m128d accumulator = _mm_setzero_pd();
__m128d aVal = _mm_setzero_pd();
for(unsigned int number = 0; number < sse_iters; number++)
{
aVal = _mm_load_pd(aPtr);
accumulator = _mm_add_pd(accumulator, aVal);
aPtr += 2;
}
_mm_store_pd((double*)tempBuffer,accumulator);
for(int i = 0; i<2; ++i){
returnValue += tempBuffer[i];
}
for(int i = 0; i<(num_points % 2); ++i){
returnValue += (*aPtr++);
}
*result = returnValue;
}
#endif /* LV_HAVE_SSE3 */
#ifdef LV_HAVE_GENERIC
/*!
\brief Accumulates the values in the input buffer
\param result The accumulated result
\param inputBuffer The buffer of data to be accumulated
\param num_points The number of values in inputBuffer to be accumulated
*/
static inline void volk_gnsssdr_64f_accumulator_64f_a_generic(double* result,const double* inputBuffer, unsigned int num_points){
const double* aPtr = inputBuffer;
double returnValue = 0;
for(unsigned int number = 0;number < num_points; number++){
returnValue += (*aPtr++);
}
*result = returnValue;
}
#endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_volk_gnsssdr_64f_accumulator_64f_a_H */

View File

@ -515,11 +515,13 @@ endif()
#MODIFICATIONS BY GNSS-SDR #MODIFICATIONS BY GNSS-SDR
file(GLOB orc ${CMAKE_SOURCE_DIR}/orc/*.orc) file(GLOB orc ${CMAKE_SOURCE_DIR}/orc/*.orc)
file(GLOB CommonMacros ${CMAKE_SOURCE_DIR}/kernels/CommonMacros/*.h ${CMAKE_SOURCE_DIR}/kernels/CommonMacros/README.txt)
#add_library(volk_gnsssdr SHARED ${volk_gnsssdr_sources}) #add_library(volk_gnsssdr SHARED ${volk_gnsssdr_sources})
add_library(volk_gnsssdr SHARED ${volk_gnsssdr_sources} ${h_files} ${orc}) add_library(volk_gnsssdr SHARED ${volk_gnsssdr_sources} ${h_files} ${CommonMacros} ${orc})
source_group("Kernels" FILES ${h_files}) source_group("Kernels" FILES ${h_files})
source_group("Common Macros" FILES ${CommonMacros})
source_group("ORC Files" FILES ${orc}) source_group("ORC Files" FILES ${orc})
#END OF MODIFICATIONS #END OF MODIFICATIONS

View File

@ -24,34 +24,46 @@
#include <volk_gnsssdr/volk_gnsssdr.h> #include <volk_gnsssdr/volk_gnsssdr.h>
#include <boost/test/unit_test.hpp> #include <boost/test/unit_test.hpp>
//GNSS-SDR PROTO-KERNELS //VOLK PROTOKERNELS OBTAINED FROM THE GNURADIO BASE
VOLK_RUN_TESTS(volk_gnsssdr_32fc_x2_multiply_32fc, 1e-4, 0, 20462, 1); VOLK_RUN_TESTS(volk_gnsssdr_32fc_x2_multiply_32fc, 1e-4, 0, 20462, 1);
VOLK_RUN_TESTS(volk_gnsssdr_32fc_x2_dot_prod_32fc, 1e-4, 0, 204603, 1);
VOLK_RUN_TESTS(volk_gnsssdr_32fc_s32fc_multiply_32fc, 1e-4, 0, 20462, 1);
VOLK_RUN_TESTS(volk_gnsssdr_32fc_conjugate_32fc, 1e-4, 0, 20462, 1);
VOLK_RUN_TESTS(volk_gnsssdr_32f_x2_add_32f, 1e-4, 0, 20462, 1);
VOLK_RUN_TESTS(volk_gnsssdr_32f_index_max_16u, 3, 0, 20462, 1);
VOLK_RUN_TESTS(volk_gnsssdr_32f_accumulator_s32f, 1e-4, 0, 20462, 1);
VOLK_RUN_TESTS(volk_gnsssdr_32fc_magnitude_squared_32f, 1e-4, 0, 20462, 1);
VOLK_RUN_TESTS(volk_gnsssdr_32f_s32f_convert_16i, 3, 0, 20462, 1);
//GNSS-SDR PROTO-KERNELS
VOLK_RUN_TESTS(volk_gnsssdr_8ic_x2_multiply_8ic, 1e-4, 0, 20462, 1); VOLK_RUN_TESTS(volk_gnsssdr_8ic_x2_multiply_8ic, 1e-4, 0, 20462, 1);
VOLK_RUN_TESTS(volk_gnsssdr_8u_x2_multiply_8u, 1e-4, 0, 20462, 1); VOLK_RUN_TESTS(volk_gnsssdr_8u_x2_multiply_8u, 1e-4, 0, 20462, 1);
VOLK_RUN_TESTS(volk_gnsssdr_32fc_x2_dot_prod_32fc, 1e-4, 0, 204603, 1);
VOLK_RUN_TESTS(volk_gnsssdr_8ic_x2_dot_prod_8ic, 1e-4, 0, 204603, 1); VOLK_RUN_TESTS(volk_gnsssdr_8ic_x2_dot_prod_8ic, 1e-4, 0, 204603, 1);
VOLK_RUN_TESTS(volk_gnsssdr_32fc_s32fc_multiply_32fc, 1e-4, 0, 20462, 1);
VOLK_RUN_TESTS(volk_gnsssdr_8ic_s8ic_multiply_8ic, 1e-4, 0, 20462, 1); VOLK_RUN_TESTS(volk_gnsssdr_8ic_s8ic_multiply_8ic, 1e-4, 0, 20462, 1);
VOLK_RUN_TESTS(volk_gnsssdr_32fc_conjugate_32fc, 1e-4, 0, 20462, 1);
VOLK_RUN_TESTS(volk_gnsssdr_8ic_conjugate_8ic, 1e-4, 0, 20462, 1); VOLK_RUN_TESTS(volk_gnsssdr_8ic_conjugate_8ic, 1e-4, 0, 20462, 1);
VOLK_RUN_TESTS(volk_gnsssdr_32f_x2_add_32f, 1e-4, 0, 20462, 1);
VOLK_RUN_TESTS(volk_gnsssdr_8i_x2_add_8i, 1e-4, 0, 20462, 1); VOLK_RUN_TESTS(volk_gnsssdr_8i_x2_add_8i, 1e-4, 0, 20462, 1);
VOLK_RUN_TESTS(volk_gnsssdr_32f_index_max_16u, 3, 0, 20462, 1);
VOLK_RUN_TESTS(volk_gnsssdr_8i_index_max_16u, 3, 0, 20462, 1); VOLK_RUN_TESTS(volk_gnsssdr_8i_index_max_16u, 3, 0, 20462, 1);
VOLK_RUN_TESTS(volk_gnsssdr_32f_accumulator_s32f, 1e-4, 0, 20462, 1);
VOLK_RUN_TESTS(volk_gnsssdr_8i_accumulator_s8i, 1e-4, 0, 20462, 1); VOLK_RUN_TESTS(volk_gnsssdr_8i_accumulator_s8i, 1e-4, 0, 20462, 1);
VOLK_RUN_TESTS(volk_gnsssdr_32fc_magnitude_squared_32f, 1e-4, 0, 20462, 1);
VOLK_RUN_TESTS(volk_gnsssdr_8ic_magnitude_squared_8i, 1e-4, 0, 20462, 1); VOLK_RUN_TESTS(volk_gnsssdr_8ic_magnitude_squared_8i, 1e-4, 0, 20462, 1);
VOLK_RUN_TESTS(volk_gnsssdr_8i_max_s8i, 3, 0, 20462, 1);
VOLK_RUN_TESTS(volk_gnsssdr_64f_accumulator_64f, 3, 0, 20462, 1);
VOLK_RUN_TESTS(volk_gnsssdr_32fc_convert_16ic, 3, 0, 20462, 1);
VOLK_RUN_TESTS(volk_gnsssdr_16i_s32f_convert_32f, 3, 0, 20462, 1);
VOLK_RUN_TESTS(volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3, 1e-4, 0, 20462, 1); VOLK_RUN_TESTS(volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3, 1e-4, 0, 20462, 1);
VOLK_RUN_TESTS(volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3, 1e-4, 0, 20462, 1); VOLK_RUN_TESTS(volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3, 1e-4, 0, 20462, 1);
VOLK_RUN_TESTS(volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3, 1e-4, 0, 20462, 1);
VOLK_RUN_TESTS(volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5, 1e-4, 0, 20462, 1); VOLK_RUN_TESTS(volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3, 1e-4, 0, 20462, 1);
VOLK_RUN_TESTS(volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3, 1e-4, 0, 20462, 1); VOLK_RUN_TESTS(volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3, 1e-4, 0, 20462, 1);
VOLK_RUN_TESTS(volk_gnsssdr_16i_s32f_convert_32f, 1e-4, 32768.0, 20462, 1); VOLK_RUN_TESTS(volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5, 1e-4, 0, 20462, 1);
VOLK_RUN_TESTS(volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5, 1e-4, 0, 20462, 1);
VOLK_RUN_TESTS(volk_gnsssdr_8i_max_s8i, 3, 0, 20462, 1);
//VOLK_RUN_TESTS(volk_gnsssdr_16i_x5_add_quad_16i_x4, 1e-4, 2046, 10000); //VOLK_RUN_TESTS(volk_gnsssdr_16i_x5_add_quad_16i_x4, 1e-4, 2046, 10000);
//VOLK_RUN_TESTS(volk_gnsssdr_16i_branch_4_state_8, 1e-4, 2046, 10000); //VOLK_RUN_TESTS(volk_gnsssdr_16i_branch_4_state_8, 1e-4, 2046, 10000);