mirror of
https://github.com/gnss-sdr/gnss-sdr
synced 2024-12-14 12:10:34 +00:00
Improved 8ic_x7_cw_epl_corr_32fc_x5 protokernels
Improved 8ic_x7_cw_epl_corr_32fc_x5 protokernels and added a new protokernel called volk_gnsssdr_32fc_convert_8ic in order to test the 8 bits protokernels in the tracking.
This commit is contained in:
parent
badd9cd64d
commit
ea3cd84e20
@ -106,8 +106,10 @@ int main(int argc, char *argv[]) {
|
||||
//GNSS-SDR PROTO-KERNELS
|
||||
//lv_32fc_t sfv = lv_cmake((float)1, (float)2);
|
||||
//example: VOLK_PROFILE(volk_gnsssdr_8ic_s8ic_multiply_8ic, 1e-4, sfv, 204602, 1000, &results, benchmark_mode, kernel_regex);
|
||||
VOLK_PROFILE(volk_gnsssdr_32fc_convert_8ic, 1e-4, 0, 204602, 250, &results, benchmark_mode, kernel_regex);
|
||||
|
||||
VOLK_PROFILE(volk_gnsssdr_8ic_x7_cw_epl_corr_safe_32fc_x5, 1e-4, 0, 204602, 250, &results, benchmark_mode, kernel_regex);
|
||||
VOLK_PROFILE(volk_gnsssdr_8ic_x7_cw_epl_corr_unsafe_32fc_x5, 1e-4, 0, 30, 1, &results, benchmark_mode, kernel_regex);
|
||||
VOLK_PROFILE(volk_gnsssdr_8ic_x7_cw_epl_corr_unsafe_32fc_x5, 1e-4, 0, 204602, 250, &results, benchmark_mode, kernel_regex);
|
||||
VOLK_PROFILE(volk_gnsssdr_8ic_x7_cw_epl_corr_TEST_32fc_x5, 1e-4, 0, 204602, 250, &results, benchmark_mode, kernel_regex);
|
||||
VOLK_PROFILE(volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3, 1e-4, 0, 204602, 250, &results, benchmark_mode, kernel_regex);
|
||||
|
||||
|
@ -55,12 +55,18 @@
|
||||
#endif /* CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1 */
|
||||
|
||||
#ifndef CM_8IC_CONVERT_AND_ACC_32FC_U_SSE4_1
|
||||
#define CM_8IC_CONVERT_AND_ACC_32FC_U_SSE4_1(input, input_i_1, input_i_2, output_i32, output_ps)\
|
||||
#define CM_8IC_CONVERT_AND_ACC_32FC_U_SSE4_1(input, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)\
|
||||
input_i_1 = _mm_cvtepi8_epi32(input);\
|
||||
input = _mm_srli_si128 (input, 4);\
|
||||
input_i_2 = _mm_cvtepi8_epi32(input);\
|
||||
input = _mm_srli_si128 (input, 4);\
|
||||
output_i32 = _mm_add_epi32 (input_i_1, input_i_2);\
|
||||
output_i32_1 = _mm_add_epi32 (input_i_1, input_i_2);\
|
||||
input_i_1 = _mm_cvtepi8_epi32(input);\
|
||||
input = _mm_srli_si128 (input, 4);\
|
||||
input_i_2 = _mm_cvtepi8_epi32(input);\
|
||||
input = _mm_srli_si128 (input, 4);\
|
||||
output_i32_2 = _mm_add_epi32 (input_i_1, input_i_2);\
|
||||
output_i32 = _mm_add_epi32 (output_i32_1, output_i32_2);\
|
||||
output_ps = _mm_cvtepi32_ps(output_i32);
|
||||
#endif /* CM_8IC_CONVERT_AND_ACC_32FC_U_SSE4_1 */
|
||||
|
||||
|
@ -39,15 +39,14 @@
|
||||
*/
|
||||
|
||||
#ifndef CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1
|
||||
#define CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)\
|
||||
#define CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)\
|
||||
CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy)\
|
||||
CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(real_bb_signal_sample, imag_bb_signal_sample, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output)\
|
||||
\
|
||||
imag_output = _mm_slli_si128 (imag_output, 1);\
|
||||
output = _mm_blendv_epi8 (imag_output, real_output, mult1);\
|
||||
\
|
||||
CM_8IC_CONVERT_AND_ACC_32FC_U_SSE4_1(output, input_i_1, input_i_2, output_i32, output_ps_1)\
|
||||
CM_8IC_CONVERT_AND_ACC_32FC_U_SSE4_1(output, input_i_1, input_i_2, output_i32, output_ps_2)
|
||||
CM_8IC_CONVERT_AND_ACC_32FC_U_SSE4_1(output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
|
||||
#endif /* CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1 */
|
||||
|
||||
#ifndef CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1
|
||||
|
@ -5,8 +5,6 @@
|
||||
* <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
|
||||
* </ul>
|
||||
*
|
||||
* Volk protokernel that implements an accumulator of char values
|
||||
*
|
||||
* -------------------------------------------------------------------------
|
||||
*
|
||||
* Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
|
||||
|
@ -0,0 +1,213 @@
|
||||
/*!
|
||||
* \file volk_gnsssdr_32fc_convert_8ic.h
|
||||
* \brief Volk protokernel: converts float32 complex values to 8-bit integer complex values, saturating on overflow
|
||||
* \authors <ul>
|
||||
* <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
|
||||
* </ul>
|
||||
*
|
||||
* -------------------------------------------------------------------------
|
||||
*
|
||||
* Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
|
||||
*
|
||||
* GNSS-SDR is a software defined Global Navigation
|
||||
* Satellite Systems receiver
|
||||
*
|
||||
* This file is part of GNSS-SDR.
|
||||
*
|
||||
* GNSS-SDR is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* GNSS-SDR is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* -------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
#ifndef INCLUDED_volk_gnsssdr_32fc_convert_8ic_u_H
|
||||
#define INCLUDED_volk_gnsssdr_32fc_convert_8ic_u_H
|
||||
|
||||
#include <inttypes.h>
|
||||
#include <stdio.h>
|
||||
#include <math.h>
|
||||
|
||||
#ifdef LV_HAVE_SSE2
|
||||
#include <emmintrin.h>
|
||||
/*!
|
||||
\brief Converts a float vector of 64 bits (32 bits each part) into a 16 integer vector (8 bits each part)
|
||||
\param inputVector The floating point input data buffer
|
||||
\param outputVector The 16 bit output data buffer
|
||||
\param num_points The number of data values to be converted
|
||||
*/
|
||||
static inline void volk_gnsssdr_32fc_convert_8ic_u_sse2(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){
|
||||
const unsigned int sse_iters = num_points/8;
|
||||
|
||||
float* inputVectorPtr = (float*)inputVector;
|
||||
int8_t* outputVectorPtr = (int8_t*)outputVector;
|
||||
|
||||
float min_val = -128;
|
||||
float max_val = 127;
|
||||
|
||||
__m128 inputVal1, inputVal2, inputVal3, inputVal4;
|
||||
__m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
|
||||
__m128i int8InputVal;
|
||||
__m128 ret1, ret2, ret3, ret4;
|
||||
__m128 vmin_val = _mm_set_ps1(min_val);
|
||||
__m128 vmax_val = _mm_set_ps1(max_val);
|
||||
|
||||
for(unsigned int i = 0;i < sse_iters; i++){
|
||||
inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
|
||||
inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
|
||||
inputVal3 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
|
||||
inputVal4 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
|
||||
|
||||
// Clip
|
||||
ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
|
||||
ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
|
||||
ret3 = _mm_max_ps(_mm_min_ps(inputVal3, vmax_val), vmin_val);
|
||||
ret4 = _mm_max_ps(_mm_min_ps(inputVal4, vmax_val), vmin_val);
|
||||
|
||||
intInputVal1 = _mm_cvtps_epi32(ret1);
|
||||
intInputVal2 = _mm_cvtps_epi32(ret2);
|
||||
intInputVal3 = _mm_cvtps_epi32(ret3);
|
||||
intInputVal4 = _mm_cvtps_epi32(ret4);
|
||||
|
||||
intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
|
||||
intInputVal2 = _mm_packs_epi32(intInputVal3, intInputVal4);
|
||||
int8InputVal = _mm_packs_epi16(intInputVal1, intInputVal2);
|
||||
|
||||
_mm_storeu_si128((__m128i*)outputVectorPtr, int8InputVal);
|
||||
outputVectorPtr += 16;
|
||||
}
|
||||
|
||||
for(unsigned int i = 0; i < (num_points%4)*4; i++){
|
||||
if(inputVectorPtr[i] > max_val)
|
||||
inputVectorPtr[i] = max_val;
|
||||
else if(inputVectorPtr[i] < min_val)
|
||||
inputVectorPtr[i] = min_val;
|
||||
outputVectorPtr[i] = (int8_t)rintf(inputVectorPtr[i]);
|
||||
}
|
||||
}
|
||||
#endif /* LV_HAVE_SSE2 */
|
||||
|
||||
#ifdef LV_HAVE_GENERIC
|
||||
/*!
|
||||
\brief Converts a float vector of 64 bits (32 bits each part) into a 16 integer vector (8 bits each part)
|
||||
\param inputVector The floating point input data buffer
|
||||
\param outputVector The 16 bit output data buffer
|
||||
\param num_points The number of data values to be converted
|
||||
*/
|
||||
static inline void volk_gnsssdr_32fc_convert_8ic_generic(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){
|
||||
float* inputVectorPtr = (float*)inputVector;
|
||||
int8_t* outputVectorPtr = (int8_t*)outputVector;
|
||||
float min_val = -128;
|
||||
float max_val = 127;
|
||||
|
||||
for(unsigned int i = 0; i < num_points*2; i++){
|
||||
if(inputVectorPtr[i] > max_val)
|
||||
inputVectorPtr[i] = max_val;
|
||||
else if(inputVectorPtr[i] < min_val)
|
||||
inputVectorPtr[i] = min_val;
|
||||
outputVectorPtr[i] = (int8_t)rintf(inputVectorPtr[i]);
|
||||
}
|
||||
}
|
||||
#endif /* LV_HAVE_GENERIC */
|
||||
#endif /* INCLUDED_volk_gnsssdr_32fc_convert_8ic_u_H */
|
||||
|
||||
|
||||
#ifndef INCLUDED_volk_gnsssdr_32fc_convert_8ic_a_H
|
||||
#define INCLUDED_volk_gnsssdr_32fc_convert_8ic_a_H
|
||||
|
||||
#include <volk/volk_common.h>
|
||||
#include <inttypes.h>
|
||||
#include <stdio.h>
|
||||
#include <math.h>
|
||||
|
||||
#ifdef LV_HAVE_SSE2
|
||||
#include <emmintrin.h>
|
||||
/*!
|
||||
\brief Converts a float vector of 64 bits (32 bits each part) into a 16 integer vector (8 bits each part)
|
||||
\param inputVector The floating point input data buffer
|
||||
\param outputVector The 16 bit output data buffer
|
||||
\param num_points The number of data values to be converted
|
||||
*/
|
||||
static inline void volk_gnsssdr_32fc_convert_8ic_a_sse2(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){
|
||||
const unsigned int sse_iters = num_points/8;
|
||||
|
||||
float* inputVectorPtr = (float*)inputVector;
|
||||
int8_t* outputVectorPtr = (int8_t*)outputVector;
|
||||
|
||||
float min_val = -128;
|
||||
float max_val = 127;
|
||||
|
||||
__m128 inputVal1, inputVal2, inputVal3, inputVal4;
|
||||
__m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
|
||||
__m128i int8InputVal;
|
||||
__m128 ret1, ret2, ret3, ret4;
|
||||
__m128 vmin_val = _mm_set_ps1(min_val);
|
||||
__m128 vmax_val = _mm_set_ps1(max_val);
|
||||
|
||||
for(unsigned int i = 0;i < sse_iters; i++){
|
||||
inputVal1 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
|
||||
inputVal2 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
|
||||
inputVal3 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
|
||||
inputVal4 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
|
||||
|
||||
// Clip
|
||||
ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
|
||||
ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
|
||||
ret3 = _mm_max_ps(_mm_min_ps(inputVal3, vmax_val), vmin_val);
|
||||
ret4 = _mm_max_ps(_mm_min_ps(inputVal4, vmax_val), vmin_val);
|
||||
|
||||
intInputVal1 = _mm_cvtps_epi32(ret1);
|
||||
intInputVal2 = _mm_cvtps_epi32(ret2);
|
||||
intInputVal3 = _mm_cvtps_epi32(ret3);
|
||||
intInputVal4 = _mm_cvtps_epi32(ret4);
|
||||
|
||||
intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
|
||||
intInputVal2 = _mm_packs_epi32(intInputVal3, intInputVal4);
|
||||
int8InputVal = _mm_packs_epi16(intInputVal1, intInputVal2);
|
||||
|
||||
_mm_store_si128((__m128i*)outputVectorPtr, int8InputVal);
|
||||
outputVectorPtr += 16;
|
||||
}
|
||||
|
||||
for(unsigned int i = 0; i < (num_points%4)*4; i++){
|
||||
if(inputVectorPtr[i] > max_val)
|
||||
inputVectorPtr[i] = max_val;
|
||||
else if(inputVectorPtr[i] < min_val)
|
||||
inputVectorPtr[i] = min_val;
|
||||
outputVectorPtr[i] = (int8_t)rintf(inputVectorPtr[i]);
|
||||
}
|
||||
}
|
||||
#endif /* LV_HAVE_SSE2 */
|
||||
|
||||
#ifdef LV_HAVE_GENERIC
|
||||
/*!
|
||||
\brief Converts a float vector of 64 bits (32 bits each part) into a 16 integer vector (8 bits each part)
|
||||
\param inputVector The floating point input data buffer
|
||||
\param outputVector The 16 bit output data buffer
|
||||
\param num_points The number of data values to be converted
|
||||
*/
|
||||
static inline void volk_gnsssdr_32fc_convert_8ic_a_generic(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){
|
||||
float* inputVectorPtr = (float*)inputVector;
|
||||
int8_t* outputVectorPtr = (int8_t*)outputVector;
|
||||
float min_val = -128;
|
||||
float max_val = 127;
|
||||
|
||||
for(unsigned int i = 0; i < num_points*2; i++){
|
||||
if(inputVectorPtr[i] > max_val)
|
||||
inputVectorPtr[i] = max_val;
|
||||
else if(inputVectorPtr[i] < min_val)
|
||||
inputVectorPtr[i] = min_val;
|
||||
outputVectorPtr[i] = (int8_t)rintf(inputVectorPtr[i]);
|
||||
}
|
||||
}
|
||||
#endif /* LV_HAVE_GENERIC */
|
||||
#endif /* INCLUDED_volk_gnsssdr_32fc_convert_8ic_a_H */
|
@ -77,8 +77,8 @@ static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_u_sse4_1(lv_32fc_t* E
|
||||
__m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output;
|
||||
|
||||
__m128 E_code_acc, P_code_acc, L_code_acc;
|
||||
__m128i input_i_1, input_i_2, output_i32;
|
||||
__m128 output_ps_1, output_ps_2;
|
||||
__m128i input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2;
|
||||
__m128 output_ps;
|
||||
|
||||
const lv_8sc_t* input_ptr = input;
|
||||
const lv_8sc_t* carrier_ptr = carrier;
|
||||
@ -116,26 +116,23 @@ static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_u_sse4_1(lv_32fc_t* E
|
||||
//Get early values
|
||||
y = _mm_lddqu_si128((__m128i*)E_code_ptr);
|
||||
|
||||
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
|
||||
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
|
||||
|
||||
E_code_acc = _mm_add_ps (E_code_acc, output_ps_1);
|
||||
E_code_acc = _mm_add_ps (E_code_acc, output_ps_2);
|
||||
E_code_acc = _mm_add_ps (E_code_acc, output_ps);
|
||||
|
||||
//Get prompt values
|
||||
y = _mm_lddqu_si128((__m128i*)P_code_ptr);
|
||||
|
||||
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
|
||||
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
|
||||
|
||||
P_code_acc = _mm_add_ps (P_code_acc, output_ps_1);
|
||||
P_code_acc = _mm_add_ps (P_code_acc, output_ps_2);
|
||||
P_code_acc = _mm_add_ps (P_code_acc, output_ps);
|
||||
|
||||
//Get late values
|
||||
y = _mm_lddqu_si128((__m128i*)L_code_ptr);
|
||||
|
||||
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
|
||||
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
|
||||
|
||||
L_code_acc = _mm_add_ps (L_code_acc, output_ps_1);
|
||||
L_code_acc = _mm_add_ps (L_code_acc, output_ps_2);
|
||||
L_code_acc = _mm_add_ps (L_code_acc, output_ps);
|
||||
|
||||
input_ptr += 8;
|
||||
carrier_ptr += 8;
|
||||
@ -365,8 +362,8 @@ static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_a_sse4_1(lv_32fc_t* E
|
||||
__m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output;
|
||||
|
||||
__m128 E_code_acc, P_code_acc, L_code_acc;
|
||||
__m128i input_i_1, input_i_2, output_i32;
|
||||
__m128 output_ps_1, output_ps_2;
|
||||
__m128i input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2;
|
||||
__m128 output_ps;
|
||||
|
||||
const lv_8sc_t* input_ptr = input;
|
||||
const lv_8sc_t* carrier_ptr = carrier;
|
||||
@ -404,26 +401,23 @@ static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_a_sse4_1(lv_32fc_t* E
|
||||
//Get early values
|
||||
y = _mm_load_si128((__m128i*)E_code_ptr);
|
||||
|
||||
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
|
||||
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
|
||||
|
||||
E_code_acc = _mm_add_ps (E_code_acc, output_ps_1);
|
||||
E_code_acc = _mm_add_ps (E_code_acc, output_ps_2);
|
||||
E_code_acc = _mm_add_ps (E_code_acc, output_ps);
|
||||
|
||||
//Get prompt values
|
||||
y = _mm_load_si128((__m128i*)P_code_ptr);
|
||||
|
||||
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
|
||||
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
|
||||
|
||||
P_code_acc = _mm_add_ps (P_code_acc, output_ps_1);
|
||||
P_code_acc = _mm_add_ps (P_code_acc, output_ps_2);
|
||||
P_code_acc = _mm_add_ps (P_code_acc, output_ps);
|
||||
|
||||
//Get late values
|
||||
y = _mm_load_si128((__m128i*)L_code_ptr);
|
||||
|
||||
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
|
||||
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
|
||||
|
||||
L_code_acc = _mm_add_ps (L_code_acc, output_ps_1);
|
||||
L_code_acc = _mm_add_ps (L_code_acc, output_ps_2);
|
||||
L_code_acc = _mm_add_ps (L_code_acc, output_ps);
|
||||
|
||||
input_ptr += 8;
|
||||
carrier_ptr += 8;
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*!
|
||||
* \file volk_gnsssdr_8ic_x7_cw_epl_corr_32fc_x5.h
|
||||
* \brief Volk protokernel: performs the carrier wipe-off mixing and the Very early, Early, Prompt, Late and very late correlation with 16 bits vectors, and accumulates the results into float32. In order to avoid overflow, input and carrier must be values between —1 and 1 (1 bit) and XX_code must be values between —7 and 7 (3 bits).
|
||||
* \brief Volk protokernel: performs the carrier wipe-off mixing and the Very early, Early, Prompt, Late and very late correlation with 16 bits vectors, and accumulates the results into float32. In order to avoid overflow, if input, carrier and XX_code have the same number of bits, they must be values between —3 and 3 (2 bits).
|
||||
* \authors <ul>
|
||||
* <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
|
||||
* </ul>
|
||||
@ -37,6 +37,7 @@
|
||||
* XX_out16 = XX_code*bb_signal_sample -> 17 bits limited to 8 bits = XX_code and bb_signal_sample must be values between —7 and 7 to avoid overflow (3 bits)
|
||||
*
|
||||
* conclusion = input and carrier must be values between —1 and 1 (1 bit) and XX_code must be values between —7 and 7 to avoid overflow (3 bits)
|
||||
* If input, carrier and XX_code have the same number of bits, they must be values between —3 and 3 to avoid overflow (2 bits).
|
||||
* -------------------------------------------------------------------------
|
||||
*
|
||||
* Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
|
||||
@ -99,8 +100,8 @@ static inline void volk_gnsssdr_8ic_x7_cw_epl_corr_32fc_x5_u_sse4_1(lv_32fc_t* V
|
||||
__m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output;
|
||||
|
||||
__m128 VE_code_acc, E_code_acc, P_code_acc, L_code_acc, VL_code_acc;
|
||||
__m128i input_i_1, input_i_2, output_i32;
|
||||
__m128 output_ps_1, output_ps_2;
|
||||
__m128i input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2;
|
||||
__m128 output_ps;
|
||||
|
||||
const lv_8sc_t* input_ptr = input;
|
||||
const lv_8sc_t* carrier_ptr = carrier;
|
||||
@ -146,42 +147,37 @@ static inline void volk_gnsssdr_8ic_x7_cw_epl_corr_32fc_x5_u_sse4_1(lv_32fc_t* V
|
||||
//Get very early values
|
||||
y = _mm_lddqu_si128((__m128i*)VE_code_ptr);
|
||||
|
||||
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
|
||||
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
|
||||
|
||||
VE_code_acc = _mm_add_ps (VE_code_acc, output_ps_1);
|
||||
VE_code_acc = _mm_add_ps (VE_code_acc, output_ps_2);
|
||||
VE_code_acc = _mm_add_ps (VE_code_acc, output_ps);
|
||||
|
||||
//Get early values
|
||||
y = _mm_lddqu_si128((__m128i*)E_code_ptr);
|
||||
|
||||
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
|
||||
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
|
||||
|
||||
E_code_acc = _mm_add_ps (E_code_acc, output_ps_1);
|
||||
E_code_acc = _mm_add_ps (E_code_acc, output_ps_2);
|
||||
E_code_acc = _mm_add_ps (E_code_acc, output_ps);
|
||||
|
||||
//Get prompt values
|
||||
y = _mm_lddqu_si128((__m128i*)P_code_ptr);
|
||||
|
||||
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
|
||||
|
||||
P_code_acc = _mm_add_ps (P_code_acc, output_ps_1);
|
||||
P_code_acc = _mm_add_ps (P_code_acc, output_ps_2);
|
||||
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
|
||||
|
||||
P_code_acc = _mm_add_ps (P_code_acc, output_ps);
|
||||
|
||||
//Get late values
|
||||
y = _mm_lddqu_si128((__m128i*)L_code_ptr);
|
||||
|
||||
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
|
||||
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
|
||||
|
||||
L_code_acc = _mm_add_ps (L_code_acc, output_ps_1);
|
||||
L_code_acc = _mm_add_ps (L_code_acc, output_ps_2);
|
||||
L_code_acc = _mm_add_ps (L_code_acc, output_ps);
|
||||
|
||||
//Get very late values
|
||||
y = _mm_lddqu_si128((__m128i*)VL_code_ptr);
|
||||
|
||||
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
|
||||
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
|
||||
|
||||
VL_code_acc = _mm_add_ps (VL_code_acc, output_ps_1);
|
||||
VL_code_acc = _mm_add_ps (VL_code_acc, output_ps_2);
|
||||
VL_code_acc = _mm_add_ps (VL_code_acc, output_ps);
|
||||
|
||||
input_ptr += 8;
|
||||
carrier_ptr += 8;
|
||||
@ -471,8 +467,8 @@ static inline void volk_gnsssdr_8ic_x7_cw_epl_corr_32fc_x5_a_sse4_1(lv_32fc_t* V
|
||||
__m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output;
|
||||
|
||||
__m128 VE_code_acc, E_code_acc, P_code_acc, L_code_acc, VL_code_acc;
|
||||
__m128i input_i_1, input_i_2, output_i32;
|
||||
__m128 output_ps_1, output_ps_2;
|
||||
__m128i input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2;
|
||||
__m128 output_ps;
|
||||
|
||||
const lv_8sc_t* input_ptr = input;
|
||||
const lv_8sc_t* carrier_ptr = carrier;
|
||||
@ -518,42 +514,37 @@ static inline void volk_gnsssdr_8ic_x7_cw_epl_corr_32fc_x5_a_sse4_1(lv_32fc_t* V
|
||||
//Get very early values
|
||||
y = _mm_load_si128((__m128i*)VE_code_ptr);
|
||||
|
||||
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
|
||||
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
|
||||
|
||||
VE_code_acc = _mm_add_ps (VE_code_acc, output_ps_1);
|
||||
VE_code_acc = _mm_add_ps (VE_code_acc, output_ps_2);
|
||||
VE_code_acc = _mm_add_ps (VE_code_acc, output_ps);
|
||||
|
||||
//Get early values
|
||||
y = _mm_load_si128((__m128i*)E_code_ptr);
|
||||
|
||||
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
|
||||
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
|
||||
|
||||
E_code_acc = _mm_add_ps (E_code_acc, output_ps_1);
|
||||
E_code_acc = _mm_add_ps (E_code_acc, output_ps_2);
|
||||
E_code_acc = _mm_add_ps (E_code_acc, output_ps);
|
||||
|
||||
//Get prompt values
|
||||
y = _mm_load_si128((__m128i*)P_code_ptr);
|
||||
|
||||
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
|
||||
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
|
||||
|
||||
P_code_acc = _mm_add_ps (P_code_acc, output_ps_1);
|
||||
P_code_acc = _mm_add_ps (P_code_acc, output_ps_2);
|
||||
P_code_acc = _mm_add_ps (P_code_acc, output_ps);
|
||||
|
||||
//Get late values
|
||||
y = _mm_load_si128((__m128i*)L_code_ptr);
|
||||
|
||||
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
|
||||
|
||||
L_code_acc = _mm_add_ps (L_code_acc, output_ps_1);
|
||||
L_code_acc = _mm_add_ps (L_code_acc, output_ps_2);
|
||||
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
|
||||
|
||||
L_code_acc = _mm_add_ps (L_code_acc, output_ps);
|
||||
|
||||
//Get very late values
|
||||
y = _mm_load_si128((__m128i*)VL_code_ptr);
|
||||
|
||||
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
|
||||
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
|
||||
|
||||
VL_code_acc = _mm_add_ps (VL_code_acc, output_ps_1);
|
||||
VL_code_acc = _mm_add_ps (VL_code_acc, output_ps_2);
|
||||
VL_code_acc = _mm_add_ps (VL_code_acc, output_ps);
|
||||
|
||||
input_ptr += 8;
|
||||
carrier_ptr += 8;
|
||||
|
@ -50,6 +50,7 @@ VOLK_RUN_TESTS(volk_gnsssdr_8i_max_s8i, 3, 0, 20462, 1);
|
||||
VOLK_RUN_TESTS(volk_gnsssdr_64f_accumulator_64f, 3, 0, 20462, 1);
|
||||
|
||||
VOLK_RUN_TESTS(volk_gnsssdr_32fc_convert_16ic, 3, 0, 20462, 1);
|
||||
VOLK_RUN_TESTS(volk_gnsssdr_32fc_convert_8ic, 3, 0, 20462, 1);
|
||||
VOLK_RUN_TESTS(volk_gnsssdr_16i_s32f_convert_32f, 3, 0, 20462, 1);
|
||||
|
||||
VOLK_RUN_TESTS(volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3, 1e-4, 0, 20462, 1);
|
||||
|
Loading…
Reference in New Issue
Block a user