Mirror of https://github.com/gnss-sdr/gnss-sdr (synced 2025-02-19 04:20:08 +00:00)

Commit 51427f046a: Code cleaning
Parent: f7f495175e
@@ -2,7 +2,7 @@
 * \file volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3.h
 * \brief Volk protokernel: performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation with 32 bits vectors
 * \authors <ul>
 *          <li> Andres Cecilia, 2014. a.cecilia.luque(at)gmail.com
 *          </ul>
 *
 * Volk protokernel that performs the carrier wipe-off mixing and the
@@ -30,7 +30,7 @@
 * GNSS-SDR is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * GNSS-SDR is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -56,8 +56,8 @@
#include <smmintrin.h>
#include "CommonMacros/CommonMacros_16ic_cw_epl_corr_32fc.h"
#include "CommonMacros/CommonMacros.h"

/*! \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation
    \param input The input signal input
    \param carrier The carrier signal input
    \param E_code Early PRN code replica input
@@ -67,142 +67,141 @@
    \param P_out Prompt correlation output
    \param L_out Late correlation output
    \param num_points The number of complex values in vectors
*/
static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_u_sse4_1(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points)
{
    const unsigned int sse_iters = num_points / 8;

    __m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample;
    __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output;

    __m128 real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc;
    __m128i input_i_1, input_i_2, output_i32;
    __m128 real_output_ps, imag_output_ps;

    float E_out_real = 0;
    float E_out_imag = 0;
    float P_out_real = 0;
    float P_out_imag = 0;
    float L_out_real = 0;
    float L_out_imag = 0;

    const lv_16sc_t* input_ptr = input;
    const lv_16sc_t* carrier_ptr = carrier;

    const lv_16sc_t* E_code_ptr = E_code;
    lv_32fc_t* E_out_ptr = E_out;
    const lv_16sc_t* L_code_ptr = L_code;
    lv_32fc_t* L_out_ptr = L_out;
    const lv_16sc_t* P_code_ptr = P_code;
    lv_32fc_t* P_out_ptr = P_out;

    *E_out_ptr = 0;
    *P_out_ptr = 0;
    *L_out_ptr = 0;

    mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);

    real_E_code_acc = _mm_setzero_ps();
    imag_E_code_acc = _mm_setzero_ps();
    real_P_code_acc = _mm_setzero_ps();
    imag_P_code_acc = _mm_setzero_ps();
    real_L_code_acc = _mm_setzero_ps();
    imag_L_code_acc = _mm_setzero_ps();

    if (sse_iters > 0)
    {
        for (unsigned int number = 0; number < sse_iters; number++)
        {
            // Perform the carrier wipe-off
            x1 = _mm_lddqu_si128((__m128i*)input_ptr);
            input_ptr += 4;
            x2 = _mm_lddqu_si128((__m128i*)input_ptr);

            y1 = _mm_lddqu_si128((__m128i*)carrier_ptr);
            carrier_ptr += 4;
            y2 = _mm_lddqu_si128((__m128i*)carrier_ptr);

            CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(x1, x2, realx, imagx)
            CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy)
            CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)

            // Get early values
            y1 = _mm_lddqu_si128((__m128i*)E_code_ptr);
            E_code_ptr += 4;
            y2 = _mm_lddqu_si128((__m128i*)E_code_ptr);

            CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)

            // Adds the float 32 results
            real_E_code_acc = _mm_add_ps(real_E_code_acc, real_output_ps);
            imag_E_code_acc = _mm_add_ps(imag_E_code_acc, imag_output_ps);

            // Get prompt values
            y1 = _mm_lddqu_si128((__m128i*)P_code_ptr);
            P_code_ptr += 4;
            y2 = _mm_lddqu_si128((__m128i*)P_code_ptr);

            CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)

            real_P_code_acc = _mm_add_ps(real_P_code_acc, real_output_ps);
            imag_P_code_acc = _mm_add_ps(imag_P_code_acc, imag_output_ps);

            // Get late values
            y1 = _mm_lddqu_si128((__m128i*)L_code_ptr);
            L_code_ptr += 4;
            y2 = _mm_lddqu_si128((__m128i*)L_code_ptr);

            CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)

            real_L_code_acc = _mm_add_ps(real_L_code_acc, real_output_ps);
            imag_L_code_acc = _mm_add_ps(imag_L_code_acc, imag_output_ps);

            input_ptr += 4;
            carrier_ptr += 4;
            E_code_ptr += 4;
            P_code_ptr += 4;
            L_code_ptr += 4;
        }

        __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4];
        __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4];
        __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4];
        __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4];
        __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4];
        __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4];

        // Store the results back into the dot product vectors
        _mm_storeu_ps((float*)real_E_dotProductVector, real_E_code_acc);
        _mm_storeu_ps((float*)imag_E_dotProductVector, imag_E_code_acc);
        _mm_storeu_ps((float*)real_P_dotProductVector, real_P_code_acc);
        _mm_storeu_ps((float*)imag_P_dotProductVector, imag_P_code_acc);
        _mm_storeu_ps((float*)real_L_dotProductVector, real_L_code_acc);
        _mm_storeu_ps((float*)imag_L_dotProductVector, imag_L_code_acc);

        for (int i = 0; i < 4; ++i)
        {
            E_out_real += real_E_dotProductVector[i];
            E_out_imag += imag_E_dotProductVector[i];
            P_out_real += real_P_dotProductVector[i];
            P_out_imag += imag_P_dotProductVector[i];
            L_out_real += real_L_dotProductVector[i];
            L_out_imag += imag_L_dotProductVector[i];
        }
        *E_out_ptr = lv_cmake(E_out_real, E_out_imag);
        *P_out_ptr = lv_cmake(P_out_real, P_out_imag);
        *L_out_ptr = lv_cmake(L_out_real, L_out_imag);
    }

    lv_16sc_t bb_signal_sample;
    for (unsigned int i = 0; i < num_points % 8; ++i)
    {
        // Perform the carrier wipe-off
        bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
        // Now get early, late, and prompt values for each
        *E_out_ptr += (lv_32fc_t)(bb_signal_sample * (*E_code_ptr++));
        *P_out_ptr += (lv_32fc_t)(bb_signal_sample * (*P_code_ptr++));
        *L_out_ptr += (lv_32fc_t)(bb_signal_sample * (*L_code_ptr++));
    }
}
#endif /* LV_HAVE_SSE4_1 */
@@ -225,30 +224,31 @@ static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_generic(lv_32fc_t* E
    lv_16sc_t tmp1;
    lv_16sc_t tmp2;
    lv_16sc_t tmp3;

    bb_signal_sample = lv_cmake(0, 0);

    *E_out = 0;
    *P_out = 0;
    *L_out = 0;
    // perform Early, Prompt and Late correlation
    for (unsigned int i = 0; i < num_points; ++i)
    {
        // Perform the carrier wipe-off
        bb_signal_sample = input[i] * carrier[i];

        tmp1 = bb_signal_sample * E_code[i];
        tmp2 = bb_signal_sample * P_code[i];
        tmp3 = bb_signal_sample * L_code[i];

        // Now get early, late, and prompt values for each
        *E_out += (lv_32fc_t)tmp1;
        *P_out += (lv_32fc_t)tmp2;
        *L_out += (lv_32fc_t)tmp3;
    }
}
#endif /* LV_HAVE_GENERIC */

#endif /* INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_u_H */
@@ -280,138 +280,137 @@ static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_generic(lv_32fc_t* E
static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_a_sse4_1(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points)
{
    const unsigned int sse_iters = num_points / 8;

    __m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample;
    __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output;

    __m128 real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc;
    __m128i input_i_1, input_i_2, output_i32;
    __m128 real_output_ps, imag_output_ps;

    float E_out_real = 0;
    float E_out_imag = 0;
    float P_out_real = 0;
    float P_out_imag = 0;
    float L_out_real = 0;
    float L_out_imag = 0;

    const lv_16sc_t* input_ptr = input;
    const lv_16sc_t* carrier_ptr = carrier;

    const lv_16sc_t* E_code_ptr = E_code;
    lv_32fc_t* E_out_ptr = E_out;
    const lv_16sc_t* L_code_ptr = L_code;
    lv_32fc_t* L_out_ptr = L_out;
    const lv_16sc_t* P_code_ptr = P_code;
    lv_32fc_t* P_out_ptr = P_out;

    *E_out_ptr = 0;
    *P_out_ptr = 0;
    *L_out_ptr = 0;

    mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);

    real_E_code_acc = _mm_setzero_ps();
    imag_E_code_acc = _mm_setzero_ps();
    real_P_code_acc = _mm_setzero_ps();
    imag_P_code_acc = _mm_setzero_ps();
    real_L_code_acc = _mm_setzero_ps();
    imag_L_code_acc = _mm_setzero_ps();

    if (sse_iters > 0)
    {
        for (unsigned int number = 0; number < sse_iters; number++)
        {
            // Perform the carrier wipe-off
            x1 = _mm_load_si128((__m128i*)input_ptr);
            input_ptr += 4;
            x2 = _mm_load_si128((__m128i*)input_ptr);

            y1 = _mm_load_si128((__m128i*)carrier_ptr);
            carrier_ptr += 4;
            y2 = _mm_load_si128((__m128i*)carrier_ptr);

            CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(x1, x2, realx, imagx)
            CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy)
            CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)

            // Get early values
            y1 = _mm_load_si128((__m128i*)E_code_ptr);
            E_code_ptr += 4;
            y2 = _mm_load_si128((__m128i*)E_code_ptr);

            CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)

            // Adds the float 32 results
            real_E_code_acc = _mm_add_ps(real_E_code_acc, real_output_ps);
            imag_E_code_acc = _mm_add_ps(imag_E_code_acc, imag_output_ps);

            // Get prompt values
            y1 = _mm_load_si128((__m128i*)P_code_ptr);
            P_code_ptr += 4;
            y2 = _mm_load_si128((__m128i*)P_code_ptr);

            CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)

            real_P_code_acc = _mm_add_ps(real_P_code_acc, real_output_ps);
            imag_P_code_acc = _mm_add_ps(imag_P_code_acc, imag_output_ps);

            // Get late values
            y1 = _mm_load_si128((__m128i*)L_code_ptr);
            L_code_ptr += 4;
            y2 = _mm_load_si128((__m128i*)L_code_ptr);

            CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)

            real_L_code_acc = _mm_add_ps(real_L_code_acc, real_output_ps);
            imag_L_code_acc = _mm_add_ps(imag_L_code_acc, imag_output_ps);

            input_ptr += 4;
            carrier_ptr += 4;
            E_code_ptr += 4;
            P_code_ptr += 4;
            L_code_ptr += 4;
        }

        __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4];
        __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4];
        __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4];
        __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4];
        __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4];
        __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4];

        // Store the results back into the dot product vectors
        _mm_store_ps((float*)real_E_dotProductVector, real_E_code_acc);
        _mm_store_ps((float*)imag_E_dotProductVector, imag_E_code_acc);
        _mm_store_ps((float*)real_P_dotProductVector, real_P_code_acc);
        _mm_store_ps((float*)imag_P_dotProductVector, imag_P_code_acc);
        _mm_store_ps((float*)real_L_dotProductVector, real_L_code_acc);
        _mm_store_ps((float*)imag_L_dotProductVector, imag_L_code_acc);

        for (int i = 0; i < 4; ++i)
        {
            E_out_real += real_E_dotProductVector[i];
            E_out_imag += imag_E_dotProductVector[i];
            P_out_real += real_P_dotProductVector[i];
            P_out_imag += imag_P_dotProductVector[i];
            L_out_real += real_L_dotProductVector[i];
            L_out_imag += imag_L_dotProductVector[i];
        }
        *E_out_ptr = lv_cmake(E_out_real, E_out_imag);
        *P_out_ptr = lv_cmake(P_out_real, P_out_imag);
        *L_out_ptr = lv_cmake(L_out_real, L_out_imag);
    }

    lv_16sc_t bb_signal_sample;
    for (unsigned int i = 0; i < num_points % 8; ++i)
    {
        // Perform the carrier wipe-off
        bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
        // Now get early, late, and prompt values for each
        *E_out_ptr += (lv_32fc_t)(bb_signal_sample * (*E_code_ptr++));
        *P_out_ptr += (lv_32fc_t)(bb_signal_sample * (*P_code_ptr++));
        *L_out_ptr += (lv_32fc_t)(bb_signal_sample * (*L_code_ptr++));
    }
}
#endif /* LV_HAVE_SSE4_1 */
@@ -434,28 +433,29 @@ static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_a_generic(lv_32fc_t*
    lv_16sc_t tmp1;
    lv_16sc_t tmp2;
    lv_16sc_t tmp3;

    bb_signal_sample = lv_cmake(0, 0);

    *E_out = 0;
    *P_out = 0;
    *L_out = 0;
    // perform Early, Prompt and Late correlation
    for (unsigned int i = 0; i < num_points; ++i)
    {
        // Perform the carrier wipe-off
        bb_signal_sample = input[i] * carrier[i];

        tmp1 = bb_signal_sample * E_code[i];
        tmp2 = bb_signal_sample * P_code[i];
        tmp3 = bb_signal_sample * L_code[i];

        // Now get early, late, and prompt values for each
        *E_out += (lv_32fc_t)tmp1;
        *P_out += (lv_32fc_t)tmp2;
        *L_out += (lv_32fc_t)tmp3;
    }
}
#endif /* LV_HAVE_GENERIC */

#endif /* INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_a_H */
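
For context, a minimal usage sketch of the Early-Prompt-Late correlator defined in this header; the wrapper function, the buffer ownership, and the direct call to the generic protokernel are illustrative assumptions and are not part of the commit:

#include "volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3.h"  /* assumes the header is usable stand-alone */

/* Hypothetical driver: correlate num_points baseband samples against the
   three local code replicas and collect one complex accumulator per arm. */
static void example_epl_correlation(lv_32fc_t* E, lv_32fc_t* P, lv_32fc_t* L,
                                    const lv_16sc_t* input, const lv_16sc_t* carrier,
                                    const lv_16sc_t* e_code, const lv_16sc_t* p_code,
                                    const lv_16sc_t* l_code, unsigned int num_points)
{
    /* The generic implementation gives the reference result; the SSE4.1
       variants above compute the same accumulation eight samples at a time. */
    volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_generic(E, P, L, input, carrier,
                                                     e_code, p_code, l_code, num_points);
}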
File diff suppressed because it is too large.

@@ -2,7 +2,7 @@
 * \file volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5.h
 * \brief Volk protokernel: performs the carrier wipe-off mixing and the Very early, Early, Prompt, Late and very late correlation with 32 bits vectors and returns float32 values.
 * \authors <ul>
 *          <li> Andres Cecilia, 2014. a.cecilia.luque(at)gmail.com
 *          </ul>
 *
 * Volk protokernel that performs the carrier wipe-off mixing and the
@@ -34,7 +34,7 @@
 * GNSS-SDR is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * GNSS-SDR is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -2,7 +2,7 @@
 * \file volk_gnsssdr_32fc_convert_16ic.h
 * \brief Volk protokernel: converts float32 complex values to 16 integer complex values taking care of overflow
 * \authors <ul>
 *          <li> Andres Cecilia, 2014. a.cecilia.luque(at)gmail.com
 *          </ul>
 *
 * -------------------------------------------------------------------------
@@ -17,7 +17,7 @@
 * GNSS-SDR is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * GNSS-SDR is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -47,43 +47,44 @@
*/
static inline void volk_gnsssdr_32fc_convert_16ic_u_sse2(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
{
    const unsigned int sse_iters = num_points / 4;

    float* inputVectorPtr = (float*)inputVector;
    int16_t* outputVectorPtr = (int16_t*)outputVector;

    float min_val = -32768;
    float max_val = 32767;

    __m128 inputVal1, inputVal2;
    __m128i intInputVal1, intInputVal2;
    __m128 ret1, ret2;
    __m128 vmin_val = _mm_set_ps1(min_val);
    __m128 vmax_val = _mm_set_ps1(max_val);

    for (unsigned int i = 0; i < sse_iters; i++)
    {
        inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
        inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;

        // Clip
        ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
        ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);

        intInputVal1 = _mm_cvtps_epi32(ret1);
        intInputVal2 = _mm_cvtps_epi32(ret2);

        intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);

        _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
        outputVectorPtr += 8;
    }

    for (unsigned int i = 0; i < (num_points % 4) * 2; i++)
    {
        if (inputVectorPtr[i] > max_val)
            inputVectorPtr[i] = max_val;
        else if (inputVectorPtr[i] < min_val)
            inputVectorPtr[i] = min_val;
        outputVectorPtr[i] = (int16_t)rintf(inputVectorPtr[i]);
    }
}
#endif /* LV_HAVE_SSE2 */
@@ -97,43 +98,44 @@ static inline void volk_gnsssdr_32fc_convert_16ic_u_sse2(lv_16sc_t* outputVector
*/
static inline void volk_gnsssdr_32fc_convert_16ic_u_sse(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
{
    const unsigned int sse_iters = num_points / 4;

    float* inputVectorPtr = (float*)inputVector;
    int16_t* outputVectorPtr = (int16_t*)outputVector;

    float min_val = -32768;
    float max_val = 32767;

    __m128 inputVal1, inputVal2;
    __m128i intInputVal1, intInputVal2;
    __m128 ret1, ret2;
    __m128 vmin_val = _mm_set_ps1(min_val);
    __m128 vmax_val = _mm_set_ps1(max_val);

    for (unsigned int i = 0; i < sse_iters; i++)
    {
        inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
        inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;

        // Clip
        ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
        ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);

        intInputVal1 = _mm_cvtps_epi32(ret1);
        intInputVal2 = _mm_cvtps_epi32(ret2);

        intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);

        _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
        outputVectorPtr += 8;
    }

    for (unsigned int i = 0; i < (num_points % 4) * 2; i++)
    {
        if (inputVectorPtr[i] > max_val)
            inputVectorPtr[i] = max_val;
        else if (inputVectorPtr[i] < min_val)
            inputVectorPtr[i] = min_val;
        outputVectorPtr[i] = (int16_t)rintf(inputVectorPtr[i]);
    }
}
#endif /* LV_HAVE_SSE */
@@ -149,14 +151,15 @@ static inline void volk_gnsssdr_32fc_convert_16ic_generic(lv_16sc_t* outputVecto
    int16_t* outputVectorPtr = (int16_t*)outputVector;
    float min_val = -32768;
    float max_val = 32767;

    for (unsigned int i = 0; i < num_points * 2; i++)
    {
        if (inputVectorPtr[i] > max_val)
            inputVectorPtr[i] = max_val;
        else if (inputVectorPtr[i] < min_val)
            inputVectorPtr[i] = min_val;
        outputVectorPtr[i] = (int16_t)rintf(inputVectorPtr[i]);
    }
}
#endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_volk_gnsssdr_32fc_convert_16ic_u_H */
@@ -180,43 +183,45 @@ static inline void volk_gnsssdr_32fc_convert_16ic_generic(lv_16sc_t* outputVecto
*/
static inline void volk_gnsssdr_32fc_convert_16ic_a_sse2(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
{
    const unsigned int sse_iters = num_points / 4;

    float* inputVectorPtr = (float*)inputVector;
    int16_t* outputVectorPtr = (int16_t*)outputVector;

    float min_val = -32768;
    float max_val = 32767;

    __m128 inputVal1, inputVal2;
    __m128i intInputVal1, intInputVal2;
    __m128 ret1, ret2;
    __m128 vmin_val = _mm_set_ps1(min_val);
    __m128 vmax_val = _mm_set_ps1(max_val);

    for (unsigned int i = 0; i < sse_iters; i++)
    {
        inputVal1 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
        inputVal2 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;

        // Clip
        ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
        ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);

        intInputVal1 = _mm_cvtps_epi32(ret1);
        intInputVal2 = _mm_cvtps_epi32(ret2);

        intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);

        _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
        outputVectorPtr += 8;
    }

    for (unsigned int i = 0; i < (num_points % 4) * 2; i++)
    {
        if (inputVectorPtr[i] > max_val)
            inputVectorPtr[i] = max_val;
        else if (inputVectorPtr[i] < min_val)
            inputVectorPtr[i] = min_val;
        outputVectorPtr[i] = (int16_t)rintf(inputVectorPtr[i]);
    }
}
#endif /* LV_HAVE_SSE2 */
@@ -228,45 +233,48 @@ static inline void volk_gnsssdr_32fc_convert_16ic_a_sse2(lv_16sc_t* outputVector
    \param outputVector The 16 bit output data buffer
    \param num_points The number of data values to be converted
*/
static inline void volk_gnsssdr_32fc_convert_16ic_a_sse(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
{
    const unsigned int sse_iters = num_points / 4;

    float* inputVectorPtr = (float*)inputVector;
    int16_t* outputVectorPtr = (int16_t*)outputVector;

    float min_val = -32768;
    float max_val = 32767;

    __m128 inputVal1, inputVal2;
    __m128i intInputVal1, intInputVal2;
    __m128 ret1, ret2;
    __m128 vmin_val = _mm_set_ps1(min_val);
    __m128 vmax_val = _mm_set_ps1(max_val);

    for (unsigned int i = 0; i < sse_iters; i++)
    {
        inputVal1 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
        inputVal2 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;

        // Clip
        ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
        ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);

        intInputVal1 = _mm_cvtps_epi32(ret1);
        intInputVal2 = _mm_cvtps_epi32(ret2);

        intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);

        _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
        outputVectorPtr += 8;
    }

    for (unsigned int i = 0; i < (num_points % 4) * 2; i++)
    {
        if (inputVectorPtr[i] > max_val)
            inputVectorPtr[i] = max_val;
        else if (inputVectorPtr[i] < min_val)
            inputVectorPtr[i] = min_val;
        outputVectorPtr[i] = (int16_t)rintf(inputVectorPtr[i]);
    }
}
#endif /* LV_HAVE_SSE */
@@ -277,19 +285,21 @@ static inline void volk_gnsssdr_32fc_convert_16ic_a_sse(lv_16sc_t* outputVector,
    \param outputVector The 16 bit output data buffer
    \param num_points The number of data values to be converted
*/
static inline void volk_gnsssdr_32fc_convert_16ic_a_generic(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
{
    float* inputVectorPtr = (float*)inputVector;
    int16_t* outputVectorPtr = (int16_t*)outputVector;
    float min_val = -32768;
    float max_val = 32767;

    for (unsigned int i = 0; i < num_points * 2; i++)
    {
        if (inputVectorPtr[i] > max_val)
            inputVectorPtr[i] = max_val;
        else if (inputVectorPtr[i] < min_val)
            inputVectorPtr[i] = min_val;
        outputVectorPtr[i] = (int16_t)rintf(inputVectorPtr[i]);
    }
}
#endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_volk_gnsssdr_32fc_convert_16ic_a_H */
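
A small illustration of what the conversion kernels in this header compute; the include, the sample values, and the expected results (which assume the default round-to-nearest mode) are illustrative and not part of the commit:

#include "volk_gnsssdr_32fc_convert_16ic.h"  /* assumed to provide lv_32fc_t, lv_16sc_t and lv_cmake */

/* Hypothetical check: components outside [-32768, 32767] are clipped before
   rounding, so 40000.7f saturates to 32767 and -70000.0f to -32768. */
static void example_convert_16ic(void)
{
    lv_32fc_t in[2] = { lv_cmake(40000.7f, -0.4f), lv_cmake(1.5f, -70000.0f) };
    lv_16sc_t out[2];
    volk_gnsssdr_32fc_convert_16ic_generic(out, in, 2);
    /* Expected: out[0] = (32767, 0) and out[1] = (2, -32768); note that the
       generic kernel also clips the input buffer in place. */
}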
@@ -2,7 +2,7 @@
 * \file volk_gnsssdr_32fc_convert_8ic.h
 * \brief Volk protokernel: converts float32 complex values to 8 integer complex values taking care of overflow
 * \authors <ul>
 *          <li> Andres Cecilia, 2014. a.cecilia.luque(at)gmail.com
 *          </ul>
 *
 * -------------------------------------------------------------------------
@@ -17,7 +17,7 @@
 * GNSS-SDR is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * GNSS-SDR is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -45,54 +45,57 @@
    \param outputVector The 8 bit output data buffer
    \param num_points The number of data values to be converted
*/
static inline void volk_gnsssdr_32fc_convert_8ic_u_sse2(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
{
    const unsigned int sse_iters = num_points / 8;

    float* inputVectorPtr = (float*)inputVector;
    int8_t* outputVectorPtr = (int8_t*)outputVector;

    float min_val = -128;
    float max_val = 127;

    __m128 inputVal1, inputVal2, inputVal3, inputVal4;
    __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
    __m128i int8InputVal;
    __m128 ret1, ret2, ret3, ret4;
    __m128 vmin_val = _mm_set_ps1(min_val);
    __m128 vmax_val = _mm_set_ps1(max_val);

    for (unsigned int i = 0; i < sse_iters; i++)
    {
        inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
        inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
        inputVal3 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
        inputVal4 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;

        // Clip
        ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
        ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
        ret3 = _mm_max_ps(_mm_min_ps(inputVal3, vmax_val), vmin_val);
        ret4 = _mm_max_ps(_mm_min_ps(inputVal4, vmax_val), vmin_val);

        intInputVal1 = _mm_cvtps_epi32(ret1);
        intInputVal2 = _mm_cvtps_epi32(ret2);
        intInputVal3 = _mm_cvtps_epi32(ret3);
        intInputVal4 = _mm_cvtps_epi32(ret4);

        intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
        intInputVal2 = _mm_packs_epi32(intInputVal3, intInputVal4);
        int8InputVal = _mm_packs_epi16(intInputVal1, intInputVal2);

        _mm_storeu_si128((__m128i*)outputVectorPtr, int8InputVal);
        outputVectorPtr += 16;
    }

    for (unsigned int i = 0; i < (num_points % 4) * 4; i++)
    {
        if (inputVectorPtr[i] > max_val)
            inputVectorPtr[i] = max_val;
        else if (inputVectorPtr[i] < min_val)
            inputVectorPtr[i] = min_val;
        outputVectorPtr[i] = (int8_t)rintf(inputVectorPtr[i]);
    }
}
#endif /* LV_HAVE_SSE2 */
@@ -103,19 +106,21 @@ static inline void volk_gnsssdr_32fc_convert_8ic_u_sse2(lv_8sc_t* outputVector,
    \param outputVector The 8 bit output data buffer
    \param num_points The number of data values to be converted
*/
static inline void volk_gnsssdr_32fc_convert_8ic_generic(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
{
    float* inputVectorPtr = (float*)inputVector;
    int8_t* outputVectorPtr = (int8_t*)outputVector;
    float min_val = -128;
    float max_val = 127;

    for (unsigned int i = 0; i < num_points * 2; i++)
    {
        if (inputVectorPtr[i] > max_val)
            inputVectorPtr[i] = max_val;
        else if (inputVectorPtr[i] < min_val)
            inputVectorPtr[i] = min_val;
        outputVectorPtr[i] = (int8_t)rintf(inputVectorPtr[i]);
    }
}
#endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_volk_gnsssdr_32fc_convert_8ic_u_H */
@@ -137,54 +142,57 @@ static inline void volk_gnsssdr_32fc_convert_8ic_generic(lv_8sc_t* outputVector,
    \param outputVector The 8 bit output data buffer
    \param num_points The number of data values to be converted
*/
static inline void volk_gnsssdr_32fc_convert_8ic_a_sse2(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
{
    const unsigned int sse_iters = num_points / 8;

    float* inputVectorPtr = (float*)inputVector;
    int8_t* outputVectorPtr = (int8_t*)outputVector;

    float min_val = -128;
    float max_val = 127;

    __m128 inputVal1, inputVal2, inputVal3, inputVal4;
    __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
    __m128i int8InputVal;
    __m128 ret1, ret2, ret3, ret4;
    __m128 vmin_val = _mm_set_ps1(min_val);
    __m128 vmax_val = _mm_set_ps1(max_val);

    for (unsigned int i = 0; i < sse_iters; i++)
    {
        inputVal1 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
        inputVal2 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
        inputVal3 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
        inputVal4 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;

        // Clip
        ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
        ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
        ret3 = _mm_max_ps(_mm_min_ps(inputVal3, vmax_val), vmin_val);
        ret4 = _mm_max_ps(_mm_min_ps(inputVal4, vmax_val), vmin_val);

        intInputVal1 = _mm_cvtps_epi32(ret1);
        intInputVal2 = _mm_cvtps_epi32(ret2);
        intInputVal3 = _mm_cvtps_epi32(ret3);
        intInputVal4 = _mm_cvtps_epi32(ret4);

        intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
        intInputVal2 = _mm_packs_epi32(intInputVal3, intInputVal4);
        int8InputVal = _mm_packs_epi16(intInputVal1, intInputVal2);

        _mm_store_si128((__m128i*)outputVectorPtr, int8InputVal);
        outputVectorPtr += 16;
    }

    for (unsigned int i = 0; i < (num_points % 4) * 4; i++)
    {
        if (inputVectorPtr[i] > max_val)
            inputVectorPtr[i] = max_val;
        else if (inputVectorPtr[i] < min_val)
            inputVectorPtr[i] = min_val;
        outputVectorPtr[i] = (int8_t)rintf(inputVectorPtr[i]);
    }
}
#endif /* LV_HAVE_SSE2 */
@@ -195,19 +203,21 @@ static inline void volk_gnsssdr_32fc_convert_8ic_a_sse2(lv_8sc_t* outputVector,
    \param outputVector The 8 bit output data buffer
    \param num_points The number of data values to be converted
*/
static inline void volk_gnsssdr_32fc_convert_8ic_a_generic(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
{
    float* inputVectorPtr = (float*)inputVector;
    int8_t* outputVectorPtr = (int8_t*)outputVector;
    float min_val = -128;
    float max_val = 127;

    for (unsigned int i = 0; i < num_points * 2; i++)
    {
        if (inputVectorPtr[i] > max_val)
            inputVectorPtr[i] = max_val;
        else if (inputVectorPtr[i] < min_val)
            inputVectorPtr[i] = min_val;
        outputVectorPtr[i] = (int8_t)rintf(inputVectorPtr[i]);
    }
}
#endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_volk_gnsssdr_32fc_convert_8ic_a_H */
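
The 8-bit kernels above rely on the fact that _mm_packs_epi32 and _mm_packs_epi16 saturate while narrowing, so the explicit clip to [-128, 127] before _mm_cvtps_epi32 keeps every rounded value in range. A scalar model of one output sample (hypothetical helper, for illustration only):

#include <math.h>
#include <stdint.h>

/* Models one lane of the SSE2 path above: clip, round to nearest, then narrow. */
static int8_t convert_one_sample_model(float x)
{
    if (x > 127.0f) x = 127.0f;      /* _mm_min_ps against vmax_val */
    if (x < -128.0f) x = -128.0f;    /* _mm_max_ps against vmin_val */
    return (int8_t)rintf(x);         /* _mm_cvtps_epi32 followed by the saturating packs */
}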
@@ -2,7 +2,7 @@
 * \file volk_gnsssdr_32fc_s32f_convert_8ic.h
 * \brief Volk protokernel: converts float32 complex values to 8 integer complex values taking care of overflow
 * \authors <ul>
 *          <li> Andres Cecilia, 2014. a.cecilia.luque(at)gmail.com
 *          </ul>
 *
 * -------------------------------------------------------------------------
@@ -17,7 +17,7 @@
 * GNSS-SDR is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * GNSS-SDR is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -45,61 +45,64 @@
    \param outputVector The 8 bit output data buffer
    \param num_points The number of data values to be converted
*/
static inline void volk_gnsssdr_32fc_s32f_convert_8ic_u_sse2(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, const float scalar, unsigned int num_points)
{
    const unsigned int sse_iters = num_points / 8;

    float* inputVectorPtr = (float*)inputVector;
    int8_t* outputVectorPtr = (int8_t*)outputVector;
    __m128 invScalar = _mm_set_ps1(1.0 / scalar);

    float min_val = -128;
    float max_val = 127;

    __m128 inputVal1, inputVal2, inputVal3, inputVal4;
    __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
    __m128i int8InputVal;
    __m128 ret1, ret2, ret3, ret4;
    __m128 vmin_val = _mm_set_ps1(min_val);
    __m128 vmax_val = _mm_set_ps1(max_val);

    for (unsigned int i = 0; i < sse_iters; i++)
    {
        inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
        inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
        inputVal3 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
        inputVal4 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;

        inputVal1 = _mm_mul_ps(inputVal1, invScalar);
        inputVal2 = _mm_mul_ps(inputVal2, invScalar);
        inputVal3 = _mm_mul_ps(inputVal3, invScalar);
        inputVal4 = _mm_mul_ps(inputVal4, invScalar);

        // Clip
        ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
        ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
        ret3 = _mm_max_ps(_mm_min_ps(inputVal3, vmax_val), vmin_val);
        ret4 = _mm_max_ps(_mm_min_ps(inputVal4, vmax_val), vmin_val);

        intInputVal1 = _mm_cvtps_epi32(ret1);
        intInputVal2 = _mm_cvtps_epi32(ret2);
        intInputVal3 = _mm_cvtps_epi32(ret3);
        intInputVal4 = _mm_cvtps_epi32(ret4);

        intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
        intInputVal2 = _mm_packs_epi32(intInputVal3, intInputVal4);
        int8InputVal = _mm_packs_epi16(intInputVal1, intInputVal2);

        _mm_storeu_si128((__m128i*)outputVectorPtr, int8InputVal);
        outputVectorPtr += 16;
    }

    float scaled = 0;
    for (unsigned int i = 0; i < (num_points % 4) * 4; i++)
    {
        scaled = inputVectorPtr[i] / scalar;
        if (scaled > max_val)
            scaled = max_val;
        else if (scaled < min_val)
            scaled = min_val;
        outputVectorPtr[i] = (int8_t)rintf(scaled);
    }
}
#endif /* LV_HAVE_SSE2 */
|
||||
|
||||
@@ -110,21 +113,23 @@ static inline void volk_gnsssdr_32fc_s32f_convert_8ic_u_sse2(lv_8sc_t* outputVec
 \param outputVector The 16 bit output data buffer
 \param num_points The number of data values to be converted
 */
static inline void volk_gnsssdr_32fc_s32f_convert_8ic_generic(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, const float scalar, unsigned int num_points){
static inline void volk_gnsssdr_32fc_s32f_convert_8ic_generic(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, const float scalar, unsigned int num_points)
{
    float* inputVectorPtr = (float*)inputVector;
    int8_t* outputVectorPtr = (int8_t*)outputVector;
    float scaled = 0;
    float min_val = -128;
    float max_val = 127;

    for(unsigned int i = 0; i < num_points*2; i++){
        scaled = (inputVectorPtr[i])/scalar;
        if(scaled > max_val)
            scaled = max_val;
        else if(scaled < min_val)
            scaled = min_val;
        outputVectorPtr[i] = (int8_t)rintf(scaled);
    }

    for(unsigned int i = 0; i < num_points*2; i++)
        {
            scaled = (inputVectorPtr[i])/scalar;
            if(scaled > max_val)
                scaled = max_val;
            else if(scaled < min_val)
                scaled = min_val;
            outputVectorPtr[i] = (int8_t)rintf(scaled);
        }
}
#endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_volk_gnsssdr_32fc_s32f_convert_8ic_u_H */
@@ -146,61 +151,64 @@ static inline void volk_gnsssdr_32fc_s32f_convert_8ic_generic(lv_8sc_t* outputVe
 \param outputVector The 16 bit output data buffer
 \param num_points The number of data values to be converted
 */
static inline void volk_gnsssdr_32fc_s32f_convert_8ic_a_sse2(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, const float scalar, unsigned int num_points){
static inline void volk_gnsssdr_32fc_s32f_convert_8ic_a_sse2(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, const float scalar, unsigned int num_points)
{
    const unsigned int sse_iters = num_points/8;

    float* inputVectorPtr = (float*)inputVector;
    int8_t* outputVectorPtr = (int8_t*)outputVector;
    __m128 invScalar = _mm_set_ps1(1.0/scalar);

    float min_val = -128;
    float max_val = 127;

    __m128 inputVal1, inputVal2, inputVal3, inputVal4;
    __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
    __m128i int8InputVal;
    __m128 ret1, ret2, ret3, ret4;
    __m128 vmin_val = _mm_set_ps1(min_val);
    __m128 vmax_val = _mm_set_ps1(max_val);

    for(unsigned int i = 0;i < sse_iters; i++){
    for(unsigned int i = 0;i < sse_iters; i++)
        {
            inputVal1 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
            inputVal2 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
            inputVal3 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
            inputVal4 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;

            inputVal1 = _mm_mul_ps(inputVal1, invScalar);
            inputVal2 = _mm_mul_ps(inputVal2, invScalar);
            inputVal3 = _mm_mul_ps(inputVal3, invScalar);
            inputVal4 = _mm_mul_ps(inputVal4, invScalar);
            // Clip
            ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
            ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
            ret3 = _mm_max_ps(_mm_min_ps(inputVal3, vmax_val), vmin_val);
            ret4 = _mm_max_ps(_mm_min_ps(inputVal4, vmax_val), vmin_val);

            intInputVal1 = _mm_cvtps_epi32(ret1);
            intInputVal2 = _mm_cvtps_epi32(ret2);
            intInputVal3 = _mm_cvtps_epi32(ret3);
            intInputVal4 = _mm_cvtps_epi32(ret4);

            intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
            intInputVal2 = _mm_packs_epi32(intInputVal3, intInputVal4);
            int8InputVal = _mm_packs_epi16(intInputVal1, intInputVal2);

            _mm_store_si128((__m128i*)outputVectorPtr, int8InputVal);
            outputVectorPtr += 16;
        }

    float scaled = 0;
    for(unsigned int i = 0; i < (num_points%4)*4; i++){
    for(unsigned int i = 0; i < (num_points%4)*4; i++)
        {
            scaled = inputVectorPtr[i]/scalar;
            if(scaled > max_val)
                scaled = max_val;
            else if(scaled < min_val)
                scaled = min_val;
            outputVectorPtr[i] = (int8_t)rintf(scaled);
        }
}
#endif /* LV_HAVE_SSE2 */
@@ -211,21 +219,23 @@ static inline void volk_gnsssdr_32fc_s32f_convert_8ic_a_sse2(lv_8sc_t* outputVec
 \param outputVector The 16 bit output data buffer
 \param num_points The number of data values to be converted
 */
static inline void volk_gnsssdr_32fc_s32f_convert_8ic_a_generic(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, const float scalar, unsigned int num_points){
static inline void volk_gnsssdr_32fc_s32f_convert_8ic_a_generic(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, const float scalar, unsigned int num_points)
{
    float* inputVectorPtr = (float*)inputVector;
    int8_t* outputVectorPtr = (int8_t*)outputVector;
    float scaled = 0;
    float min_val = -128;
    float max_val = 127;

    for(unsigned int i = 0; i < num_points*2; i++){
        scaled = inputVectorPtr[i]/scalar;
        if(scaled > max_val)
            scaled = max_val;
        else if(scaled < min_val)
            scaled = min_val;
        outputVectorPtr[i] = (int8_t)rintf(scaled);
    }

    for(unsigned int i = 0; i < num_points*2; i++)
        {
            scaled = inputVectorPtr[i]/scalar;
            if(scaled > max_val)
                scaled = max_val;
            else if(scaled < min_val)
                scaled = min_val;
            outputVectorPtr[i] = (int8_t)rintf(scaled);
        }
}
#endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_volk_gnsssdr_32fc_s32f_convert_8ic_a_H */

@@ -2,7 +2,7 @@
 * \file volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc
 * \brief Volk protokernel: replaces the tracking function for update_local_code
 * \authors <ul>
 * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
 * <li> Andres Cecilia, 2014. a.cecilia.luque(at)gmail.com
 * </ul>
 *
 * Volk protokernel that replaces the tracking function for update_local_code
@@ -19,7 +19,7 @@
 * GNSS-SDR is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * at your option) any later version.
 * (at your option) any later version.
 *
 * GNSS-SDR is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -43,107 +43,89 @@

#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>
/*!
/*!
 \brief Takes the conjugate of a complex vector.
 \param cVector The vector where the results will be stored
 \param aVector Vector to be conjugated
 \param num_points The number of complex values in aVector to be conjugated and stored into cVector
 */
static inline void volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_u_sse4_1(lv_32fc_t* d_very_early_code, const float d_very_early_late_spc_chips, const float code_length_half_chips, const float code_phase_step_half_chips, const float tcode_half_chips_input, const lv_32fc_t* d_ca_code, unsigned int num_points){

    // float* pointer1 = (float*)&d_very_early_late_spc_chips;
    // *pointer1 = 1;
    // float* pointer2 = (float*)&code_length_half_chips;
    // *pointer2 = 6;
    // float* pointer3 = (float*)&code_phase_step_half_chips;
    // *pointer3 = 7;
    // float* pointer4 = (float*)&tcode_half_chips_input;
    // *pointer4 = 8;

 */
static inline void volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_u_sse4_1(lv_32fc_t* d_very_early_code, const float d_very_early_late_spc_chips, const float code_length_half_chips, const float code_phase_step_half_chips, const float tcode_half_chips_input, const lv_32fc_t* d_ca_code, unsigned int num_points)
{
    const unsigned int sse_iters = num_points / 4;

    __m128 tquot, fmod_num, fmod_result, associated_chip_index_array;

    __m128 tcode_half_chips_array = _mm_set_ps (tcode_half_chips_input+3*code_phase_step_half_chips, tcode_half_chips_input+2*code_phase_step_half_chips, tcode_half_chips_input+code_phase_step_half_chips, tcode_half_chips_input);
    __m128 code_phase_step_half_chips_array = _mm_set1_ps (code_phase_step_half_chips*4);
    __m128 d_very_early_late_spc_chips_Multiplied_by_2 = _mm_set1_ps (2*d_very_early_late_spc_chips);
    __m128 code_length_half_chips_array = _mm_set1_ps (code_length_half_chips);
    __m128 twos = _mm_set1_ps (2);
    __m128i associated_chip_index_array_int;

    __VOLK_ATTR_ALIGNED(16) int32_t output[4];

    for (unsigned int i = 0; i < sse_iters; i++)
        {
            //fmod = numer - tquot * denom; tquot = numer/denom truncated
            //associated_chip_index = 2 + round(fmod(tcode_half_chips - 2*d_very_early_late_spc_chips, code_length_half_chips));
            fmod_num = _mm_sub_ps (tcode_half_chips_array, d_very_early_late_spc_chips_Multiplied_by_2);
            tquot = _mm_div_ps (fmod_num, code_length_half_chips_array);
            tquot = _mm_round_ps (tquot, (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) );
            fmod_result = _mm_sub_ps (fmod_num, _mm_mul_ps (tquot, code_length_half_chips_array));

            associated_chip_index_array = _mm_round_ps (fmod_result, (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC));
            associated_chip_index_array = _mm_add_ps(twos, associated_chip_index_array);
            associated_chip_index_array_int = _mm_cvtps_epi32 (associated_chip_index_array);
            _mm_storeu_si128 ((__m128i*)output, associated_chip_index_array_int);

            //d_very_early_code[i] = d_ca_code[associated_chip_index];
            *d_very_early_code++ = d_ca_code[output[0]];
            *d_very_early_code++ = d_ca_code[output[1]];
            *d_very_early_code++ = d_ca_code[output[2]];
            *d_very_early_code++ = d_ca_code[output[3]];

            //tcode_half_chips = tcode_half_chips + code_phase_step_half_chips;
            tcode_half_chips_array = _mm_add_ps (tcode_half_chips_array, code_phase_step_half_chips_array);
        }

    if (num_points%4!=0)
    if (num_points % 4 != 0)
        {
            __VOLK_ATTR_ALIGNED(16) float tcode_half_chips_stored[4];
            _mm_storeu_ps ((float*)tcode_half_chips_stored, tcode_half_chips_array);

            int associated_chip_index;
            float tcode_half_chips = tcode_half_chips_stored[0];
            float d_very_early_late_spc_chips_multiplied_by_2 = 2*d_very_early_late_spc_chips;

            for (unsigned int i = 0; i < num_points%4; i++)
                {
                    associated_chip_index = 2 + round(fmod(tcode_half_chips - d_very_early_late_spc_chips_multiplied_by_2, code_length_half_chips));
                    d_very_early_code[i] = d_ca_code[associated_chip_index];
                    tcode_half_chips = tcode_half_chips + code_phase_step_half_chips;
                }
        }
}
#endif /* LV_HAVE_SSE4_1 */

#ifdef LV_HAVE_GENERIC
/*!
/*!
 \brief Takes the conjugate of a complex vector.
 \param cVector The vector where the results will be stored
 \param aVector Vector to be conjugated
 \param num_points The number of complex values in aVector to be conjugated and stored into cVector
 */
static inline void volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_generic(lv_32fc_t* d_very_early_code, const float d_very_early_late_spc_chips, const float code_length_half_chips, const float code_phase_step_half_chips, const float tcode_half_chips_input, const lv_32fc_t* d_ca_code, unsigned int num_points){

    // float* pointer1 = (float*)&d_very_early_late_spc_chips;
    // *pointer1 = 1;
    // float* pointer2 = (float*)&code_length_half_chips;
    // *pointer2 = 6;
    // float* pointer3 = (float*)&code_phase_step_half_chips;
    // *pointer3 = 7;
    // float* pointer4 = (float*)&tcode_half_chips_input;
    // *pointer4 = 8;

static inline void volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_generic(lv_32fc_t* d_very_early_code, const float d_very_early_late_spc_chips, const float code_length_half_chips, const float code_phase_step_half_chips, const float tcode_half_chips_input, const lv_32fc_t* d_ca_code, unsigned int num_points)
{
    int associated_chip_index;
    float tcode_half_chips = tcode_half_chips_input;
    float d_very_early_late_spc_chips_multiplied_by_2 = 2*d_very_early_late_spc_chips;

    for (unsigned int i = 0; i < num_points; i++)
        {
            associated_chip_index = 2 + round(fmod(tcode_half_chips - d_very_early_late_spc_chips_multiplied_by_2, code_length_half_chips));
            d_very_early_code[i] = d_ca_code[associated_chip_index];
            tcode_half_chips = tcode_half_chips + code_phase_step_half_chips;
        }
}
#endif /* LV_HAVE_GENERIC */
@@ -159,108 +141,90 @@ static inline void volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_generic(lv_3

#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>
/*!
/*!
 \brief Takes the conjugate of a complex vector.
 \param cVector The vector where the results will be stored
 \param aVector Vector to be conjugated
 \param num_points The number of complex values in aVector to be conjugated and stored into cVector
 */
static inline void volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_a_sse4_1(lv_32fc_t* d_very_early_code, const float d_very_early_late_spc_chips, const float code_length_half_chips, const float code_phase_step_half_chips, const float tcode_half_chips_input, const lv_32fc_t* d_ca_code, unsigned int num_points){

    // float* pointer1 = (float*)&d_very_early_late_spc_chips;
    // *pointer1 = 1;
    // float* pointer2 = (float*)&code_length_half_chips;
    // *pointer2 = 6;
    // float* pointer3 = (float*)&code_phase_step_half_chips;
    // *pointer3 = 7;
    // float* pointer4 = (float*)&tcode_half_chips_input;
    // *pointer4 = 8;

 */
static inline void volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_a_sse4_1(lv_32fc_t* d_very_early_code, const float d_very_early_late_spc_chips, const float code_length_half_chips, const float code_phase_step_half_chips, const float tcode_half_chips_input, const lv_32fc_t* d_ca_code, unsigned int num_points)
{
    const unsigned int sse_iters = num_points / 4;

    __m128 tquot, fmod_num, fmod_result, associated_chip_index_array;

    __m128 tcode_half_chips_array = _mm_set_ps (tcode_half_chips_input+3*code_phase_step_half_chips, tcode_half_chips_input+2*code_phase_step_half_chips, tcode_half_chips_input+code_phase_step_half_chips, tcode_half_chips_input);
    __m128 code_phase_step_half_chips_array = _mm_set1_ps (code_phase_step_half_chips*4);
    __m128 d_very_early_late_spc_chips_Multiplied_by_2 = _mm_set1_ps (2*d_very_early_late_spc_chips);
    __m128 code_length_half_chips_array = _mm_set1_ps (code_length_half_chips);
    __m128 twos = _mm_set1_ps (2);
    __m128i associated_chip_index_array_int;

    __VOLK_ATTR_ALIGNED(16) int32_t output[4];

    for (unsigned int i = 0; i < sse_iters; i++)
        {
            //fmod = numer - tquot * denom; tquot = numer/denom truncated
            //associated_chip_index = 2 + round(fmod(tcode_half_chips - 2*d_very_early_late_spc_chips, code_length_half_chips));
            fmod_num = _mm_sub_ps (tcode_half_chips_array, d_very_early_late_spc_chips_Multiplied_by_2);
            tquot = _mm_div_ps (fmod_num, code_length_half_chips_array);
            tquot = _mm_round_ps (tquot, (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) );
            fmod_result = _mm_sub_ps (fmod_num, _mm_mul_ps (tquot, code_length_half_chips_array));

            associated_chip_index_array = _mm_round_ps (fmod_result, (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC));
            associated_chip_index_array = _mm_add_ps(twos, associated_chip_index_array);
            associated_chip_index_array_int = _mm_cvtps_epi32 (associated_chip_index_array);
            _mm_store_si128 ((__m128i*)output, associated_chip_index_array_int);

            //d_very_early_code[i] = d_ca_code[associated_chip_index];
            *d_very_early_code++ = d_ca_code[output[0]];
            *d_very_early_code++ = d_ca_code[output[1]];
            *d_very_early_code++ = d_ca_code[output[2]];
            *d_very_early_code++ = d_ca_code[output[3]];

            //tcode_half_chips = tcode_half_chips + code_phase_step_half_chips;
            tcode_half_chips_array = _mm_add_ps (tcode_half_chips_array, code_phase_step_half_chips_array);
        }

    if (num_points%4!=0)
    if (num_points % 4 != 0)
        {
            __VOLK_ATTR_ALIGNED(16) float tcode_half_chips_stored[4];
            _mm_storeu_ps ((float*)tcode_half_chips_stored, tcode_half_chips_array);

            int associated_chip_index;
            float tcode_half_chips = tcode_half_chips_stored[0];
            float d_very_early_late_spc_chips_multiplied_by_2 = 2*d_very_early_late_spc_chips;

            for (unsigned int i = 0; i < num_points%4; i++)
                {
                    associated_chip_index = 2 + round(fmod(tcode_half_chips - d_very_early_late_spc_chips_multiplied_by_2, code_length_half_chips));
                    d_very_early_code[i] = d_ca_code[associated_chip_index];
                    tcode_half_chips = tcode_half_chips + code_phase_step_half_chips;
                }
        }
}
#endif /* LV_HAVE_SSE4_1 */

#ifdef LV_HAVE_GENERIC
/*!
/*!
 \brief Takes the conjugate of a complex vector.
 \param cVector The vector where the results will be stored
 \param aVector Vector to be conjugated
 \param num_points The number of complex values in aVector to be conjugated and stored into cVector
 */
static inline void volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_a_generic(lv_32fc_t* d_very_early_code, const float d_very_early_late_spc_chips, const float code_length_half_chips, const float code_phase_step_half_chips, const float tcode_half_chips_input, const lv_32fc_t* d_ca_code, unsigned int num_points){

    // float* pointer1 = (float*)&d_very_early_late_spc_chips;
    // *pointer1 = 1;
    // float* pointer2 = (float*)&code_length_half_chips;
    // *pointer2 = 6;
    // float* pointer3 = (float*)&code_phase_step_half_chips;
    // *pointer3 = 7;
    // float* pointer4 = (float*)&tcode_half_chips_input;
    // *pointer4 = 8;

 */
static inline void volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_a_generic(lv_32fc_t* d_very_early_code, const float d_very_early_late_spc_chips, const float code_length_half_chips, const float code_phase_step_half_chips, const float tcode_half_chips_input, const lv_32fc_t* d_ca_code, unsigned int num_points)
{
    int associated_chip_index;
    float tcode_half_chips = tcode_half_chips_input;
    float d_very_early_late_spc_chips_multiplied_by_2 = 2*d_very_early_late_spc_chips;

    for (unsigned int i = 0; i < num_points; i++)
        {
            associated_chip_index = 2 + round(fmod(tcode_half_chips - d_very_early_late_spc_chips_multiplied_by_2, code_length_half_chips));
            d_very_early_code[i] = d_ca_code[associated_chip_index];
            tcode_half_chips = tcode_half_chips + code_phase_step_half_chips;
        }
}
#endif /* LV_HAVE_GENERIC */
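
The kernels above build fmod() from the identity noted in their comments: fmod(numer, denom) = numer - trunc(numer/denom) * denom, then add 2 and round to obtain the code-table index. A small scalar sketch of that index computation (names are illustrative, and the result matches fmodf() up to floating-point rounding):

#include <math.h>

/* Illustrative scalar version of the per-sample index update used above:
   chip_index = 2 + round(fmod(tcode_half_chips - 2*vel_spc_chips, code_len_half_chips)) */
static int compute_chip_index(float tcode_half_chips, float vel_spc_chips, float code_len_half_chips)
{
    float numer = tcode_half_chips - 2.0f * vel_spc_chips;
    float tquot = truncf(numer / code_len_half_chips);        /* truncated quotient */
    float fmod_result = numer - tquot * code_len_half_chips;  /* remainder, as in the SSE code */
    return 2 + (int)roundf(fmod_result);
}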

@@ -2,8 +2,8 @@
 * \file volk_gnsssdr_32fc_x5_cw_vepl_corr_32fc_x5
 * \brief Volk protokernel: performs the carrier wipe-off mixing and the Early, Prompt and Late correlation with 64 bits vectors
 * \authors <ul>
 * <li>Javier Arribas, 2011. jarribas(at)cttc.es
 * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
 * <li> Javier Arribas, 2011. jarribas(at)cttc.es
 * <li> Andres Cecilia, 2014. a.cecilia.luque(at)gmail.com
 * </ul>
 *
 * Volk protokernel that performs the carrier wipe-off mixing and the
@@ -31,7 +31,7 @@
 * GNSS-SDR is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * at your option) any later version.
 * (at your option) any later version.
 *
 * GNSS-SDR is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -60,7 +60,7 @@
#include <pmmintrin.h>

/*!
/*!
 \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation
 \param input The input signal input
 \param carrier The carrier signal input
@@ -71,10 +71,10 @@
 \param P_out Early correlation output
 \param L_out Early correlation output
 \param num_points The number of complex values in vectors
 */
 */
static inline void volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_u_sse3(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, unsigned int num_points)
{
    unsigned int number = 0;
    unsigned int number = 0;
    const unsigned int halfPoints = num_points / 2;

    lv_32fc_t dotProduct_E;
@@ -100,90 +100,90 @@ static inline void volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_u_sse3(lv_32fc_t* E_
    const lv_32fc_t* _L_code = L_code;

    for(;number < halfPoints; number++)
        {
            // carrier wipe-off (vector point-to-point product)
            x = _mm_loadu_ps((float*)_input); // Load the ar + ai, br + bi as ar,ai,br,bi
            y = _mm_loadu_ps((float*)_carrier); // Load the cr + ci, dr + di as cr,ci,dr,di

            yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
            yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di

            tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr

            x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br

            tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di

            z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di

            //_mm_storeu_ps((float*)_input_BB,z); // Store the results back into the _input_BB container

            // correlation E,P,L (3x vector scalar product)
            // Early
            //x = _mm_load_ps((float*)_input_BB); // Load the ar + ai, br + bi as ar,ai,br,bi
            x = z;

            y = _mm_load_ps((float*)_E_code); // Load the cr + ci, dr + di as cr,ci,dr,di

            yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
            yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di

            tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr

            x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br

            tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di

            z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di

            z_E = _mm_add_ps(z_E, z); // Add the complex multiplication results together

            // Prompt
            //x = _mm_load_ps((float*)_input_BB); // Load the ar + ai, br + bi as ar,ai,br,bi
            y = _mm_load_ps((float*)_P_code); // Load the cr + ci, dr + di as cr,ci,dr,di

            yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
            yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di

            x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br

            tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr

            x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br

            tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di

            z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di

            z_P = _mm_add_ps(z_P, z); // Add the complex multiplication results together

            // Late
            //x = _mm_load_ps((float*)_input_BB); // Load the ar + ai, br + bi as ar,ai,br,bi
            y = _mm_load_ps((float*)_L_code); // Load the cr + ci, dr + di as cr,ci,dr,di

            yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
            yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di

            x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br

            tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr

            x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br

            tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di

            z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di

            z_L = _mm_add_ps(z_L, z); // Add the complex multiplication results together

            /*pointer increment*/
            _carrier += 2;
            _input += 2;
            //_input_BB += 2;
            _E_code += 2;
            _P_code += 2;
            _L_code +=2;
        }

    __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_E[2];
    __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_P[2];
@@ -199,12 +199,12 @@ static inline void volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_u_sse3(lv_32fc_t* E_
    dotProduct_L += ( dotProductVector_L[0] + dotProductVector_L[1] );

    if((num_points % 2) != 0)
        {
            //_input_BB = (*_input) * (*_carrier);
            dotProduct_E += (*_input) * (*_E_code)*(*_carrier);
            dotProduct_P += (*_input) * (*_P_code)*(*_carrier);
            dotProduct_L += (*_input) * (*_L_code)*(*_carrier);
        }

    *E_out = dotProduct_E;
    *P_out = dotProduct_P;
@@ -229,22 +229,22 @@ static inline void volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_u_sse3(lv_32fc_t* E_
static inline void volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_generic(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, unsigned int num_points)
{
    lv_32fc_t bb_signal_sample;

    bb_signal_sample = lv_cmake(0, 0);

    *E_out = 0;
    *P_out = 0;
    *L_out = 0;
    // perform Early, Prompt and Late correlation
    for(unsigned int i=0; i < num_points; ++i)
        {
            //Perform the carrier wipe-off
            bb_signal_sample = input[i] * carrier[i];
            // Now get early, late, and prompt values for each
            *E_out += bb_signal_sample * E_code[i];
            *P_out += bb_signal_sample * P_code[i];
            *L_out += bb_signal_sample * L_code[i];
        }
}

#endif /* LV_HAVE_GENERIC */
@@ -277,23 +277,23 @@ static inline void volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_generic(lv_32fc_t* E
 */
static inline void volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_a_sse3(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, unsigned int num_points)
{
    unsigned int number = 0;
    unsigned int number = 0;
    const unsigned int halfPoints = num_points / 2;

    lv_32fc_t dotProduct_E;
    memset(&dotProduct_E, 0x0, 2*sizeof(float));
    lv_32fc_t dotProduct_P;
    memset(&dotProduct_P, 0x0, 2*sizeof(float));
    lv_32fc_t dotProduct_L;
    memset(&dotProduct_L, 0x0, 2*sizeof(float));

    // Aux vars
    __m128 x, y, yl, yh, z, tmp1, tmp2, z_E, z_P, z_L;

    z_E = _mm_setzero_ps();
    z_P = _mm_setzero_ps();
    z_L = _mm_setzero_ps();

    //input and output vectors
    //lv_32fc_t* _input_BB = input_BB;
    const lv_32fc_t* _input = input;
@@ -301,114 +301,114 @@ static inline void volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_a_sse3(lv_32fc_t* E_
    const lv_32fc_t* _E_code = E_code;
    const lv_32fc_t* _P_code = P_code;
    const lv_32fc_t* _L_code = L_code;

    for(;number < halfPoints; number++)
        {
            // carrier wipe-off (vector point-to-point product)
            x = _mm_load_ps((float*)_input); // Load the ar + ai, br + bi as ar,ai,br,bi
            y = _mm_load_ps((float*)_carrier); // Load the cr + ci, dr + di as cr,ci,dr,di

            yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
            yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di

            tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr

            x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br

            tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di

            z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di

            //_mm_storeu_ps((float*)_input_BB,z); // Store the results back into the _input_BB container

            // correlation E,P,L (3x vector scalar product)
            // Early
            //x = _mm_load_ps((float*)_input_BB); // Load the ar + ai, br + bi as ar,ai,br,bi
            x = z;

            y = _mm_load_ps((float*)_E_code); // Load the cr + ci, dr + di as cr,ci,dr,di

            yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
            yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di

            tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr

            x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br

            tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di

            z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di

            z_E = _mm_add_ps(z_E, z); // Add the complex multiplication results together

            // Prompt
            //x = _mm_load_ps((float*)_input_BB); // Load the ar + ai, br + bi as ar,ai,br,bi
            y = _mm_load_ps((float*)_P_code); // Load the cr + ci, dr + di as cr,ci,dr,di

            yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
            yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di

            x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br

            tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr

            x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br

            tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di

            z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di

            z_P = _mm_add_ps(z_P, z); // Add the complex multiplication results together

            // Late
            //x = _mm_load_ps((float*)_input_BB); // Load the ar + ai, br + bi as ar,ai,br,bi
            y = _mm_load_ps((float*)_L_code); // Load the cr + ci, dr + di as cr,ci,dr,di

            yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
            yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di

            x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br

            tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr

            x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br

            tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di

            z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di

            z_L = _mm_add_ps(z_L, z); // Add the complex multiplication results together

            /*pointer increment*/
            _carrier += 2;
            _input += 2;
            //_input_BB += 2;
            _E_code += 2;
            _P_code += 2;
            _L_code +=2;
        }

    __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_E[2];
    __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_P[2];
    __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_L[2];
    //__VOLK_ATTR_ALIGNED(16) lv_32fc_t _input_BB;

    _mm_store_ps((float*)dotProductVector_E,z_E); // Store the results back into the dot product vector
    _mm_store_ps((float*)dotProductVector_P,z_P); // Store the results back into the dot product vector
    _mm_store_ps((float*)dotProductVector_L,z_L); // Store the results back into the dot product vector

    dotProduct_E += ( dotProductVector_E[0] + dotProductVector_E[1] );
    dotProduct_P += ( dotProductVector_P[0] + dotProductVector_P[1] );
    dotProduct_L += ( dotProductVector_L[0] + dotProductVector_L[1] );

    if((num_points % 2) != 0)
        {
            //_input_BB = (*_input) * (*_carrier);
            dotProduct_E += (*_input) * (*_E_code)*(*_carrier);
            dotProduct_P += (*_input) * (*_P_code)*(*_carrier);
            dotProduct_L += (*_input) * (*_L_code)*(*_carrier);
        }

    *E_out = dotProduct_E;
    *P_out = dotProduct_P;
    *L_out = dotProduct_L;
@@ -432,22 +432,22 @@ static inline void volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_a_sse3(lv_32fc_t* E_
static inline void volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_a_generic(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, unsigned int num_points)
{
    lv_32fc_t bb_signal_sample;

    bb_signal_sample = lv_cmake(0, 0);

    *E_out = 0;
    *P_out = 0;
    *L_out = 0;
    // perform Early, Prompt and Late correlation
    for(unsigned int i=0; i < num_points; ++i)
        {
            //Perform the carrier wipe-off
            bb_signal_sample = input[i] * carrier[i];
            // Now get early, late, and prompt values for each
            *E_out += bb_signal_sample * E_code[i];
            *P_out += bb_signal_sample * P_code[i];
            *L_out += bb_signal_sample * L_code[i];
        }
}

#endif /* LV_HAVE_GENERIC */
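
Each moveldup/movehdup/shuffle/addsub group above performs one complex multiplication: for a = ar + j*ai and c = cr + j*ci it yields (ar*cr - ai*ci) + j(ai*cr + ar*ci), exactly as annotated in the intrinsics comments. A scalar sketch of the per-sample work the generic correlators do, using C99 complex in place of lv_32fc_t (function name is illustrative):

#include <complex.h>

/* Illustrative scalar equivalent of one correlator tap:
   wipe off the carrier, multiply by the local code replica, and accumulate. */
static float complex correlate_tap(float complex acc, float complex in,
                                   float complex carrier, float complex code)
{
    float complex bb = in * carrier;   /* carrier wipe-off */
    return acc + bb * code;            /* code correlation accumulation */
}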

File diff suppressed because it is too large
@@ -2,7 +2,7 @@
 * \file volk_gnsssdr_64f_accumulator_64f.h
 * \brief Volk protokernel: 64 bits (double) scalar accumulator
 * \authors <ul>
 * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
 * <li> Andres Cecilia, 2014. a.cecilia.luque(at)gmail.com
 * </ul>
 *
 * Volk protokernel that implements an accumulator of char values
@@ -19,7 +19,7 @@
 * GNSS-SDR is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * at your option) any later version.
 * (at your option) any later version.
 *
 * GNSS-SDR is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of

@@ -2,7 +2,7 @@
 * \file volk_gnsssdr_8i_accumulator_s8i.h
 * \brief Volk protokernel: 8 bits (char) scalar accumulator
 * \authors <ul>
 * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
 * <li> Andres Cecilia, 2014. a.cecilia.luque(at)gmail.com
 * </ul>
 *
 * Volk protokernel that implements an accumulator of char values
@@ -19,7 +19,7 @@
 * GNSS-SDR is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * at your option) any later version.
 * (at your option) any later version.
 *
 * GNSS-SDR is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
@ -47,31 +47,35 @@
|
||||
\param inputBuffer The buffer of data to be accumulated
|
||||
\param num_points The number of values in inputBuffer to be accumulated
|
||||
*/
|
||||
static inline void volk_gnsssdr_8i_accumulator_s8i_u_sse3(char* result, const char* inputBuffer, unsigned int num_points){
|
||||
static inline void volk_gnsssdr_8i_accumulator_s8i_u_sse3(char* result, const char* inputBuffer, unsigned int num_points)
|
||||
{
|
||||
char returnValue = 0;
|
||||
const unsigned int sse_iters = num_points / 16;
|
||||
|
||||
|
||||
const char* aPtr = inputBuffer;
|
||||
|
||||
|
||||
__VOLK_ATTR_ALIGNED(16) char tempBuffer[16];
|
||||
__m128i accumulator = _mm_setzero_si128();
|
||||
__m128i aVal = _mm_setzero_si128();
|
||||
|
||||
for(unsigned int number = 0; number < sse_iters; number++){
|
||||
aVal = _mm_lddqu_si128((__m128i*)aPtr);
|
||||
accumulator = _mm_add_epi8(accumulator, aVal);
|
||||
aPtr += 16;
|
||||
}
|
||||
|
||||
for(unsigned int number = 0; number < sse_iters; number++)
|
||||
{
|
||||
aVal = _mm_lddqu_si128((__m128i*)aPtr);
|
||||
accumulator = _mm_add_epi8(accumulator, aVal);
|
||||
aPtr += 16;
|
||||
}
|
||||
_mm_storeu_si128((__m128i*)tempBuffer,accumulator);
|
||||
|
||||
for(unsigned int i = 0; i<16; ++i){
|
||||
returnValue += tempBuffer[i];
|
||||
}
|
||||
|
||||
for(unsigned int i = 0; i<(num_points % 16); ++i){
|
||||
returnValue += (*aPtr++);
|
||||
}
|
||||
|
||||
|
||||
for(unsigned int i = 0; i<16; ++i)
|
||||
{
|
||||
returnValue += tempBuffer[i];
|
||||
}
|
||||
|
||||
for(unsigned int i = 0; i<(num_points % 16); ++i)
|
||||
{
|
||||
returnValue += (*aPtr++);
|
||||
}
|
||||
|
||||
*result = returnValue;
|
||||
}
|
||||
#endif /* LV_HAVE_SSE3 */
|
||||
@ -83,13 +87,15 @@ static inline void volk_gnsssdr_8i_accumulator_s8i_u_sse3(char* result, const ch
|
||||
\param inputBuffer The buffer of data to be accumulated
|
||||
\param num_points The number of values in inputBuffer to be accumulated
|
||||
*/
|
||||
static inline void volk_gnsssdr_8i_accumulator_s8i_generic(char* result, const char* inputBuffer, unsigned int num_points){
|
||||
static inline void volk_gnsssdr_8i_accumulator_s8i_generic(char* result, const char* inputBuffer, unsigned int num_points)
|
||||
{
|
||||
const char* aPtr = inputBuffer;
|
||||
char returnValue = 0;
|
||||
|
||||
for(unsigned int number = 0;number < num_points; number++){
|
||||
returnValue += (*aPtr++);
|
||||
}
|
||||
|
||||
for(unsigned int number = 0;number < num_points; number++)
|
||||
{
|
||||
returnValue += (*aPtr++);
|
||||
}
|
||||
*result = returnValue;
|
||||
}
|
||||
#endif /* LV_HAVE_GENERIC */
|
||||
@ -112,31 +118,34 @@ static inline void volk_gnsssdr_8i_accumulator_s8i_generic(char* result, const c
\param inputBuffer The buffer of data to be accumulated
\param num_points The number of values in inputBuffer to be accumulated
*/
static inline void volk_gnsssdr_8i_accumulator_s8i_a_sse3(char* result, const char* inputBuffer, unsigned int num_points){
static inline void volk_gnsssdr_8i_accumulator_s8i_a_sse3(char* result, const char* inputBuffer, unsigned int num_points)
{
char returnValue = 0;
const unsigned int sse_iters = num_points / 16;

const char* aPtr = inputBuffer;

__VOLK_ATTR_ALIGNED(16) char tempBuffer[16];
__m128i accumulator = _mm_setzero_si128();
__m128i aVal = _mm_setzero_si128();

for(unsigned int number = 0; number < sse_iters; number++){
aVal = _mm_load_si128((__m128i*)aPtr);
accumulator = _mm_add_epi8(accumulator, aVal);
aPtr += 16;
}

for(unsigned int number = 0; number < sse_iters; number++)
{
aVal = _mm_load_si128((__m128i*)aPtr);
accumulator = _mm_add_epi8(accumulator, aVal);
aPtr += 16;
}
_mm_store_si128((__m128i*)tempBuffer,accumulator);

for(unsigned int i = 0; i<16; ++i){
returnValue += tempBuffer[i];
returnValue += tempBuffer[i];
}

for(unsigned int i = 0; i<(num_points % 16); ++i){
returnValue += (*aPtr++);
}

for(unsigned int i = 0; i<(num_points % 16); ++i)
{
returnValue += (*aPtr++);
}

*result = returnValue;
}
#endif /* LV_HAVE_SSE3 */
@ -148,13 +157,15 @@ static inline void volk_gnsssdr_8i_accumulator_s8i_a_sse3(char* result, const ch
\param inputBuffer The buffer of data to be accumulated
\param num_points The number of values in inputBuffer to be accumulated
*/
static inline void volk_gnsssdr_8i_accumulator_s8i_a_generic(char* result, const char* inputBuffer, unsigned int num_points){
static inline void volk_gnsssdr_8i_accumulator_s8i_a_generic(char* result, const char* inputBuffer, unsigned int num_points)
{
const char* aPtr = inputBuffer;
char returnValue = 0;

for(unsigned int number = 0;number < num_points; number++){
returnValue += (*aPtr++);
}

for(unsigned int number = 0;number < num_points; number++)
{
returnValue += (*aPtr++);
}
*result = returnValue;
}
#endif /* LV_HAVE_GENERIC */
@ -167,14 +178,15 @@ static inline void volk_gnsssdr_8i_accumulator_s8i_a_generic(char* result, const
\param num_points The number of values in inputBuffer to be accumulated
*/
extern void volk_gnsssdr_8i_accumulator_s8i_a_orc_impl(short* result, const char* inputBuffer, unsigned int num_points);
static inline void volk_gnsssdr_8i_accumulator_s8i_u_orc(char* result, const char* inputBuffer, unsigned int num_points){

static inline void volk_gnsssdr_8i_accumulator_s8i_u_orc(char* result, const char* inputBuffer, unsigned int num_points)
{
short res = 0;
char* resc = (char*)&res;
resc++;

volk_gnsssdr_8i_accumulator_s8i_a_orc_impl(&res, inputBuffer, num_points);

*result = *resc;
}
#endif /* LV_HAVE_ORC */
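For reference, the generic accumulator above is just a running 8-bit sum. The following stand-alone sketch is a hypothetical test harness, not part of this commit; the kernel body is copied from the hunk above so it compiles without the VOLK build machinery.

#include <stdio.h>

static inline void volk_gnsssdr_8i_accumulator_s8i_generic(char* result, const char* inputBuffer, unsigned int num_points)
{
    const char* aPtr = inputBuffer;
    char returnValue = 0;

    for(unsigned int number = 0; number < num_points; number++)
        {
            returnValue += (*aPtr++);
        }
    *result = returnValue;
}

int main(void)
{
    char samples[32];
    for (int i = 0; i < 32; ++i) samples[i] = 1;   /* 32 ones, so the sum is 32 */

    char sum = 0;
    volk_gnsssdr_8i_accumulator_s8i_generic(&sum, samples, 32);
    printf("accumulated value: %d\n", (int)sum);   /* prints 32; note the result is only 8 bits wide */
    return 0;
}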
@ -2,7 +2,7 @@
* \file volk_gnsssdr_8i_index_max_16u.h
* \brief Volk protokernel: calculates the index of the maximum value in a group of 8 bits (char) scalars
* \authors <ul>
* <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
* <li> Andres Cecilia, 2014. a.cecilia.luque(at)gmail.com
* </ul>
*
* Volk protokernel that returns the index of the maximum value of a group of 8 bits (char) scalars
@ -19,7 +19,7 @@
* GNSS-SDR is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* at your option) any later version.
* (at your option) any later version.
*
* GNSS-SDR is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
@ -2,7 +2,7 @@
* \file volk_gnsssdr_8i_max_s8i.h
* \brief Volk protokernel: calculates the maximum value in a group of 8 bits (char) scalars
* \authors <ul>
* <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
* <li> Andres Cecilia, 2014. a.cecilia.luque(at)gmail.com
* </ul>
*
* Volk protokernel that returns the maximum value of a group of 8 bits (char) scalars
@ -19,7 +19,7 @@
* GNSS-SDR is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* at your option) any later version.
* (at your option) any later version.
*
* GNSS-SDR is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
@ -47,44 +47,46 @@
|
||||
\param src0 The buffer of data to be analysed
|
||||
\param num_points The number of values in src0 to be analysed
|
||||
*/
|
||||
static inline void volk_gnsssdr_8i_max_s8i_u_sse4_1(char target, const char* src0, unsigned int num_points) {
|
||||
if(num_points > 0){
|
||||
const unsigned int sse_iters = num_points / 16;
|
||||
|
||||
char* inputPtr = (char*)src0;
|
||||
char max = src0[0];
|
||||
__VOLK_ATTR_ALIGNED(16) char maxValuesBuffer[16];
|
||||
__m128i maxValues, compareResults, currentValues;
|
||||
|
||||
maxValues = _mm_set1_epi8(max);
|
||||
|
||||
for(unsigned int number = 0; number < sse_iters; number++)
|
||||
static inline void volk_gnsssdr_8i_max_s8i_u_sse4_1(char target, const char* src0, unsigned int num_points)
|
||||
{
|
||||
if(num_points > 0)
|
||||
{
|
||||
currentValues = _mm_loadu_si128((__m128i*)inputPtr);
|
||||
compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
|
||||
maxValues = _mm_blendv_epi8(currentValues, maxValues, compareResults);
|
||||
inputPtr += 16;
|
||||
const unsigned int sse_iters = num_points / 16;
|
||||
|
||||
char* inputPtr = (char*)src0;
|
||||
char max = src0[0];
|
||||
__VOLK_ATTR_ALIGNED(16) char maxValuesBuffer[16];
|
||||
__m128i maxValues, compareResults, currentValues;
|
||||
|
||||
maxValues = _mm_set1_epi8(max);
|
||||
|
||||
for(unsigned int number = 0; number < sse_iters; number++)
|
||||
{
|
||||
currentValues = _mm_loadu_si128((__m128i*)inputPtr);
|
||||
compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
|
||||
maxValues = _mm_blendv_epi8(currentValues, maxValues, compareResults);
|
||||
inputPtr += 16;
|
||||
}
|
||||
|
||||
_mm_storeu_si128((__m128i*)maxValuesBuffer, maxValues);
|
||||
|
||||
for(unsigned int i = 0; i<16; ++i)
|
||||
{
|
||||
if(maxValuesBuffer[i] > max)
|
||||
{
|
||||
max = maxValuesBuffer[i];
|
||||
}
|
||||
}
|
||||
|
||||
for(unsigned int i = 0; i<(num_points % 16); ++i)
|
||||
{
|
||||
if(src0[i] > max)
|
||||
{
|
||||
max = src0[i];
|
||||
}
|
||||
}
|
||||
target = max;
|
||||
}
|
||||
|
||||
_mm_storeu_si128((__m128i*)maxValuesBuffer, maxValues);
|
||||
|
||||
for(unsigned int i = 0; i<16; ++i)
|
||||
{
|
||||
if(maxValuesBuffer[i] > max)
|
||||
{
|
||||
max = maxValuesBuffer[i];
|
||||
}
|
||||
}
|
||||
|
||||
for(unsigned int i = 0; i<(num_points % 16); ++i)
|
||||
{
|
||||
if(src0[i] > max)
|
||||
{
|
||||
max = src0[i];
|
||||
}
|
||||
}
|
||||
target = max;
|
||||
}
|
||||
}
|
||||
|
||||
#endif /*LV_HAVE_SSE4_1*/
|
||||
@ -97,55 +99,57 @@ static inline void volk_gnsssdr_8i_max_s8i_u_sse4_1(char target, const char* src
|
||||
\param src0 The buffer of data to be analysed
|
||||
\param num_points The number of values in src0 to be analysed
|
||||
*/
|
||||
static inline void volk_gnsssdr_8i_max_s8i_u_sse2(char target, const char* src0, unsigned int num_points) {
|
||||
if(num_points > 0){
|
||||
const unsigned int sse_iters = num_points / 16;
|
||||
|
||||
char* inputPtr = (char*)src0;
|
||||
char max = src0[0];
|
||||
unsigned short mask;
|
||||
__VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16];
|
||||
__m128i maxValues, compareResults, currentValues;
|
||||
|
||||
maxValues = _mm_set1_epi8(max);
|
||||
|
||||
for(unsigned int number = 0; number < sse_iters; number++)
|
||||
static inline void volk_gnsssdr_8i_max_s8i_u_sse2(char target, const char* src0, unsigned int num_points)
|
||||
{
|
||||
if(num_points > 0)
|
||||
{
|
||||
currentValues = _mm_loadu_si128((__m128i*)inputPtr);
|
||||
compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
|
||||
mask = _mm_movemask_epi8(compareResults);
|
||||
|
||||
if (mask != 0xFFFF)
|
||||
{
|
||||
_mm_storeu_si128((__m128i*)¤tValuesBuffer, currentValues);
|
||||
mask = ~mask;
|
||||
int i = 0;
|
||||
while (mask > 0)
|
||||
const unsigned int sse_iters = num_points / 16;
|
||||
|
||||
char* inputPtr = (char*)src0;
|
||||
char max = src0[0];
|
||||
unsigned short mask;
|
||||
__VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16];
|
||||
__m128i maxValues, compareResults, currentValues;
|
||||
|
||||
maxValues = _mm_set1_epi8(max);
|
||||
|
||||
for(unsigned int number = 0; number < sse_iters; number++)
|
||||
{
|
||||
if ((mask & 1) == 1)
|
||||
{
|
||||
if(currentValuesBuffer[i] > max)
|
||||
currentValues = _mm_loadu_si128((__m128i*)inputPtr);
|
||||
compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
|
||||
mask = _mm_movemask_epi8(compareResults);
|
||||
|
||||
if (mask != 0xFFFF)
|
||||
{
|
||||
max = currentValuesBuffer[i];
|
||||
_mm_storeu_si128((__m128i*)¤tValuesBuffer, currentValues);
|
||||
mask = ~mask;
|
||||
int i = 0;
|
||||
while (mask > 0)
|
||||
{
|
||||
if ((mask & 1) == 1)
|
||||
{
|
||||
if(currentValuesBuffer[i] > max)
|
||||
{
|
||||
max = currentValuesBuffer[i];
|
||||
}
|
||||
}
|
||||
i++;
|
||||
mask >>= 1;
|
||||
}
|
||||
maxValues = _mm_set1_epi8(max);
|
||||
}
|
||||
}
|
||||
i++;
|
||||
mask >>= 1;
|
||||
inputPtr += 16;
|
||||
}
|
||||
maxValues = _mm_set1_epi8(max);
|
||||
}
|
||||
inputPtr += 16;
|
||||
|
||||
for(unsigned int i = 0; i<(num_points % 16); ++i)
|
||||
{
|
||||
if(src0[i] > max)
|
||||
{
|
||||
max = src0[i];
|
||||
}
|
||||
}
|
||||
target = max;
|
||||
}
|
||||
|
||||
for(unsigned int i = 0; i<(num_points % 16); ++i)
|
||||
{
|
||||
if(src0[i] > max)
|
||||
{
|
||||
max = src0[i];
|
||||
}
|
||||
}
|
||||
target = max;
|
||||
}
|
||||
}
|
||||
|
||||
#endif /*LV_HAVE_SSE2*/
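The u_sse2 and a_sse2 kernels above avoid a per-lane rescan unless some lane actually beat the running maximum: _mm_cmpgt_epi8 marks the lanes where the current maximum is still greater, and _mm_movemask_epi8 compresses those 16 byte results into a 16-bit mask that is tested against 0xFFFF. A minimal stand-alone illustration of that building block follows (hypothetical snippet, SSE2 only, not part of the commit).

#include <emmintrin.h>
#include <stdio.h>

int main(void)
{
    __m128i a = _mm_set1_epi8(10);
    __m128i b = _mm_setr_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
    __m128i gt = _mm_cmpgt_epi8(a, b);    /* 0xFF in each lane where 10 > b[i] (signed compare) */
    int mask = _mm_movemask_epi8(gt);     /* one bit per byte lane */
    printf("mask = 0x%04X\n", mask);      /* lanes 0..8 are set, so this prints 0x01FF */
    return 0;
}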
|
||||
@ -157,20 +161,21 @@ static inline void volk_gnsssdr_8i_max_s8i_u_sse2(char target, const char* src0,
\param src0 The buffer of data to be analysed
\param num_points The number of values in src0 to be analysed
*/
static inline void volk_gnsssdr_8i_max_s8i_generic(char target, const char* src0, unsigned int num_points) {
static inline void volk_gnsssdr_8i_max_s8i_generic(char target, const char* src0, unsigned int num_points)
{
if(num_points > 0)
{
char max = src0[0];

for(unsigned int i = 1; i < num_points; ++i)
{
if(src0[i] > max)
{
max = src0[i];
}
char max = src0[0];

for(unsigned int i = 1; i < num_points; ++i)
{
if(src0[i] > max)
{
max = src0[i];
}
}
target = max;
}
target = max;
}
}

#endif /*LV_HAVE_GENERIC*/
@ -193,44 +198,46 @@ static inline void volk_gnsssdr_8i_max_s8i_generic(char target, const char* src0
|
||||
\param src0 The buffer of data to be analysed
|
||||
\param num_points The number of values in src0 to be analysed
|
||||
*/
|
||||
static inline void volk_gnsssdr_8i_max_s8i_a_sse4_1(char target, const char* src0, unsigned int num_points) {
|
||||
if(num_points > 0){
|
||||
const unsigned int sse_iters = num_points / 16;
|
||||
|
||||
char* inputPtr = (char*)src0;
|
||||
char max = src0[0];
|
||||
__VOLK_ATTR_ALIGNED(16) char maxValuesBuffer[16];
|
||||
__m128i maxValues, compareResults, currentValues;
|
||||
|
||||
maxValues = _mm_set1_epi8(max);
|
||||
|
||||
for(unsigned int number = 0; number < sse_iters; number++)
|
||||
static inline void volk_gnsssdr_8i_max_s8i_a_sse4_1(char target, const char* src0, unsigned int num_points)
|
||||
{
|
||||
if(num_points > 0)
|
||||
{
|
||||
currentValues = _mm_load_si128((__m128i*)inputPtr);
|
||||
compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
|
||||
maxValues = _mm_blendv_epi8(currentValues, maxValues, compareResults);
|
||||
inputPtr += 16;
|
||||
const unsigned int sse_iters = num_points / 16;
|
||||
|
||||
char* inputPtr = (char*)src0;
|
||||
char max = src0[0];
|
||||
__VOLK_ATTR_ALIGNED(16) char maxValuesBuffer[16];
|
||||
__m128i maxValues, compareResults, currentValues;
|
||||
|
||||
maxValues = _mm_set1_epi8(max);
|
||||
|
||||
for(unsigned int number = 0; number < sse_iters; number++)
|
||||
{
|
||||
currentValues = _mm_load_si128((__m128i*)inputPtr);
|
||||
compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
|
||||
maxValues = _mm_blendv_epi8(currentValues, maxValues, compareResults);
|
||||
inputPtr += 16;
|
||||
}
|
||||
|
||||
_mm_store_si128((__m128i*)maxValuesBuffer, maxValues);
|
||||
|
||||
for(unsigned int i = 0; i<16; ++i)
|
||||
{
|
||||
if(maxValuesBuffer[i] > max)
|
||||
{
|
||||
max = maxValuesBuffer[i];
|
||||
}
|
||||
}
|
||||
|
||||
for(unsigned int i = 0; i<(num_points % 16); ++i)
|
||||
{
|
||||
if(src0[i] > max)
|
||||
{
|
||||
max = src0[i];
|
||||
}
|
||||
}
|
||||
target = max;
|
||||
}
|
||||
|
||||
_mm_store_si128((__m128i*)maxValuesBuffer, maxValues);
|
||||
|
||||
for(unsigned int i = 0; i<16; ++i)
|
||||
{
|
||||
if(maxValuesBuffer[i] > max)
|
||||
{
|
||||
max = maxValuesBuffer[i];
|
||||
}
|
||||
}
|
||||
|
||||
for(unsigned int i = 0; i<(num_points % 16); ++i)
|
||||
{
|
||||
if(src0[i] > max)
|
||||
{
|
||||
max = src0[i];
|
||||
}
|
||||
}
|
||||
target = max;
|
||||
}
|
||||
}
|
||||
|
||||
#endif /*LV_HAVE_SSE4_1*/
|
||||
@ -243,55 +250,57 @@ static inline void volk_gnsssdr_8i_max_s8i_a_sse4_1(char target, const char* src
|
||||
\param src0 The buffer of data to be analysed
|
||||
\param num_points The number of values in src0 to be analysed
|
||||
*/
|
||||
static inline void volk_gnsssdr_8i_max_s8i_a_sse2(char target, const char* src0, unsigned int num_points) {
|
||||
if(num_points > 0){
|
||||
const unsigned int sse_iters = num_points / 16;
|
||||
|
||||
char* inputPtr = (char*)src0;
|
||||
char max = src0[0];
|
||||
unsigned short mask;
|
||||
__VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16];
|
||||
__m128i maxValues, compareResults, currentValues;
|
||||
|
||||
maxValues = _mm_set1_epi8(max);
|
||||
|
||||
for(unsigned int number = 0; number < sse_iters; number++)
|
||||
static inline void volk_gnsssdr_8i_max_s8i_a_sse2(char target, const char* src0, unsigned int num_points)
|
||||
{
|
||||
if(num_points > 0)
|
||||
{
|
||||
currentValues = _mm_load_si128((__m128i*)inputPtr);
|
||||
compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
|
||||
mask = _mm_movemask_epi8(compareResults);
|
||||
|
||||
if (mask != 0xFFFF)
|
||||
{
|
||||
_mm_store_si128((__m128i*)¤tValuesBuffer, currentValues);
|
||||
mask = ~mask;
|
||||
int i = 0;
|
||||
while (mask > 0)
|
||||
const unsigned int sse_iters = num_points / 16;
|
||||
|
||||
char* inputPtr = (char*)src0;
|
||||
char max = src0[0];
|
||||
unsigned short mask;
|
||||
__VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16];
|
||||
__m128i maxValues, compareResults, currentValues;
|
||||
|
||||
maxValues = _mm_set1_epi8(max);
|
||||
|
||||
for(unsigned int number = 0; number < sse_iters; number++)
|
||||
{
|
||||
if ((mask & 1) == 1)
|
||||
{
|
||||
if(currentValuesBuffer[i] > max)
|
||||
currentValues = _mm_load_si128((__m128i*)inputPtr);
|
||||
compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
|
||||
mask = _mm_movemask_epi8(compareResults);
|
||||
|
||||
if (mask != 0xFFFF)
|
||||
{
|
||||
max = currentValuesBuffer[i];
|
||||
_mm_store_si128((__m128i*)¤tValuesBuffer, currentValues);
|
||||
mask = ~mask;
|
||||
int i = 0;
|
||||
while (mask > 0)
|
||||
{
|
||||
if ((mask & 1) == 1)
|
||||
{
|
||||
if(currentValuesBuffer[i] > max)
|
||||
{
|
||||
max = currentValuesBuffer[i];
|
||||
}
|
||||
}
|
||||
i++;
|
||||
mask >>= 1;
|
||||
}
|
||||
maxValues = _mm_set1_epi8(max);
|
||||
}
|
||||
}
|
||||
i++;
|
||||
mask >>= 1;
|
||||
inputPtr += 16;
|
||||
}
|
||||
maxValues = _mm_set1_epi8(max);
|
||||
}
|
||||
inputPtr += 16;
|
||||
|
||||
for(unsigned int i = 0; i<(num_points % 16); ++i)
|
||||
{
|
||||
if(src0[i] > max)
|
||||
{
|
||||
max = src0[i];
|
||||
}
|
||||
}
|
||||
target = max;
|
||||
}
|
||||
|
||||
for(unsigned int i = 0; i<(num_points % 16); ++i)
|
||||
{
|
||||
if(src0[i] > max)
|
||||
{
|
||||
max = src0[i];
|
||||
}
|
||||
}
|
||||
target = max;
|
||||
}
|
||||
}
|
||||
|
||||
#endif /*LV_HAVE_SSE2*/
|
||||
@ -303,23 +312,23 @@ static inline void volk_gnsssdr_8i_max_s8i_a_sse2(char target, const char* src0,
\param src0 The buffer of data to be analysed
\param num_points The number of values in src0 to be analysed
*/
static inline void volk_gnsssdr_8i_max_s8i_a_generic(char target, const char* src0, unsigned int num_points) {
static inline void volk_gnsssdr_8i_max_s8i_a_generic(char target, const char* src0, unsigned int num_points)
{
if(num_points > 0)
{
if(num_points > 0)
{
char max = src0[0];

for(unsigned int i = 1; i < num_points; ++i)
{
if(src0[i] > max)
if(num_points > 0)
{
max = src0[i];
char max = src0[0];
for(unsigned int i = 1; i < num_points; ++i)
{
if(src0[i] > max)
{
max = src0[i];
}
}
target = max;
}
}
target = max;
}
}
}

#endif /*LV_HAVE_GENERIC*/
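Note that in both revisions shown above the result parameter target is passed by value, so the maximum computed by these kernels never leaves the function body. A stand-alone scalar reference that performs the same linear scan but returns the value is sketched below (hypothetical helper and harness, not part of the library).

#include <stdio.h>

static char max_s8i_reference(const char* src0, unsigned int num_points)
{
    char max = src0[0];   /* caller guarantees num_points > 0, as the kernels above do */
    for (unsigned int i = 1; i < num_points; ++i)
        {
            if (src0[i] > max) max = src0[i];
        }
    return max;
}

int main(void)
{
    const char samples[8] = { 5, 3, 12, 7, 1, 44, 0, 9 };
    printf("max: %d\n", (int)max_s8i_reference(samples, 8));   /* prints 44 */
    return 0;
}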
@ -2,7 +2,7 @@
* \file volk_gnsssdr_8i_x2_add_8i.h
* \brief Volk protokernel: adds pairs of 8 bits (char) scalars
* \authors <ul>
* <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
* <li> Andres Cecilia, 2014. a.cecilia.luque(at)gmail.com
* </ul>
*
* Volk protokernel that adds pairs of 8 bits (char) scalars
@ -19,7 +19,7 @@
* GNSS-SDR is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* at your option) any later version.
* (at your option) any later version.
*
* GNSS-SDR is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
@ -47,34 +47,34 @@
|
||||
\param bVector One of the vectors to be added
|
||||
\param num_points The number of values in aVector and bVector to be added together and stored into cVector
|
||||
*/
|
||||
static inline void volk_gnsssdr_8i_x2_add_8i_u_sse2(char* cVector, const char* aVector, const char* bVector, unsigned int num_points){
|
||||
|
||||
static inline void volk_gnsssdr_8i_x2_add_8i_u_sse2(char* cVector, const char* aVector, const char* bVector, unsigned int num_points)
|
||||
{
|
||||
const unsigned int sse_iters = num_points / 16;
|
||||
|
||||
|
||||
char* cPtr = cVector;
|
||||
const char* aPtr = aVector;
|
||||
const char* bPtr= bVector;
|
||||
|
||||
|
||||
__m128i aVal, bVal, cVal;
|
||||
|
||||
for(unsigned int number = 0; number < sse_iters; number++){
|
||||
|
||||
aVal = _mm_loadu_si128((__m128i*)aPtr);
|
||||
bVal = _mm_loadu_si128((__m128i*)bPtr);
|
||||
|
||||
cVal = _mm_add_epi8(aVal, bVal);
|
||||
|
||||
_mm_storeu_si128((__m128i*)cPtr,cVal); // Store the results back into the C container
|
||||
|
||||
aPtr += 16;
|
||||
bPtr += 16;
|
||||
cPtr += 16;
|
||||
}
|
||||
|
||||
|
||||
for(unsigned int number = 0; number < sse_iters; number++)
|
||||
{
|
||||
aVal = _mm_loadu_si128((__m128i*)aPtr);
|
||||
bVal = _mm_loadu_si128((__m128i*)bPtr);
|
||||
|
||||
cVal = _mm_add_epi8(aVal, bVal);
|
||||
|
||||
_mm_storeu_si128((__m128i*)cPtr,cVal); // Store the results back into the C container
|
||||
|
||||
aPtr += 16;
|
||||
bPtr += 16;
|
||||
cPtr += 16;
|
||||
}
|
||||
|
||||
for(unsigned int i = 0; i<(num_points % 16); ++i)
|
||||
{
|
||||
*cPtr++ = (*aPtr++) + (*bPtr++);
|
||||
}
|
||||
{
|
||||
*cPtr++ = (*aPtr++) + (*bPtr++);
|
||||
}
|
||||
}
|
||||
#endif /* LV_HAVE_SSE2 */
|
||||
|
||||
@ -86,15 +86,17 @@ static inline void volk_gnsssdr_8i_x2_add_8i_u_sse2(char* cVector, const char* a
|
||||
\param bVector One of the vectors to be added
|
||||
\param num_points The number of values in aVector and bVector to be added together and stored into cVector
|
||||
*/
|
||||
static inline void volk_gnsssdr_8i_x2_add_8i_generic(char* cVector, const char* aVector, const char* bVector, unsigned int num_points){
|
||||
static inline void volk_gnsssdr_8i_x2_add_8i_generic(char* cVector, const char* aVector, const char* bVector, unsigned int num_points)
|
||||
{
|
||||
char* cPtr = cVector;
|
||||
const char* aPtr = aVector;
|
||||
const char* bPtr= bVector;
|
||||
unsigned int number = 0;
|
||||
|
||||
for(number = 0; number < num_points; number++){
|
||||
*cPtr++ = (*aPtr++) + (*bPtr++);
|
||||
}
|
||||
|
||||
for(number = 0; number < num_points; number++)
|
||||
{
|
||||
*cPtr++ = (*aPtr++) + (*bPtr++);
|
||||
}
|
||||
}
|
||||
#endif /* LV_HAVE_GENERIC */
|
||||
|
||||
@ -116,34 +118,34 @@ static inline void volk_gnsssdr_8i_x2_add_8i_generic(char* cVector, const char*
|
||||
\param bVector One of the vectors to be added
|
||||
\param num_points The number of values in aVector and bVector to be added together and stored into cVector
|
||||
*/
|
||||
static inline void volk_gnsssdr_8i_x2_add_8i_a_sse2(char* cVector, const char* aVector, const char* bVector, unsigned int num_points){
|
||||
|
||||
static inline void volk_gnsssdr_8i_x2_add_8i_a_sse2(char* cVector, const char* aVector, const char* bVector, unsigned int num_points)
|
||||
{
|
||||
const unsigned int sse_iters = num_points / 16;
|
||||
|
||||
|
||||
char* cPtr = cVector;
|
||||
const char* aPtr = aVector;
|
||||
const char* bPtr= bVector;
|
||||
|
||||
|
||||
__m128i aVal, bVal, cVal;
|
||||
|
||||
for(unsigned int number = 0; number < sse_iters; number++){
|
||||
|
||||
aVal = _mm_load_si128((__m128i*)aPtr);
|
||||
bVal = _mm_load_si128((__m128i*)bPtr);
|
||||
|
||||
cVal = _mm_add_epi8(aVal, bVal);
|
||||
|
||||
_mm_store_si128((__m128i*)cPtr,cVal); // Store the results back into the C container
|
||||
|
||||
aPtr += 16;
|
||||
bPtr += 16;
|
||||
cPtr += 16;
|
||||
}
|
||||
|
||||
|
||||
for(unsigned int number = 0; number < sse_iters; number++)
|
||||
{
|
||||
aVal = _mm_load_si128((__m128i*)aPtr);
|
||||
bVal = _mm_load_si128((__m128i*)bPtr);
|
||||
|
||||
cVal = _mm_add_epi8(aVal, bVal);
|
||||
|
||||
_mm_store_si128((__m128i*)cPtr,cVal); // Store the results back into the C container
|
||||
|
||||
aPtr += 16;
|
||||
bPtr += 16;
|
||||
cPtr += 16;
|
||||
}
|
||||
|
||||
for(unsigned int i = 0; i<(num_points % 16); ++i)
|
||||
{
|
||||
*cPtr++ = (*aPtr++) + (*bPtr++);
|
||||
}
|
||||
{
|
||||
*cPtr++ = (*aPtr++) + (*bPtr++);
|
||||
}
|
||||
}
|
||||
#endif /* LV_HAVE_SSE2 */
|
||||
|
||||
@ -155,15 +157,17 @@ static inline void volk_gnsssdr_8i_x2_add_8i_a_sse2(char* cVector, const char* a
|
||||
\param bVector One of the vectors to be added
|
||||
\param num_points The number of values in aVector and bVector to be added together and stored into cVector
|
||||
*/
|
||||
static inline void volk_gnsssdr_8i_x2_add_8i_a_generic(char* cVector, const char* aVector, const char* bVector, unsigned int num_points){
|
||||
static inline void volk_gnsssdr_8i_x2_add_8i_a_generic(char* cVector, const char* aVector, const char* bVector, unsigned int num_points)
|
||||
{
|
||||
char* cPtr = cVector;
|
||||
const char* aPtr = aVector;
|
||||
const char* bPtr= bVector;
|
||||
unsigned int number = 0;
|
||||
|
||||
for(number = 0; number < num_points; number++){
|
||||
*cPtr++ = (*aPtr++) + (*bPtr++);
|
||||
}
|
||||
|
||||
for(number = 0; number < num_points; number++)
|
||||
{
|
||||
*cPtr++ = (*aPtr++) + (*bPtr++);
|
||||
}
|
||||
}
|
||||
#endif /* LV_HAVE_GENERIC */
|
||||
|
||||
@ -176,7 +180,8 @@ static inline void volk_gnsssdr_8i_x2_add_8i_a_generic(char* cVector, const char
\param num_points The number of values in aVector and bVector to be added together and stored into cVector
*/
extern void volk_gnsssdr_8i_x2_add_8i_a_orc_impl(char* cVector, const char* aVector, const char* bVector, unsigned int num_points);
static inline void volk_gnsssdr_8i_x2_add_8i_u_orc(char* cVector, const char* aVector, const char* bVector, unsigned int num_points){
static inline void volk_gnsssdr_8i_x2_add_8i_u_orc(char* cVector, const char* aVector, const char* bVector, unsigned int num_points)
{
volk_gnsssdr_8i_x2_add_8i_a_orc_impl(cVector, aVector, bVector, num_points);
}
#endif /* LV_HAVE_ORC */
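A small usage sketch for the pairwise add follows (hypothetical harness; the generic kernel body is repeated from the hunk further above so the snippet is self-contained and does not rely on any assumed include path).

#include <stdio.h>

static inline void volk_gnsssdr_8i_x2_add_8i_generic(char* cVector, const char* aVector, const char* bVector, unsigned int num_points)
{
    char* cPtr = cVector;
    const char* aPtr = aVector;
    const char* bPtr = bVector;
    for (unsigned int number = 0; number < num_points; number++)
        {
            *cPtr++ = (*aPtr++) + (*bPtr++);
        }
}

int main(void)
{
    char a[4] = { 1, 2, 3, 4 };
    char b[4] = { 10, 20, 30, 40 };
    char c[4];
    volk_gnsssdr_8i_x2_add_8i_generic(c, a, b, 4);
    for (int i = 0; i < 4; ++i) printf("%d ", (int)c[i]);   /* prints 11 22 33 44 */
    printf("\n");
    return 0;
}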
@ -2,7 +2,7 @@
* \file volk_gnsssdr_8ic_conjugate_8ic.h
* \brief Volk protokernel: calculates the conjugate of a 16 bits vector
* \authors <ul>
* <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
* <li> Andres Cecilia, 2014. a.cecilia.luque(at)gmail.com
* </ul>
*
* Volk protokernel that calculates the conjugate of a
@ -20,7 +20,7 @@
* GNSS-SDR is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* at your option) any later version.
* (at your option) any later version.
*
* GNSS-SDR is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
@ -48,37 +48,38 @@
|
||||
\param aVector Vector to be conjugated
|
||||
\param num_points The number of unsigned char values in aVector to be conjugated and stored into cVector
|
||||
*/
|
||||
static inline void volk_gnsssdr_8ic_conjugate_8ic_u_avx(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){
|
||||
static inline void volk_gnsssdr_8ic_conjugate_8ic_u_avx(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points)
|
||||
{
|
||||
const unsigned int sse_iters = num_points / 16;
|
||||
|
||||
|
||||
lv_8sc_t* c = cVector;
|
||||
const lv_8sc_t* a = aVector;
|
||||
|
||||
|
||||
__m256 tmp;
|
||||
__m128i tmp128lo, tmp128hi;
|
||||
__m256 conjugator1 = _mm256_castsi256_ps(_mm256_setr_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255));
|
||||
__m128i conjugator2 = _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1);
|
||||
|
||||
|
||||
for (unsigned int i = 0; i < sse_iters; ++i)
|
||||
{
|
||||
tmp = _mm256_loadu_ps((float*)a);
|
||||
tmp = _mm256_xor_ps(tmp, conjugator1);
|
||||
tmp128lo = _mm256_castsi256_si128(_mm256_castps_si256(tmp));
|
||||
tmp128lo = _mm_add_epi8(tmp128lo, conjugator2);
|
||||
tmp128hi = _mm256_extractf128_si256(_mm256_castps_si256(tmp),1);
|
||||
tmp128hi = _mm_add_epi8(tmp128hi, conjugator2);
|
||||
//tmp = _mm256_set_m128i(tmp128hi , tmp128lo); //not defined in some versions of immintrin.h
|
||||
tmp = _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(tmp128lo),(tmp128hi),1));
|
||||
_mm256_storeu_ps((float*)c, tmp);
|
||||
|
||||
a += 16;
|
||||
c += 16;
|
||||
}
|
||||
|
||||
{
|
||||
tmp = _mm256_loadu_ps((float*)a);
|
||||
tmp = _mm256_xor_ps(tmp, conjugator1);
|
||||
tmp128lo = _mm256_castsi256_si128(_mm256_castps_si256(tmp));
|
||||
tmp128lo = _mm_add_epi8(tmp128lo, conjugator2);
|
||||
tmp128hi = _mm256_extractf128_si256(_mm256_castps_si256(tmp),1);
|
||||
tmp128hi = _mm_add_epi8(tmp128hi, conjugator2);
|
||||
//tmp = _mm256_set_m128i(tmp128hi , tmp128lo); //not defined in some versions of immintrin.h
|
||||
tmp = _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(tmp128lo),(tmp128hi),1));
|
||||
_mm256_storeu_ps((float*)c, tmp);
|
||||
|
||||
a += 16;
|
||||
c += 16;
|
||||
}
|
||||
|
||||
for (unsigned int i = 0; i<(num_points % 16); ++i)
|
||||
{
|
||||
*c++ = lv_conj(*a++);
|
||||
}
|
||||
{
|
||||
*c++ = lv_conj(*a++);
|
||||
}
|
||||
}
|
||||
#endif /* LV_HAVE_AVX */
|
||||
|
||||
@ -90,29 +91,30 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_u_avx(lv_8sc_t* cVector, const
|
||||
\param aVector Vector to be conjugated
|
||||
\param num_points The number of unsigned char values in aVector to be conjugated and stored into cVector
|
||||
*/
|
||||
static inline void volk_gnsssdr_8ic_conjugate_8ic_u_ssse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){
|
||||
static inline void volk_gnsssdr_8ic_conjugate_8ic_u_ssse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points)
|
||||
{
|
||||
const unsigned int sse_iters = num_points / 8;
|
||||
|
||||
|
||||
lv_8sc_t* c = cVector;
|
||||
const lv_8sc_t* a = aVector;
|
||||
__m128i tmp;
|
||||
|
||||
|
||||
__m128i conjugator = _mm_setr_epi8(1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1);
|
||||
|
||||
|
||||
for (unsigned int i = 0; i < sse_iters; ++i)
|
||||
{
|
||||
tmp = _mm_lddqu_si128((__m128i*)a);
|
||||
tmp = _mm_sign_epi8(tmp, conjugator);
|
||||
_mm_storeu_si128((__m128i*)c, tmp);
|
||||
a += 8;
|
||||
c += 8;
|
||||
}
|
||||
|
||||
{
|
||||
tmp = _mm_lddqu_si128((__m128i*)a);
|
||||
tmp = _mm_sign_epi8(tmp, conjugator);
|
||||
_mm_storeu_si128((__m128i*)c, tmp);
|
||||
a += 8;
|
||||
c += 8;
|
||||
}
|
||||
|
||||
for (unsigned int i = 0; i<(num_points % 8); ++i)
|
||||
{
|
||||
*c++ = lv_conj(*a++);
|
||||
}
|
||||
|
||||
{
|
||||
*c++ = lv_conj(*a++);
|
||||
}
|
||||
|
||||
}
|
||||
#endif /* LV_HAVE_SSSE3 */
|
||||
|
||||
@ -124,31 +126,32 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_u_ssse3(lv_8sc_t* cVector, con
|
||||
\param aVector Vector to be conjugated
|
||||
\param num_points The number of unsigned char values in aVector to be conjugated and stored into cVector
|
||||
*/
|
||||
static inline void volk_gnsssdr_8ic_conjugate_8ic_u_sse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){
|
||||
static inline void volk_gnsssdr_8ic_conjugate_8ic_u_sse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points)
|
||||
{
|
||||
const unsigned int sse_iters = num_points / 8;
|
||||
|
||||
|
||||
lv_8sc_t* c = cVector;
|
||||
const lv_8sc_t* a = aVector;
|
||||
__m128i tmp;
|
||||
|
||||
|
||||
__m128i conjugator1 = _mm_setr_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
|
||||
__m128i conjugator2 = _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1);
|
||||
|
||||
|
||||
for (unsigned int i = 0; i < sse_iters; ++i)
|
||||
{
|
||||
tmp = _mm_lddqu_si128((__m128i*)a);
|
||||
tmp = _mm_xor_si128(tmp, conjugator1);
|
||||
tmp = _mm_add_epi8(tmp, conjugator2);
|
||||
_mm_storeu_si128((__m128i*)c, tmp);
|
||||
a += 8;
|
||||
c += 8;
|
||||
}
|
||||
|
||||
{
|
||||
tmp = _mm_lddqu_si128((__m128i*)a);
|
||||
tmp = _mm_xor_si128(tmp, conjugator1);
|
||||
tmp = _mm_add_epi8(tmp, conjugator2);
|
||||
_mm_storeu_si128((__m128i*)c, tmp);
|
||||
a += 8;
|
||||
c += 8;
|
||||
}
|
||||
|
||||
for (unsigned int i = 0; i<(num_points % 8); ++i)
|
||||
{
|
||||
*c++ = lv_conj(*a++);
|
||||
}
|
||||
|
||||
{
|
||||
*c++ = lv_conj(*a++);
|
||||
}
|
||||
|
||||
}
|
||||
#endif /* LV_HAVE_SSE3 */
|
||||
|
||||
@ -159,14 +162,16 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_u_sse3(lv_8sc_t* cVector, cons
|
||||
\param aVector Vector to be conjugated
|
||||
\param num_points The number of unsigned char values in aVector to be conjugated and stored into cVector
|
||||
*/
|
||||
static inline void volk_gnsssdr_8ic_conjugate_8ic_generic(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){
|
||||
static inline void volk_gnsssdr_8ic_conjugate_8ic_generic(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points)
|
||||
{
|
||||
lv_8sc_t* cPtr = cVector;
|
||||
const lv_8sc_t* aPtr = aVector;
|
||||
unsigned int number = 0;
|
||||
|
||||
for(number = 0; number < num_points; number++){
|
||||
*cPtr++ = lv_conj(*aPtr++);
|
||||
}
|
||||
|
||||
for(number = 0; number < num_points; number++)
|
||||
{
|
||||
*cPtr++ = lv_conj(*aPtr++);
|
||||
}
|
||||
}
|
||||
#endif /* LV_HAVE_GENERIC */
|
||||
|
||||
@ -188,37 +193,38 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_generic(lv_8sc_t* cVector, con
|
||||
\param aVector Vector to be conjugated
|
||||
\param num_points The number of unsigned char values in aVector to be conjugated and stored into cVector
|
||||
*/
|
||||
static inline void volk_gnsssdr_8ic_conjugate_8ic_a_avx(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){
|
||||
static inline void volk_gnsssdr_8ic_conjugate_8ic_a_avx(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points)
|
||||
{
|
||||
const unsigned int sse_iters = num_points / 16;
|
||||
|
||||
|
||||
lv_8sc_t* c = cVector;
|
||||
const lv_8sc_t* a = aVector;
|
||||
|
||||
|
||||
__m256 tmp;
|
||||
__m128i tmp128lo, tmp128hi;
|
||||
__m256 conjugator1 = _mm256_castsi256_ps(_mm256_setr_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255));
|
||||
__m128i conjugator2 = _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1);
|
||||
|
||||
|
||||
for (unsigned int i = 0; i < sse_iters; ++i)
|
||||
{
|
||||
tmp = _mm256_load_ps((float*)a);
|
||||
tmp = _mm256_xor_ps(tmp, conjugator1);
|
||||
tmp128lo = _mm256_castsi256_si128(_mm256_castps_si256(tmp));
|
||||
tmp128lo = _mm_add_epi8(tmp128lo, conjugator2);
|
||||
tmp128hi = _mm256_extractf128_si256(_mm256_castps_si256(tmp),1);
|
||||
tmp128hi = _mm_add_epi8(tmp128hi, conjugator2);
|
||||
//tmp = _mm256_set_m128i(tmp128hi , tmp128lo); //not defined in some versions of immintrin.h
|
||||
tmp = _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(tmp128lo),(tmp128hi),1));
|
||||
_mm256_store_ps((float*)c, tmp);
|
||||
|
||||
a += 16;
|
||||
c += 16;
|
||||
}
|
||||
|
||||
{
|
||||
tmp = _mm256_load_ps((float*)a);
|
||||
tmp = _mm256_xor_ps(tmp, conjugator1);
|
||||
tmp128lo = _mm256_castsi256_si128(_mm256_castps_si256(tmp));
|
||||
tmp128lo = _mm_add_epi8(tmp128lo, conjugator2);
|
||||
tmp128hi = _mm256_extractf128_si256(_mm256_castps_si256(tmp),1);
|
||||
tmp128hi = _mm_add_epi8(tmp128hi, conjugator2);
|
||||
//tmp = _mm256_set_m128i(tmp128hi , tmp128lo); //not defined in some versions of immintrin.h
|
||||
tmp = _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(tmp128lo),(tmp128hi),1));
|
||||
_mm256_store_ps((float*)c, tmp);
|
||||
|
||||
a += 16;
|
||||
c += 16;
|
||||
}
|
||||
|
||||
for (unsigned int i = 0; i<(num_points % 16); ++i)
|
||||
{
|
||||
*c++ = lv_conj(*a++);
|
||||
}
|
||||
{
|
||||
*c++ = lv_conj(*a++);
|
||||
}
|
||||
}
|
||||
#endif /* LV_HAVE_AVX */
|
||||
|
||||
@ -230,29 +236,30 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_a_avx(lv_8sc_t* cVector, const
|
||||
\param aVector Vector to be conjugated
|
||||
\param num_points The number of unsigned char values in aVector to be conjugated and stored into cVector
|
||||
*/
|
||||
static inline void volk_gnsssdr_8ic_conjugate_8ic_a_ssse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){
|
||||
static inline void volk_gnsssdr_8ic_conjugate_8ic_a_ssse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points)
|
||||
{
|
||||
const unsigned int sse_iters = num_points / 8;
|
||||
|
||||
|
||||
lv_8sc_t* c = cVector;
|
||||
const lv_8sc_t* a = aVector;
|
||||
__m128i tmp;
|
||||
|
||||
|
||||
__m128i conjugator = _mm_setr_epi8(1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1);
|
||||
|
||||
|
||||
for (unsigned int i = 0; i < sse_iters; ++i)
|
||||
{
|
||||
tmp = _mm_load_si128((__m128i*)a);
|
||||
tmp = _mm_sign_epi8(tmp, conjugator);
|
||||
_mm_store_si128((__m128i*)c, tmp);
|
||||
a += 8;
|
||||
c += 8;
|
||||
}
|
||||
|
||||
{
|
||||
tmp = _mm_load_si128((__m128i*)a);
|
||||
tmp = _mm_sign_epi8(tmp, conjugator);
|
||||
_mm_store_si128((__m128i*)c, tmp);
|
||||
a += 8;
|
||||
c += 8;
|
||||
}
|
||||
|
||||
for (unsigned int i = 0; i<(num_points % 8); ++i)
|
||||
{
|
||||
*c++ = lv_conj(*a++);
|
||||
}
|
||||
|
||||
{
|
||||
*c++ = lv_conj(*a++);
|
||||
}
|
||||
|
||||
}
|
||||
#endif /* LV_HAVE_SSSE3 */
|
||||
|
||||
@ -264,31 +271,32 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_a_ssse3(lv_8sc_t* cVector, con
|
||||
\param aVector Vector to be conjugated
|
||||
\param num_points The number of unsigned char values in aVector to be conjugated and stored into cVector
|
||||
*/
|
||||
static inline void volk_gnsssdr_8ic_conjugate_8ic_a_sse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){
|
||||
static inline void volk_gnsssdr_8ic_conjugate_8ic_a_sse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points)
|
||||
{
|
||||
const unsigned int sse_iters = num_points / 8;
|
||||
|
||||
|
||||
lv_8sc_t* c = cVector;
|
||||
const lv_8sc_t* a = aVector;
|
||||
__m128i tmp;
|
||||
|
||||
|
||||
__m128i conjugator1 = _mm_setr_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
|
||||
__m128i conjugator2 = _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1);
|
||||
|
||||
|
||||
for (unsigned int i = 0; i < sse_iters; ++i)
|
||||
{
|
||||
tmp = _mm_load_si128((__m128i*)a);
|
||||
tmp = _mm_xor_si128(tmp, conjugator1);
|
||||
tmp = _mm_add_epi8(tmp, conjugator2);
|
||||
_mm_store_si128((__m128i*)c, tmp);
|
||||
a += 8;
|
||||
c += 8;
|
||||
}
|
||||
|
||||
{
|
||||
tmp = _mm_load_si128((__m128i*)a);
|
||||
tmp = _mm_xor_si128(tmp, conjugator1);
|
||||
tmp = _mm_add_epi8(tmp, conjugator2);
|
||||
_mm_store_si128((__m128i*)c, tmp);
|
||||
a += 8;
|
||||
c += 8;
|
||||
}
|
||||
|
||||
for (unsigned int i = 0; i<(num_points % 8); ++i)
|
||||
{
|
||||
*c++ = lv_conj(*a++);
|
||||
}
|
||||
|
||||
{
|
||||
*c++ = lv_conj(*a++);
|
||||
}
|
||||
|
||||
}
|
||||
#endif /* LV_HAVE_SSE3 */
|
||||
|
||||
@ -299,14 +307,16 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_a_sse3(lv_8sc_t* cVector, cons
|
||||
\param aVector Vector to be conjugated
|
||||
\param num_points The number of unsigned char values in aVector to be conjugated and stored into cVector
|
||||
*/
|
||||
static inline void volk_gnsssdr_8ic_conjugate_8ic_a_generic(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){
|
||||
static inline void volk_gnsssdr_8ic_conjugate_8ic_a_generic(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points)
|
||||
{
|
||||
lv_8sc_t* cPtr = cVector;
|
||||
const lv_8sc_t* aPtr = aVector;
|
||||
unsigned int number = 0;
|
||||
|
||||
for(number = 0; number < num_points; number++){
|
||||
*cPtr++ = lv_conj(*aPtr++);
|
||||
}
|
||||
|
||||
for(number = 0; number < num_points; number++)
|
||||
{
|
||||
*cPtr++ = lv_conj(*aPtr++);
|
||||
}
|
||||
}
|
||||
#endif /* LV_HAVE_GENERIC */
|
||||
|
||||
@ -318,7 +328,8 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_a_generic(lv_8sc_t* cVector, c
\param num_points The number of unsigned char values in aVector to be conjugated and stored into cVector
*/
extern void volk_gnsssdr_8ic_conjugate_8ic_a_orc_impl(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points);
static inline void volk_gnsssdr_8ic_conjugate_8ic_u_orc(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){
static inline void volk_gnsssdr_8ic_conjugate_8ic_u_orc(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points)
{
volk_gnsssdr_8ic_conjugate_8ic_a_orc_impl(cVector, aVector, num_points);
}
#endif /* LV_HAVE_ORC */
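For clarity, the conjugation performed by every kernel in this file amounts to negating the imaginary byte of each complex sample, which is what lv_conj() does in the generic path. A stand-alone scalar sketch follows (hypothetical; a local struct stands in for lv_8sc_t so the snippet builds without the VOLK headers).

#include <stdio.h>

typedef struct { signed char r; signed char i; } sc8_t;   /* stand-in for lv_8sc_t */

static void conjugate_sc8_generic(sc8_t* cVector, const sc8_t* aVector, unsigned int num_points)
{
    for (unsigned int n = 0; n < num_points; n++)
        {
            cVector[n].r = aVector[n].r;
            cVector[n].i = (signed char)(-aVector[n].i);   /* negate the imaginary part */
        }
}

int main(void)
{
    sc8_t in[2] = { { 3, 4 }, { -1, 2 } };
    sc8_t out[2];
    conjugate_sc8_generic(out, in, 2);
    printf("(%d,%d) (%d,%d)\n", out[0].r, out[0].i, out[1].r, out[1].i);   /* (3,-4) (-1,-2) */
    return 0;
}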
@ -2,7 +2,7 @@
* \file volk_gnsssdr_8ic_magnitude_squared_8i.h
* \brief Volk protokernel: calculates the magnitude squared of a 16 bits vector
* \authors <ul>
* <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
* <li> Andres Cecilia, 2014. a.cecilia.luque(at)gmail.com
* </ul>
*
* Volk protokernel that calculates the magnitude squared of a
@ -21,7 +21,7 @@
* GNSS-SDR is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* at your option) any later version.
* (at your option) any later version.
*
* GNSS-SDR is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
@ -49,56 +49,54 @@
|
||||
\param magnitudeVector The vector containing the real output values
|
||||
\param num_points The number of complex values in complexVector to be calculated and stored into cVector
|
||||
*/
|
||||
static inline void volk_gnsssdr_8ic_magnitude_squared_8i_u_sse3(char* magnitudeVector, const lv_8sc_t* complexVector, unsigned int num_points){
|
||||
|
||||
static inline void volk_gnsssdr_8ic_magnitude_squared_8i_u_sse3(char* magnitudeVector, const lv_8sc_t* complexVector, unsigned int num_points)
|
||||
{
|
||||
const unsigned int sse_iters = num_points / 16;
|
||||
|
||||
|
||||
const char* complexVectorPtr = (char*)complexVector;
|
||||
char* magnitudeVectorPtr = magnitudeVector;
|
||||
|
||||
|
||||
__m128i zero, result8;
|
||||
__m128i avector, avectorhi, avectorlo, avectorlomult, avectorhimult, aadded, maska;
|
||||
__m128i bvector, bvectorhi, bvectorlo, bvectorlomult, bvectorhimult, badded, maskb;
|
||||
|
||||
|
||||
zero = _mm_setzero_si128();
|
||||
maska = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
|
||||
maskb = _mm_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
|
||||
|
||||
|
||||
for(unsigned int number = 0;number < sse_iters; number++)
|
||||
{
|
||||
avector = _mm_lddqu_si128((__m128i*)complexVectorPtr);
|
||||
avectorlo = _mm_unpacklo_epi8 (avector, zero);
|
||||
avectorhi = _mm_unpackhi_epi8 (avector, zero);
|
||||
avectorlomult = _mm_mullo_epi16 (avectorlo, avectorlo);
|
||||
avectorhimult = _mm_mullo_epi16 (avectorhi, avectorhi);
|
||||
aadded = _mm_hadd_epi16 (avectorlomult, avectorhimult);
|
||||
|
||||
complexVectorPtr += 16;
|
||||
|
||||
bvector = _mm_lddqu_si128((__m128i*)complexVectorPtr);
|
||||
bvectorlo = _mm_unpacklo_epi8 (bvector, zero);
|
||||
bvectorhi = _mm_unpackhi_epi8 (bvector, zero);
|
||||
bvectorlomult = _mm_mullo_epi16 (bvectorlo, bvectorlo);
|
||||
bvectorhimult = _mm_mullo_epi16 (bvectorhi, bvectorhi);
|
||||
badded = _mm_hadd_epi16 (bvectorlomult, bvectorhimult);
|
||||
|
||||
complexVectorPtr += 16;
|
||||
|
||||
result8 = _mm_or_si128(_mm_shuffle_epi8(aadded, maska), _mm_shuffle_epi8(badded, maskb));
|
||||
|
||||
_mm_storeu_si128((__m128i*)magnitudeVectorPtr, result8);
|
||||
|
||||
magnitudeVectorPtr += 16;
|
||||
|
||||
|
||||
}
|
||||
|
||||
{
|
||||
avector = _mm_lddqu_si128((__m128i*)complexVectorPtr);
|
||||
avectorlo = _mm_unpacklo_epi8 (avector, zero);
|
||||
avectorhi = _mm_unpackhi_epi8 (avector, zero);
|
||||
avectorlomult = _mm_mullo_epi16 (avectorlo, avectorlo);
|
||||
avectorhimult = _mm_mullo_epi16 (avectorhi, avectorhi);
|
||||
aadded = _mm_hadd_epi16 (avectorlomult, avectorhimult);
|
||||
|
||||
complexVectorPtr += 16;
|
||||
|
||||
bvector = _mm_lddqu_si128((__m128i*)complexVectorPtr);
|
||||
bvectorlo = _mm_unpacklo_epi8 (bvector, zero);
|
||||
bvectorhi = _mm_unpackhi_epi8 (bvector, zero);
|
||||
bvectorlomult = _mm_mullo_epi16 (bvectorlo, bvectorlo);
|
||||
bvectorhimult = _mm_mullo_epi16 (bvectorhi, bvectorhi);
|
||||
badded = _mm_hadd_epi16 (bvectorlomult, bvectorhimult);
|
||||
|
||||
complexVectorPtr += 16;
|
||||
|
||||
result8 = _mm_or_si128(_mm_shuffle_epi8(aadded, maska), _mm_shuffle_epi8(badded, maskb));
|
||||
|
||||
_mm_storeu_si128((__m128i*)magnitudeVectorPtr, result8);
|
||||
|
||||
magnitudeVectorPtr += 16;
|
||||
}
|
||||
|
||||
for (unsigned int i = 0; i<(num_points % 16); ++i)
|
||||
{
|
||||
const char valReal = *complexVectorPtr++;
|
||||
const char valImag = *complexVectorPtr++;
|
||||
*magnitudeVectorPtr++ = (valReal * valReal) + (valImag * valImag);
|
||||
}
|
||||
{
|
||||
const char valReal = *complexVectorPtr++;
|
||||
const char valImag = *complexVectorPtr++;
|
||||
*magnitudeVectorPtr++ = (valReal * valReal) + (valImag * valImag);
|
||||
}
|
||||
}
|
||||
#endif /* LV_HAVE_SSSE3 */
|
||||
|
||||
@ -155,15 +153,17 @@ static inline void volk_gnsssdr_8ic_magnitude_squared_8i_u_sse3(char* magnitudeV
|
||||
\param magnitudeVector The vector containing the real output values
|
||||
\param num_points The number of complex values in complexVector to be calculated and stored into cVector
|
||||
*/
|
||||
static inline void volk_gnsssdr_8ic_magnitude_squared_8i_generic(char* magnitudeVector, const lv_8sc_t* complexVector, unsigned int num_points){
|
||||
static inline void volk_gnsssdr_8ic_magnitude_squared_8i_generic(char* magnitudeVector, const lv_8sc_t* complexVector, unsigned int num_points)
|
||||
{
|
||||
const char* complexVectorPtr = (char*)complexVector;
|
||||
char* magnitudeVectorPtr = magnitudeVector;
|
||||
|
||||
for(unsigned int number = 0; number < num_points; number++){
|
||||
const char real = *complexVectorPtr++;
|
||||
const char imag = *complexVectorPtr++;
|
||||
*magnitudeVectorPtr++ = (real*real) + (imag*imag);
|
||||
}
|
||||
|
||||
for(unsigned int number = 0; number < num_points; number++)
|
||||
{
|
||||
const char real = *complexVectorPtr++;
|
||||
const char imag = *complexVectorPtr++;
|
||||
*magnitudeVectorPtr++ = (real*real) + (imag*imag);
|
||||
}
|
||||
}
|
||||
#endif /* LV_HAVE_GENERIC */
|
||||
|
||||
@ -185,56 +185,54 @@ static inline void volk_gnsssdr_8ic_magnitude_squared_8i_generic(char* magnitude
|
||||
\param magnitudeVector The vector containing the real output values
|
||||
\param num_points The number of complex values in complexVector to be calculated and stored into cVector
|
||||
*/
|
||||
static inline void volk_gnsssdr_8ic_magnitude_squared_8i_a_sse3(char* magnitudeVector, const lv_8sc_t* complexVector, unsigned int num_points){
|
||||
|
||||
static inline void volk_gnsssdr_8ic_magnitude_squared_8i_a_sse3(char* magnitudeVector, const lv_8sc_t* complexVector, unsigned int num_points)
|
||||
{
|
||||
const unsigned int sse_iters = num_points / 16;
|
||||
|
||||
|
||||
const char* complexVectorPtr = (char*)complexVector;
|
||||
char* magnitudeVectorPtr = magnitudeVector;
|
||||
|
||||
|
||||
__m128i zero, result8;
|
||||
__m128i avector, avectorhi, avectorlo, avectorlomult, avectorhimult, aadded, maska;
|
||||
__m128i bvector, bvectorhi, bvectorlo, bvectorlomult, bvectorhimult, badded, maskb;
|
||||
|
||||
|
||||
zero = _mm_setzero_si128();
|
||||
maska = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
|
||||
maskb = _mm_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
|
||||
|
||||
|
||||
for(unsigned int number = 0;number < sse_iters; number++)
|
||||
{
|
||||
avector = _mm_load_si128((__m128i*)complexVectorPtr);
|
||||
avectorlo = _mm_unpacklo_epi8 (avector, zero);
|
||||
avectorhi = _mm_unpackhi_epi8 (avector, zero);
|
||||
avectorlomult = _mm_mullo_epi16 (avectorlo, avectorlo);
|
||||
avectorhimult = _mm_mullo_epi16 (avectorhi, avectorhi);
|
||||
aadded = _mm_hadd_epi16 (avectorlomult, avectorhimult);
|
||||
|
||||
complexVectorPtr += 16;
|
||||
|
||||
bvector = _mm_load_si128((__m128i*)complexVectorPtr);
|
||||
bvectorlo = _mm_unpacklo_epi8 (bvector, zero);
|
||||
bvectorhi = _mm_unpackhi_epi8 (bvector, zero);
|
||||
bvectorlomult = _mm_mullo_epi16 (bvectorlo, bvectorlo);
|
||||
bvectorhimult = _mm_mullo_epi16 (bvectorhi, bvectorhi);
|
||||
badded = _mm_hadd_epi16 (bvectorlomult, bvectorhimult);
|
||||
|
||||
complexVectorPtr += 16;
|
||||
|
||||
result8 = _mm_or_si128(_mm_shuffle_epi8(aadded, maska), _mm_shuffle_epi8(badded, maskb));
|
||||
|
||||
_mm_store_si128((__m128i*)magnitudeVectorPtr, result8);
|
||||
|
||||
magnitudeVectorPtr += 16;
|
||||
|
||||
|
||||
}
|
||||
|
||||
{
|
||||
avector = _mm_load_si128((__m128i*)complexVectorPtr);
|
||||
avectorlo = _mm_unpacklo_epi8 (avector, zero);
|
||||
avectorhi = _mm_unpackhi_epi8 (avector, zero);
|
||||
avectorlomult = _mm_mullo_epi16 (avectorlo, avectorlo);
|
||||
avectorhimult = _mm_mullo_epi16 (avectorhi, avectorhi);
|
||||
aadded = _mm_hadd_epi16 (avectorlomult, avectorhimult);
|
||||
|
||||
complexVectorPtr += 16;
|
||||
|
||||
bvector = _mm_load_si128((__m128i*)complexVectorPtr);
|
||||
bvectorlo = _mm_unpacklo_epi8 (bvector, zero);
|
||||
bvectorhi = _mm_unpackhi_epi8 (bvector, zero);
|
||||
bvectorlomult = _mm_mullo_epi16 (bvectorlo, bvectorlo);
|
||||
bvectorhimult = _mm_mullo_epi16 (bvectorhi, bvectorhi);
|
||||
badded = _mm_hadd_epi16 (bvectorlomult, bvectorhimult);
|
||||
|
||||
complexVectorPtr += 16;
|
||||
|
||||
result8 = _mm_or_si128(_mm_shuffle_epi8(aadded, maska), _mm_shuffle_epi8(badded, maskb));
|
||||
|
||||
_mm_store_si128((__m128i*)magnitudeVectorPtr, result8);
|
||||
|
||||
magnitudeVectorPtr += 16;
|
||||
}
|
||||
|
||||
for (unsigned int i = 0; i<(num_points % 16); ++i)
|
||||
{
|
||||
const char valReal = *complexVectorPtr++;
|
||||
const char valImag = *complexVectorPtr++;
|
||||
*magnitudeVectorPtr++ = (valReal * valReal) + (valImag * valImag);
|
||||
}
|
||||
{
|
||||
const char valReal = *complexVectorPtr++;
|
||||
const char valImag = *complexVectorPtr++;
|
||||
*magnitudeVectorPtr++ = (valReal * valReal) + (valImag * valImag);
|
||||
}
|
||||
}
|
||||
#endif /* LV_HAVE_SSSE3 */
|
||||
|
||||
@ -291,15 +289,17 @@ static inline void volk_gnsssdr_8ic_magnitude_squared_8i_a_sse3(char* magnitudeV
|
||||
\param magnitudeVector The vector containing the real output values
|
||||
\param num_points The number of complex values in complexVector to be calculated and stored into cVector
|
||||
*/
|
||||
static inline void volk_gnsssdr_8ic_magnitude_squared_8i_a_generic(char* magnitudeVector, const lv_8sc_t* complexVector, unsigned int num_points){
|
||||
static inline void volk_gnsssdr_8ic_magnitude_squared_8i_a_generic(char* magnitudeVector, const lv_8sc_t* complexVector, unsigned int num_points)
|
||||
{
|
||||
const char* complexVectorPtr = (char*)complexVector;
|
||||
char* magnitudeVectorPtr = magnitudeVector;
|
||||
|
||||
for(unsigned int number = 0; number < num_points; number++){
|
||||
const char real = *complexVectorPtr++;
|
||||
const char imag = *complexVectorPtr++;
|
||||
*magnitudeVectorPtr++ = (real*real) + (imag*imag);
|
||||
}
|
||||
|
||||
for(unsigned int number = 0; number < num_points; number++)
|
||||
{
|
||||
const char real = *complexVectorPtr++;
|
||||
const char imag = *complexVectorPtr++;
|
||||
*magnitudeVectorPtr++ = (real*real) + (imag*imag);
|
||||
}
|
||||
}
|
||||
#endif /* LV_HAVE_GENERIC */
|
||||
|
||||
@ -311,7 +311,8 @@ static inline void volk_gnsssdr_8ic_magnitude_squared_8i_a_generic(char* magnitu
\param num_points The number of complex values in complexVector to be calculated and stored into cVector
*/
extern void volk_gnsssdr_8ic_magnitude_squared_8i_a_orc_impl(char* magnitudeVector, const lv_8sc_t* complexVector, unsigned int num_points);
static inline void volk_gnsssdr_8ic_magnitude_squared_8i_u_orc(char* magnitudeVector, const lv_8sc_t* complexVector, unsigned int num_points){
static inline void volk_gnsssdr_8ic_magnitude_squared_8i_u_orc(char* magnitudeVector, const lv_8sc_t* complexVector, unsigned int num_points)
{
volk_gnsssdr_8ic_magnitude_squared_8i_a_orc_impl(magnitudeVector, complexVector, num_points);
}
#endif /* LV_HAVE_ORC */
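The magnitude-squared kernels above compute real*real + imag*imag per sample and store the result back into a single char, so the inputs must be small enough for the squared magnitude to fit in 8 bits. A scalar reference mirroring the generic kernel is sketched here (hypothetical harness; interleaved I/Q bytes stand in for the lv_8sc_t samples, exactly as the kernel reads them).

#include <stdio.h>

static void magnitude_squared_sc8_generic(char* magnitudeVector, const signed char* complexVector, unsigned int num_points)
{
    for (unsigned int n = 0; n < num_points; n++)
        {
            const char real = *complexVector++;
            const char imag = *complexVector++;
            *magnitudeVector++ = (char)((real * real) + (imag * imag));
        }
}

int main(void)
{
    const signed char iq[6] = { 3, 4, 1, 1, 0, 5 };   /* three interleaved I/Q samples */
    char mag2[3];
    magnitude_squared_sc8_generic(mag2, iq, 3);
    printf("%d %d %d\n", (int)mag2[0], (int)mag2[1], (int)mag2[2]);   /* prints 25 2 25 */
    return 0;
}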
@ -2,7 +2,7 @@
* \file volk_gnsssdr_8ic_s8ic_multiply_8ic.h
* \brief Volk protokernel: multiplies a group of 16 bits vectors by one constant vector
* \authors <ul>
* <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
* <li> Andres Cecilia, 2014. a.cecilia.luque(at)gmail.com
* </ul>
*
* Volk protokernel that multiplies a group of 16 bits vectors
@ -20,7 +20,7 @@
* GNSS-SDR is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* at your option) any later version.
* (at your option) any later version.
*
* GNSS-SDR is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
@ -50,54 +50,54 @@
|
||||
\param scalar The complex scalar to multiply aVector
|
||||
\param num_points The number of complex values in aVector to be multiplied by sacalar and stored into cVector
|
||||
*/
|
||||
static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_u_sse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t scalar, unsigned int num_points){
|
||||
|
||||
static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_u_sse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t scalar, unsigned int num_points)
|
||||
{
|
||||
const unsigned int sse_iters = num_points / 8;
|
||||
|
||||
|
||||
__m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc;
|
||||
|
||||
|
||||
lv_8sc_t* c = cVector;
|
||||
const lv_8sc_t* a = aVector;
|
||||
|
||||
|
||||
mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
|
||||
|
||||
|
||||
y = _mm_set1_epi16 (*(short*)&scalar);
|
||||
imagy = _mm_srli_si128 (y, 1);
|
||||
imagy = _mm_and_si128 (imagy, mult1);
|
||||
realy = _mm_and_si128 (y, mult1);
|
||||
|
||||
for(unsigned int number = 0;number < sse_iters; number++){
|
||||
|
||||
x = _mm_lddqu_si128((__m128i*)a);
|
||||
|
||||
imagx = _mm_srli_si128 (x, 1);
|
||||
imagx = _mm_and_si128 (imagx, mult1);
|
||||
realx = _mm_and_si128 (x, mult1);
|
||||
|
||||
realx_mult_realy = _mm_mullo_epi16 (realx, realy);
|
||||
imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
|
||||
realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
|
||||
imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
|
||||
|
||||
realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
|
||||
realc = _mm_and_si128 (realc, mult1);
|
||||
imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
|
||||
imagc = _mm_and_si128 (imagc, mult1);
|
||||
imagc = _mm_slli_si128 (imagc, 1);
|
||||
|
||||
totalc = _mm_or_si128 (realc, imagc);
|
||||
|
||||
_mm_storeu_si128((__m128i*)c, totalc);
|
||||
|
||||
a += 8;
|
||||
c += 8;
|
||||
}
|
||||
|
||||
|
||||
for(unsigned int number = 0;number < sse_iters; number++)
|
||||
{
|
||||
x = _mm_lddqu_si128((__m128i*)a);
|
||||
|
||||
imagx = _mm_srli_si128 (x, 1);
|
||||
imagx = _mm_and_si128 (imagx, mult1);
|
||||
realx = _mm_and_si128 (x, mult1);
|
||||
|
||||
realx_mult_realy = _mm_mullo_epi16 (realx, realy);
|
||||
imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
|
||||
realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
|
||||
imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
|
||||
|
||||
realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
|
||||
realc = _mm_and_si128 (realc, mult1);
|
||||
imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
|
||||
imagc = _mm_and_si128 (imagc, mult1);
|
||||
imagc = _mm_slli_si128 (imagc, 1);
|
||||
|
||||
totalc = _mm_or_si128 (realc, imagc);
|
||||
|
||||
_mm_storeu_si128((__m128i*)c, totalc);
|
||||
|
||||
a += 8;
|
||||
c += 8;
|
||||
}
|
||||
|
||||
for (unsigned int i = 0; i<(num_points % 8); ++i)
|
||||
{
|
||||
*c++ = (*a++) * scalar;
|
||||
}
|
||||
|
||||
{
|
||||
*c++ = (*a++) * scalar;
|
||||
}
|
||||
|
||||
}
|
||||
#endif /* LV_HAVE_SSE3 */
|
||||
|
@ -109,33 +109,34 @@ static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_u_sse3(lv_8sc_t* cVector,
 \param scalar The complex scalar to multiply aVector
 \param num_points The number of complex values in aVector to be multiplied by scalar and stored into cVector
 */
static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_generic(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t scalar, unsigned int num_points){

static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_generic(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t scalar, unsigned int num_points)
{
    /*lv_8sc_t* cPtr = cVector;
    const lv_8sc_t* aPtr = aVector;

    for (int i = 0; i<num_points; ++i)
    {
        *cPtr++ = (*aPtr++) * scalar;
    }*/

    lv_8sc_t* cPtr = cVector;
    const lv_8sc_t* aPtr = aVector;
    unsigned int number = num_points;

    // unwrap loop
    while (number >= 8){
        *cPtr++ = (*aPtr++) * scalar;
        *cPtr++ = (*aPtr++) * scalar;
        *cPtr++ = (*aPtr++) * scalar;
        *cPtr++ = (*aPtr++) * scalar;
        *cPtr++ = (*aPtr++) * scalar;
        *cPtr++ = (*aPtr++) * scalar;
        *cPtr++ = (*aPtr++) * scalar;
        *cPtr++ = (*aPtr++) * scalar;
        number -= 8;
    }

    while (number >= 8)
        {
            *cPtr++ = (*aPtr++) * scalar;
            *cPtr++ = (*aPtr++) * scalar;
            *cPtr++ = (*aPtr++) * scalar;
            *cPtr++ = (*aPtr++) * scalar;
            *cPtr++ = (*aPtr++) * scalar;
            *cPtr++ = (*aPtr++) * scalar;
            *cPtr++ = (*aPtr++) * scalar;
            *cPtr++ = (*aPtr++) * scalar;
            number -= 8;
        }

    // clean up any remaining
    while (number-- > 0)
        *cPtr++ = *aPtr++ * scalar;
@ -162,54 +163,54 @@ static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_generic(lv_8sc_t* cVector,
|
||||
\param scalar The complex scalar to multiply aVector
|
||||
 \param num_points The number of complex values in aVector to be multiplied by scalar and stored into cVector
|
||||
*/
|
||||
static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_a_sse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t scalar, unsigned int num_points){
|
||||
|
||||
static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_a_sse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t scalar, unsigned int num_points)
|
||||
{
|
||||
const unsigned int sse_iters = num_points / 8;
|
||||
|
||||
|
||||
__m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc;
|
||||
|
||||
|
||||
lv_8sc_t* c = cVector;
|
||||
const lv_8sc_t* a = aVector;
|
||||
|
||||
|
||||
mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
|
||||
|
||||
|
||||
y = _mm_set1_epi16 (*(short*)&scalar);
|
||||
imagy = _mm_srli_si128 (y, 1);
|
||||
imagy = _mm_and_si128 (imagy, mult1);
|
||||
realy = _mm_and_si128 (y, mult1);
|
||||
|
||||
for(unsigned int number = 0;number < sse_iters; number++){
|
||||
|
||||
x = _mm_load_si128((__m128i*)a);
|
||||
|
||||
imagx = _mm_srli_si128 (x, 1);
|
||||
imagx = _mm_and_si128 (imagx, mult1);
|
||||
realx = _mm_and_si128 (x, mult1);
|
||||
|
||||
realx_mult_realy = _mm_mullo_epi16 (realx, realy);
|
||||
imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
|
||||
realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
|
||||
imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
|
||||
|
||||
realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
|
||||
realc = _mm_and_si128 (realc, mult1);
|
||||
imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
|
||||
imagc = _mm_and_si128 (imagc, mult1);
|
||||
imagc = _mm_slli_si128 (imagc, 1);
|
||||
|
||||
totalc = _mm_or_si128 (realc, imagc);
|
||||
|
||||
_mm_store_si128((__m128i*)c, totalc);
|
||||
|
||||
a += 8;
|
||||
c += 8;
|
||||
}
|
||||
|
||||
|
||||
for(unsigned int number = 0;number < sse_iters; number++)
|
||||
{
|
||||
x = _mm_load_si128((__m128i*)a);
|
||||
|
||||
imagx = _mm_srli_si128 (x, 1);
|
||||
imagx = _mm_and_si128 (imagx, mult1);
|
||||
realx = _mm_and_si128 (x, mult1);
|
||||
|
||||
realx_mult_realy = _mm_mullo_epi16 (realx, realy);
|
||||
imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
|
||||
realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
|
||||
imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
|
||||
|
||||
realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
|
||||
realc = _mm_and_si128 (realc, mult1);
|
||||
imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
|
||||
imagc = _mm_and_si128 (imagc, mult1);
|
||||
imagc = _mm_slli_si128 (imagc, 1);
|
||||
|
||||
totalc = _mm_or_si128 (realc, imagc);
|
||||
|
||||
_mm_store_si128((__m128i*)c, totalc);
|
||||
|
||||
a += 8;
|
||||
c += 8;
|
||||
}
|
||||
|
||||
for (unsigned int i = 0; i<(num_points % 8); ++i)
|
||||
{
|
||||
*c++ = (*a++) * scalar;
|
||||
}
|
||||
|
||||
{
|
||||
*c++ = (*a++) * scalar;
|
||||
}
|
||||
|
||||
}
|
||||
#endif /* LV_HAVE_SSE3 */
|
||||
|
||||
@ -221,33 +222,33 @@ static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_a_sse3(lv_8sc_t* cVector,
|
||||
\param scalar The complex scalar to multiply aVector
|
||||
 \param num_points The number of complex values in aVector to be multiplied by scalar and stored into cVector
|
||||
*/
|
||||
static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_a_generic(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t scalar, unsigned int num_points){
|
||||
|
||||
static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_a_generic(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t scalar, unsigned int num_points)
|
||||
{
|
||||
/*lv_8sc_t* cPtr = cVector;
|
||||
const lv_8sc_t* aPtr = aVector;
|
||||
|
||||
|
||||
for (int i = 0; i<num_points; ++i)
|
||||
{
|
||||
*cPtr++ = (*aPtr++) * scalar;
|
||||
}*/
|
||||
|
||||
|
||||
lv_8sc_t* cPtr = cVector;
|
||||
const lv_8sc_t* aPtr = aVector;
|
||||
unsigned int number = num_points;
|
||||
|
||||
|
||||
// unwrap loop
|
||||
while (number >= 8){
|
||||
*cPtr++ = (*aPtr++) * scalar;
|
||||
*cPtr++ = (*aPtr++) * scalar;
|
||||
*cPtr++ = (*aPtr++) * scalar;
|
||||
*cPtr++ = (*aPtr++) * scalar;
|
||||
*cPtr++ = (*aPtr++) * scalar;
|
||||
*cPtr++ = (*aPtr++) * scalar;
|
||||
*cPtr++ = (*aPtr++) * scalar;
|
||||
*cPtr++ = (*aPtr++) * scalar;
|
||||
number -= 8;
|
||||
*cPtr++ = (*aPtr++) * scalar;
|
||||
*cPtr++ = (*aPtr++) * scalar;
|
||||
*cPtr++ = (*aPtr++) * scalar;
|
||||
*cPtr++ = (*aPtr++) * scalar;
|
||||
*cPtr++ = (*aPtr++) * scalar;
|
||||
*cPtr++ = (*aPtr++) * scalar;
|
||||
*cPtr++ = (*aPtr++) * scalar;
|
||||
*cPtr++ = (*aPtr++) * scalar;
|
||||
number -= 8;
|
||||
}
|
||||
|
||||
|
||||
// clean up any remaining
|
||||
while (number-- > 0)
|
||||
*cPtr++ = *aPtr++ * scalar;
|
@ -263,7 +264,8 @@ static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_a_generic(lv_8sc_t* cVecto
 \param num_points The number of complex values in aVector to be multiplied by scalar and stored into cVector
 */
extern void volk_gnsssdr_8ic_s8ic_multiply_8ic_a_orc_impl(lv_8sc_t* cVector, const lv_8sc_t* aVector, const char scalarreal, const char scalarimag, unsigned int num_points);
static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_u_orc(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t scalar, unsigned int num_points){

static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_u_orc(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t scalar, unsigned int num_points)
{
    volk_gnsssdr_8ic_s8ic_multiply_8ic_a_orc_impl(cVector, aVector, lv_creal(scalar), lv_cimag(scalar), num_points);
}
#endif /* LV_HAVE_ORC */
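A minimal, hedged usage sketch for the scalar-multiply kernel family above; the generic implementation is always available, and the include path and buffer sizes are illustrative assumptions:

#include <volk_gnsssdr/volk_gnsssdr.h>   /* assumed top-level header of the volk_gnsssdr module */

static void example_scalar_multiply(void)
{
    lv_8sc_t in[16], out[16];
    for (unsigned int i = 0; i < 16; i++)
        in[i] = lv_cmake((char)i, (char)(-(char)i));     /* arbitrary test data */

    const lv_8sc_t scalar = lv_cmake(2, 1);              /* multiply every sample by 2 + 1j */
    volk_gnsssdr_8ic_s8ic_multiply_8ic_generic(out, in, scalar, 16);
}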

@ -2,7 +2,7 @@
* \file volk_gnsssdr_8ic_x2_dot_prod_8ic.h
* \brief Volk protokernel: multiplies two 16 bits vectors and accumulates them
* \authors <ul>
* <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
* <li> Andres Cecilia, 2014. a.cecilia.luque(at)gmail.com
* </ul>
*
* Volk protokernel that multiplies two 16 bits vectors (8 bits the real part
@ -20,7 +20,7 @@
* GNSS-SDR is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* at your option) any later version.
* (at your option) any later version.
*
* GNSS-SDR is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
@ -49,43 +49,45 @@
 \param bVector One of the vectors to be multiplied and accumulated
 \param num_points The number of complex values in aVector and bVector to be multiplied together, accumulated and stored into cVector
 */
static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_generic(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points) {

static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_generic(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points)
{
    /*lv_8sc_t* cPtr = result;
    const lv_8sc_t* aPtr = input;
    const lv_8sc_t* bPtr = taps;

    for(int number = 0; number < num_points; number++){
        *cPtr += (*aPtr++) * (*bPtr++);
    }*/

    char * res = (char*) result;
    char * in = (char*) input;
    char * tp = (char*) taps;
    unsigned int n_2_ccomplex_blocks = num_points/2;
    unsigned int isodd = num_points & 1;

    char sum0[2] = {0,0};
    char sum1[2] = {0,0};
    unsigned int i = 0;

    for(i = 0; i < n_2_ccomplex_blocks; ++i) {
        sum0[0] += in[0] * tp[0] - in[1] * tp[1];
        sum0[1] += in[0] * tp[1] + in[1] * tp[0];
        sum1[0] += in[2] * tp[2] - in[3] * tp[3];
        sum1[1] += in[2] * tp[3] + in[3] * tp[2];

        in += 4;
        tp += 4;
    }

    for(i = 0; i < n_2_ccomplex_blocks; ++i)
        {
            sum0[0] += in[0] * tp[0] - in[1] * tp[1];
            sum0[1] += in[0] * tp[1] + in[1] * tp[0];
            sum1[0] += in[2] * tp[2] - in[3] * tp[3];
            sum1[1] += in[2] * tp[3] + in[3] * tp[2];

            in += 4;
            tp += 4;
        }

    res[0] = sum0[0] + sum1[0];
    res[1] = sum0[1] + sum1[1];

    // Cleanup if we had an odd number of points
    for(i = 0; i < isodd; ++i) {
        *result += input[num_points - 1] * taps[num_points - 1];
    }
    for(i = 0; i < isodd; ++i)
        {
            *result += input[num_points - 1] * taps[num_points - 1];
        }
}

#endif /*LV_HAVE_GENERIC*/
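Editor's note: the generic kernel above keeps two interleaved byte accumulators (sum0 for even-indexed samples, sum1 for odd-indexed ones) and folds them together at the end, with a separate pass for an odd trailing point. A single-accumulator reference that is equivalent up to 8-bit wrap-around; the function name is illustrative, not part of the kernel's API:

static inline void dot_prod_8ic_reference(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points)
{
    lv_8sc_t acc = lv_cmake(0, 0);
    for (unsigned int i = 0; i < num_points; i++)
        acc += input[i] * taps[i];   /* complex char multiply-accumulate; products and sums wrap at 8 bits */
    *result = acc;
}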
@ -99,73 +101,73 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_generic(lv_8sc_t* result, co
|
||||
\param bVector One of the vectors to be multiplied and accumulated
|
||||
\param num_points The number of complex values in aVector and bVector to be multiplied together, accumulated and stored into cVector
|
||||
*/
|
||||
static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_sse2(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points) {
|
||||
|
||||
static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_sse2(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points)
|
||||
{
|
||||
lv_8sc_t dotProduct;
|
||||
memset(&dotProduct, 0x0, 2*sizeof(char));
|
||||
|
||||
|
||||
const lv_8sc_t* a = input;
|
||||
const lv_8sc_t* b = taps;
|
||||
|
||||
|
||||
const unsigned int sse_iters = num_points/8;
|
||||
|
||||
|
||||
if (sse_iters>0)
|
||||
{
|
||||
__m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc, realcacc, imagcacc;
|
||||
|
||||
mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
|
||||
realcacc = _mm_setzero_si128();
|
||||
imagcacc = _mm_setzero_si128();
|
||||
|
||||
for(unsigned int number = 0; number < sse_iters; number++){
|
||||
|
||||
x = _mm_loadu_si128((__m128i*)a);
|
||||
y = _mm_loadu_si128((__m128i*)b);
|
||||
|
||||
imagx = _mm_srli_si128 (x, 1);
|
||||
imagx = _mm_and_si128 (imagx, mult1);
|
||||
realx = _mm_and_si128 (x, mult1);
|
||||
|
||||
imagy = _mm_srli_si128 (y, 1);
|
||||
imagy = _mm_and_si128 (imagy, mult1);
|
||||
realy = _mm_and_si128 (y, mult1);
|
||||
|
||||
realx_mult_realy = _mm_mullo_epi16 (realx, realy);
|
||||
imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
|
||||
realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
|
||||
imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
|
||||
|
||||
realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
|
||||
imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
|
||||
|
||||
realcacc = _mm_add_epi16 (realcacc, realc);
|
||||
imagcacc = _mm_add_epi16 (imagcacc, imagc);
|
||||
|
||||
a += 8;
|
||||
b += 8;
|
||||
}
|
||||
|
||||
realcacc = _mm_and_si128 (realcacc, mult1);
|
||||
imagcacc = _mm_and_si128 (imagcacc, mult1);
|
||||
imagcacc = _mm_slli_si128 (imagcacc, 1);
|
||||
|
||||
totalc = _mm_or_si128 (realcacc, imagcacc);
|
||||
|
||||
__VOLK_ATTR_ALIGNED(16) lv_8sc_t dotProductVector[8];
|
||||
|
||||
_mm_storeu_si128((__m128i*)dotProductVector,totalc); // Store the results back into the dot product vector
|
||||
|
||||
for (int i = 0; i<8; ++i)
|
||||
{
|
||||
dotProduct += dotProductVector[i];
|
||||
__m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc, realcacc, imagcacc;
|
||||
|
||||
mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
|
||||
realcacc = _mm_setzero_si128();
|
||||
imagcacc = _mm_setzero_si128();
|
||||
|
||||
for(unsigned int number = 0; number < sse_iters; number++)
|
||||
{
|
||||
x = _mm_loadu_si128((__m128i*)a);
|
||||
y = _mm_loadu_si128((__m128i*)b);
|
||||
|
||||
imagx = _mm_srli_si128 (x, 1);
|
||||
imagx = _mm_and_si128 (imagx, mult1);
|
||||
realx = _mm_and_si128 (x, mult1);
|
||||
|
||||
imagy = _mm_srli_si128 (y, 1);
|
||||
imagy = _mm_and_si128 (imagy, mult1);
|
||||
realy = _mm_and_si128 (y, mult1);
|
||||
|
||||
realx_mult_realy = _mm_mullo_epi16 (realx, realy);
|
||||
imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
|
||||
realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
|
||||
imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
|
||||
|
||||
realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
|
||||
imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
|
||||
|
||||
realcacc = _mm_add_epi16 (realcacc, realc);
|
||||
imagcacc = _mm_add_epi16 (imagcacc, imagc);
|
||||
|
||||
a += 8;
|
||||
b += 8;
|
||||
}
|
||||
|
||||
realcacc = _mm_and_si128 (realcacc, mult1);
|
||||
imagcacc = _mm_and_si128 (imagcacc, mult1);
|
||||
imagcacc = _mm_slli_si128 (imagcacc, 1);
|
||||
|
||||
totalc = _mm_or_si128 (realcacc, imagcacc);
|
||||
|
||||
__VOLK_ATTR_ALIGNED(16) lv_8sc_t dotProductVector[8];
|
||||
|
||||
_mm_storeu_si128((__m128i*)dotProductVector,totalc); // Store the results back into the dot product vector
|
||||
|
||||
for (int i = 0; i<8; ++i)
|
||||
{
|
||||
dotProduct += dotProductVector[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
for (unsigned int i = 0; i<(num_points % 8); ++i)
|
||||
{
|
||||
dotProduct += (*a++) * (*b++);
|
||||
}
|
||||
|
||||
{
|
||||
dotProduct += (*a++) * (*b++);
|
||||
}
|
||||
|
||||
*result = dotProduct;
|
||||
}
|
||||
|
||||
@ -180,71 +182,71 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_sse2(lv_8sc_t* result, con
|
||||
\param bVector One of the vectors to be multiplied and accumulated
|
||||
\param num_points The number of complex values in aVector and bVector to be multiplied together, accumulated and stored into cVector
|
||||
*/
|
||||
static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_sse4_1(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points) {
|
||||
|
||||
static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_sse4_1(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points)
|
||||
{
|
||||
lv_8sc_t dotProduct;
|
||||
memset(&dotProduct, 0x0, 2*sizeof(char));
|
||||
|
||||
|
||||
const lv_8sc_t* a = input;
|
||||
const lv_8sc_t* b = taps;
|
||||
|
||||
|
||||
const unsigned int sse_iters = num_points/8;
|
||||
|
||||
|
||||
if (sse_iters>0)
|
||||
{
|
||||
__m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc, realcacc, imagcacc;
|
||||
|
||||
mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
|
||||
realcacc = _mm_setzero_si128();
|
||||
imagcacc = _mm_setzero_si128();
|
||||
|
||||
for(unsigned int number = 0; number < sse_iters; number++){
|
||||
|
||||
x = _mm_lddqu_si128((__m128i*)a);
|
||||
y = _mm_lddqu_si128((__m128i*)b);
|
||||
|
||||
imagx = _mm_srli_si128 (x, 1);
|
||||
imagx = _mm_and_si128 (imagx, mult1);
|
||||
realx = _mm_and_si128 (x, mult1);
|
||||
|
||||
imagy = _mm_srli_si128 (y, 1);
|
||||
imagy = _mm_and_si128 (imagy, mult1);
|
||||
realy = _mm_and_si128 (y, mult1);
|
||||
|
||||
realx_mult_realy = _mm_mullo_epi16 (realx, realy);
|
||||
imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
|
||||
realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
|
||||
imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
|
||||
|
||||
realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
|
||||
imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
|
||||
|
||||
realcacc = _mm_add_epi16 (realcacc, realc);
|
||||
imagcacc = _mm_add_epi16 (imagcacc, imagc);
|
||||
|
||||
a += 8;
|
||||
b += 8;
|
||||
}
|
||||
|
||||
imagcacc = _mm_slli_si128 (imagcacc, 1);
|
||||
|
||||
totalc = _mm_blendv_epi8 (imagcacc, realcacc, mult1);
|
||||
|
||||
__VOLK_ATTR_ALIGNED(16) lv_8sc_t dotProductVector[8];
|
||||
|
||||
_mm_storeu_si128((__m128i*)dotProductVector,totalc); // Store the results back into the dot product vector
|
||||
|
||||
for (unsigned int i = 0; i<8; ++i)
|
||||
{
|
||||
dotProduct += dotProductVector[i];
|
||||
__m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc, realcacc, imagcacc;
|
||||
|
||||
mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
|
||||
realcacc = _mm_setzero_si128();
|
||||
imagcacc = _mm_setzero_si128();
|
||||
|
||||
for(unsigned int number = 0; number < sse_iters; number++)
|
||||
{
|
||||
x = _mm_lddqu_si128((__m128i*)a);
|
||||
y = _mm_lddqu_si128((__m128i*)b);
|
||||
|
||||
imagx = _mm_srli_si128 (x, 1);
|
||||
imagx = _mm_and_si128 (imagx, mult1);
|
||||
realx = _mm_and_si128 (x, mult1);
|
||||
|
||||
imagy = _mm_srli_si128 (y, 1);
|
||||
imagy = _mm_and_si128 (imagy, mult1);
|
||||
realy = _mm_and_si128 (y, mult1);
|
||||
|
||||
realx_mult_realy = _mm_mullo_epi16 (realx, realy);
|
||||
imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
|
||||
realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
|
||||
imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
|
||||
|
||||
realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
|
||||
imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
|
||||
|
||||
realcacc = _mm_add_epi16 (realcacc, realc);
|
||||
imagcacc = _mm_add_epi16 (imagcacc, imagc);
|
||||
|
||||
a += 8;
|
||||
b += 8;
|
||||
}
|
||||
|
||||
imagcacc = _mm_slli_si128 (imagcacc, 1);
|
||||
|
||||
totalc = _mm_blendv_epi8 (imagcacc, realcacc, mult1);
|
||||
|
||||
__VOLK_ATTR_ALIGNED(16) lv_8sc_t dotProductVector[8];
|
||||
|
||||
_mm_storeu_si128((__m128i*)dotProductVector,totalc); // Store the results back into the dot product vector
|
||||
|
||||
for (unsigned int i = 0; i<8; ++i)
|
||||
{
|
||||
dotProduct += dotProductVector[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
for (unsigned int i = 0; i<(num_points % 8); ++i)
|
||||
{
|
||||
dotProduct += (*a++) * (*b++);
|
||||
}
|
||||
|
||||
{
|
||||
dotProduct += (*a++) * (*b++);
|
||||
}
|
||||
|
||||
*result = dotProduct;
|
||||
}
|
||||
|
||||
@ -270,43 +272,45 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_sse4_1(lv_8sc_t* result, c
|
||||
\param bVector One of the vectors to be multiplied and accumulated
|
||||
\param num_points The number of complex values in aVector and bVector to be multiplied together, accumulated and stored into cVector
|
||||
*/
|
||||
static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_generic(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points) {
|
||||
|
||||
static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_generic(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points)
|
||||
{
|
||||
/*lv_8sc_t* cPtr = result;
|
||||
const lv_8sc_t* aPtr = input;
|
||||
const lv_8sc_t* bPtr = taps;
|
||||
|
||||
|
||||
for(int number = 0; number < num_points; number++){
|
||||
*cPtr += (*aPtr++) * (*bPtr++);
|
||||
}*/
|
||||
|
||||
|
||||
char * res = (char*) result;
|
||||
char * in = (char*) input;
|
||||
char * tp = (char*) taps;
|
||||
unsigned int n_2_ccomplex_blocks = num_points/2;
|
||||
unsigned int isodd = num_points & 1;
|
||||
|
||||
|
||||
char sum0[2] = {0,0};
|
||||
char sum1[2] = {0,0};
|
||||
unsigned int i = 0;
|
||||
|
||||
for(i = 0; i < n_2_ccomplex_blocks; ++i) {
|
||||
sum0[0] += in[0] * tp[0] - in[1] * tp[1];
|
||||
sum0[1] += in[0] * tp[1] + in[1] * tp[0];
|
||||
sum1[0] += in[2] * tp[2] - in[3] * tp[3];
|
||||
sum1[1] += in[2] * tp[3] + in[3] * tp[2];
|
||||
|
||||
in += 4;
|
||||
tp += 4;
|
||||
}
|
||||
|
||||
|
||||
for(i = 0; i < n_2_ccomplex_blocks; ++i)
|
||||
{
|
||||
sum0[0] += in[0] * tp[0] - in[1] * tp[1];
|
||||
sum0[1] += in[0] * tp[1] + in[1] * tp[0];
|
||||
sum1[0] += in[2] * tp[2] - in[3] * tp[3];
|
||||
sum1[1] += in[2] * tp[3] + in[3] * tp[2];
|
||||
|
||||
in += 4;
|
||||
tp += 4;
|
||||
}
|
||||
|
||||
res[0] = sum0[0] + sum1[0];
|
||||
res[1] = sum0[1] + sum1[1];
|
||||
|
||||
|
||||
// Cleanup if we had an odd number of points
|
||||
for(i = 0; i < isodd; ++i) {
|
||||
*result += input[num_points - 1] * taps[num_points - 1];
|
||||
}
|
||||
for(i = 0; i < isodd; ++i)
|
||||
{
|
||||
*result += input[num_points - 1] * taps[num_points - 1];
|
||||
}
|
||||
}
|
||||
|
||||
#endif /*LV_HAVE_GENERIC*/
|
||||
@ -320,73 +324,73 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_generic(lv_8sc_t* result,
|
||||
\param bVector One of the vectors to be multiplied and accumulated
|
||||
\param num_points The number of complex values in aVector and bVector to be multiplied together, accumulated and stored into cVector
|
||||
*/
|
||||
static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_sse2(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points) {
|
||||
|
||||
static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_sse2(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points)
|
||||
{
|
||||
lv_8sc_t dotProduct;
|
||||
memset(&dotProduct, 0x0, 2*sizeof(char));
|
||||
|
||||
|
||||
const lv_8sc_t* a = input;
|
||||
const lv_8sc_t* b = taps;
|
||||
|
||||
|
||||
const unsigned int sse_iters = num_points/8;
|
||||
|
||||
|
||||
if (sse_iters>0)
|
||||
{
|
||||
__m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc, realcacc, imagcacc;
|
||||
|
||||
mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
|
||||
realcacc = _mm_setzero_si128();
|
||||
imagcacc = _mm_setzero_si128();
|
||||
|
||||
for(unsigned int number = 0; number < sse_iters; number++){
|
||||
|
||||
x = _mm_load_si128((__m128i*)a);
|
||||
y = _mm_load_si128((__m128i*)b);
|
||||
|
||||
imagx = _mm_srli_si128 (x, 1);
|
||||
imagx = _mm_and_si128 (imagx, mult1);
|
||||
realx = _mm_and_si128 (x, mult1);
|
||||
|
||||
imagy = _mm_srli_si128 (y, 1);
|
||||
imagy = _mm_and_si128 (imagy, mult1);
|
||||
realy = _mm_and_si128 (y, mult1);
|
||||
|
||||
realx_mult_realy = _mm_mullo_epi16 (realx, realy);
|
||||
imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
|
||||
realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
|
||||
imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
|
||||
|
||||
realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
|
||||
imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
|
||||
|
||||
realcacc = _mm_add_epi16 (realcacc, realc);
|
||||
imagcacc = _mm_add_epi16 (imagcacc, imagc);
|
||||
|
||||
a += 8;
|
||||
b += 8;
|
||||
}
|
||||
|
||||
realcacc = _mm_and_si128 (realcacc, mult1);
|
||||
imagcacc = _mm_and_si128 (imagcacc, mult1);
|
||||
imagcacc = _mm_slli_si128 (imagcacc, 1);
|
||||
|
||||
totalc = _mm_or_si128 (realcacc, imagcacc);
|
||||
|
||||
__VOLK_ATTR_ALIGNED(16) lv_8sc_t dotProductVector[8];
|
||||
|
||||
_mm_store_si128((__m128i*)dotProductVector,totalc); // Store the results back into the dot product vector
|
||||
|
||||
for (unsigned int i = 0; i<8; ++i)
|
||||
{
|
||||
dotProduct += dotProductVector[i];
|
||||
__m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc, realcacc, imagcacc;
|
||||
|
||||
mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
|
||||
realcacc = _mm_setzero_si128();
|
||||
imagcacc = _mm_setzero_si128();
|
||||
|
||||
for(unsigned int number = 0; number < sse_iters; number++)
|
||||
{
|
||||
x = _mm_load_si128((__m128i*)a);
|
||||
y = _mm_load_si128((__m128i*)b);
|
||||
|
||||
imagx = _mm_srli_si128 (x, 1);
|
||||
imagx = _mm_and_si128 (imagx, mult1);
|
||||
realx = _mm_and_si128 (x, mult1);
|
||||
|
||||
imagy = _mm_srli_si128 (y, 1);
|
||||
imagy = _mm_and_si128 (imagy, mult1);
|
||||
realy = _mm_and_si128 (y, mult1);
|
||||
|
||||
realx_mult_realy = _mm_mullo_epi16 (realx, realy);
|
||||
imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
|
||||
realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
|
||||
imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
|
||||
|
||||
realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
|
||||
imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
|
||||
|
||||
realcacc = _mm_add_epi16 (realcacc, realc);
|
||||
imagcacc = _mm_add_epi16 (imagcacc, imagc);
|
||||
|
||||
a += 8;
|
||||
b += 8;
|
||||
}
|
||||
|
||||
realcacc = _mm_and_si128 (realcacc, mult1);
|
||||
imagcacc = _mm_and_si128 (imagcacc, mult1);
|
||||
imagcacc = _mm_slli_si128 (imagcacc, 1);
|
||||
|
||||
totalc = _mm_or_si128 (realcacc, imagcacc);
|
||||
|
||||
__VOLK_ATTR_ALIGNED(16) lv_8sc_t dotProductVector[8];
|
||||
|
||||
_mm_store_si128((__m128i*)dotProductVector,totalc); // Store the results back into the dot product vector
|
||||
|
||||
for (unsigned int i = 0; i<8; ++i)
|
||||
{
|
||||
dotProduct += dotProductVector[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
for (unsigned int i = 0; i<(num_points % 8); ++i)
|
||||
{
|
||||
dotProduct += (*a++) * (*b++);
|
||||
}
|
||||
|
||||
{
|
||||
dotProduct += (*a++) * (*b++);
|
||||
}
|
||||
|
||||
*result = dotProduct;
|
||||
}
|
||||
|
||||
@ -401,71 +405,71 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_sse2(lv_8sc_t* result, con
|
||||
\param bVector One of the vectors to be multiplied and accumulated
|
||||
\param num_points The number of complex values in aVector and bVector to be multiplied together, accumulated and stored into cVector
|
||||
*/
|
||||
static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_sse4_1(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points) {
|
||||
|
||||
static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_sse4_1(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points)
|
||||
{
|
||||
lv_8sc_t dotProduct;
|
||||
memset(&dotProduct, 0x0, 2*sizeof(char));
|
||||
|
||||
|
||||
const lv_8sc_t* a = input;
|
||||
const lv_8sc_t* b = taps;
|
||||
|
||||
|
||||
const unsigned int sse_iters = num_points/8;
|
||||
|
||||
|
||||
if (sse_iters>0)
|
||||
{
|
||||
__m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc, realcacc, imagcacc;
|
||||
|
||||
mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
|
||||
realcacc = _mm_setzero_si128();
|
||||
imagcacc = _mm_setzero_si128();
|
||||
|
||||
for(unsigned int number = 0; number < sse_iters; number++){
|
||||
|
||||
x = _mm_load_si128((__m128i*)a);
|
||||
y = _mm_load_si128((__m128i*)b);
|
||||
|
||||
imagx = _mm_srli_si128 (x, 1);
|
||||
imagx = _mm_and_si128 (imagx, mult1);
|
||||
realx = _mm_and_si128 (x, mult1);
|
||||
|
||||
imagy = _mm_srli_si128 (y, 1);
|
||||
imagy = _mm_and_si128 (imagy, mult1);
|
||||
realy = _mm_and_si128 (y, mult1);
|
||||
|
||||
realx_mult_realy = _mm_mullo_epi16 (realx, realy);
|
||||
imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
|
||||
realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
|
||||
imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
|
||||
|
||||
realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
|
||||
imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
|
||||
|
||||
realcacc = _mm_add_epi16 (realcacc, realc);
|
||||
imagcacc = _mm_add_epi16 (imagcacc, imagc);
|
||||
|
||||
a += 8;
|
||||
b += 8;
|
||||
}
|
||||
|
||||
imagcacc = _mm_slli_si128 (imagcacc, 1);
|
||||
|
||||
totalc = _mm_blendv_epi8 (imagcacc, realcacc, mult1);
|
||||
|
||||
__VOLK_ATTR_ALIGNED(16) lv_8sc_t dotProductVector[8];
|
||||
|
||||
_mm_store_si128((__m128i*)dotProductVector,totalc); // Store the results back into the dot product vector
|
||||
|
||||
for (unsigned int i = 0; i<8; ++i)
|
||||
{
|
||||
dotProduct += dotProductVector[i];
|
||||
__m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc, realcacc, imagcacc;
|
||||
|
||||
mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
|
||||
realcacc = _mm_setzero_si128();
|
||||
imagcacc = _mm_setzero_si128();
|
||||
|
||||
for(unsigned int number = 0; number < sse_iters; number++)
|
||||
{
|
||||
x = _mm_load_si128((__m128i*)a);
|
||||
y = _mm_load_si128((__m128i*)b);
|
||||
|
||||
imagx = _mm_srli_si128 (x, 1);
|
||||
imagx = _mm_and_si128 (imagx, mult1);
|
||||
realx = _mm_and_si128 (x, mult1);
|
||||
|
||||
imagy = _mm_srli_si128 (y, 1);
|
||||
imagy = _mm_and_si128 (imagy, mult1);
|
||||
realy = _mm_and_si128 (y, mult1);
|
||||
|
||||
realx_mult_realy = _mm_mullo_epi16 (realx, realy);
|
||||
imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
|
||||
realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
|
||||
imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
|
||||
|
||||
realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
|
||||
imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
|
||||
|
||||
realcacc = _mm_add_epi16 (realcacc, realc);
|
||||
imagcacc = _mm_add_epi16 (imagcacc, imagc);
|
||||
|
||||
a += 8;
|
||||
b += 8;
|
||||
}
|
||||
|
||||
imagcacc = _mm_slli_si128 (imagcacc, 1);
|
||||
|
||||
totalc = _mm_blendv_epi8 (imagcacc, realcacc, mult1);
|
||||
|
||||
__VOLK_ATTR_ALIGNED(16) lv_8sc_t dotProductVector[8];
|
||||
|
||||
_mm_store_si128((__m128i*)dotProductVector,totalc); // Store the results back into the dot product vector
|
||||
|
||||
for (unsigned int i = 0; i<8; ++i)
|
||||
{
|
||||
dotProduct += dotProductVector[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
for (unsigned int i = 0; i<(num_points % 8); ++i)
|
||||
{
|
||||
dotProduct += (*a++) * (*b++);
|
||||
}
|
||||
|
||||
{
|
||||
dotProduct += (*a++) * (*b++);
|
||||
}
|
||||
|
||||
*result = dotProduct;
|
||||
}
|
||||
|
@ -480,18 +484,18 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_sse4_1(lv_8sc_t* result, c
 \param num_points The number of complex values in aVector and bVector to be multiplied together, accumulated and stored into cVector
 */
extern void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_orc_impl(short* resRealShort, short* resImagShort, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points);
static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_orc(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points){

static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_orc(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points)
{
    short resReal = 0;
    char* resRealChar = (char*)&resReal;
    resRealChar++;

    short resImag = 0;
    char* resImagChar = (char*)&resImag;
    resImagChar++;

    volk_gnsssdr_8ic_x2_dot_prod_8ic_a_orc_impl(&resReal, &resImag, input, taps, num_points);

    *result = lv_cmake(*resRealChar, *resImagChar);
}
#endif /* LV_HAVE_ORC */

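The _a_ dot-product kernels in this file expect 16-byte aligned buffers, while the _u_ variants accept unaligned pointers. A minimal sketch of preparing aligned storage with POSIX facilities; posix_memalign and the SSE2 build guard are assumptions about the target environment, not something this header mandates:

#include <stdlib.h>   /* posix_memalign, free */

static void example_aligned_dot_prod(unsigned int num_points)
{
    void *in_raw, *taps_raw;
    if (posix_memalign(&in_raw, 16, num_points * sizeof(lv_8sc_t)) != 0) return;
    if (posix_memalign(&taps_raw, 16, num_points * sizeof(lv_8sc_t)) != 0) { free(in_raw); return; }

    lv_8sc_t* in   = (lv_8sc_t*)in_raw;
    lv_8sc_t* taps = (lv_8sc_t*)taps_raw;
    /* ... fill in[] and taps[] ... */

    lv_8sc_t result;
    volk_gnsssdr_8ic_x2_dot_prod_8ic_a_sse2(&result, in, taps, num_points);   /* aligned variant */

    free(in_raw);
    free(taps_raw);
}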
@ -2,7 +2,7 @@
* \file volk_gnsssdr_8ic_x2_multiply_8ic.h
* \brief Volk protokernel: multiplies two 16 bits vectors
* \authors <ul>
* <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
* <li> Andres Cecilia, 2014. a.cecilia.luque(at)gmail.com
* </ul>
*
* Volk protokernel that multiplies two 16 bits vectors (8 bits the real part
@ -20,7 +20,7 @@
* GNSS-SDR is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* at your option) any later version.
* (at your option) any later version.
*
* GNSS-SDR is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
@ -49,54 +49,54 @@
|
||||
\param bVector One of the vectors to be multiplied
|
||||
\param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
|
||||
*/
|
||||
static inline void volk_gnsssdr_8ic_x2_multiply_8ic_u_sse2(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){
|
||||
|
||||
static inline void volk_gnsssdr_8ic_x2_multiply_8ic_u_sse2(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points)
|
||||
{
|
||||
const unsigned int sse_iters = num_points / 8;
|
||||
|
||||
|
||||
__m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc;
|
||||
lv_8sc_t* c = cVector;
|
||||
const lv_8sc_t* a = aVector;
|
||||
const lv_8sc_t* b = bVector;
|
||||
|
||||
|
||||
mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
|
||||
|
||||
for(unsigned int number = 0;number < sse_iters; number++){
|
||||
|
||||
x = _mm_loadu_si128((__m128i*)a);
|
||||
y = _mm_loadu_si128((__m128i*)b);
|
||||
|
||||
imagx = _mm_srli_si128 (x, 1);
|
||||
imagx = _mm_and_si128 (imagx, mult1);
|
||||
realx = _mm_and_si128 (x, mult1);
|
||||
|
||||
imagy = _mm_srli_si128 (y, 1);
|
||||
imagy = _mm_and_si128 (imagy, mult1);
|
||||
realy = _mm_and_si128 (y, mult1);
|
||||
|
||||
realx_mult_realy = _mm_mullo_epi16 (realx, realy);
|
||||
imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
|
||||
realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
|
||||
imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
|
||||
|
||||
realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
|
||||
realc = _mm_and_si128 (realc, mult1);
|
||||
imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
|
||||
imagc = _mm_and_si128 (imagc, mult1);
|
||||
imagc = _mm_slli_si128 (imagc, 1);
|
||||
|
||||
totalc = _mm_or_si128 (realc, imagc);
|
||||
|
||||
_mm_storeu_si128((__m128i*)c, totalc);
|
||||
|
||||
a += 8;
|
||||
b += 8;
|
||||
c += 8;
|
||||
}
|
||||
|
||||
|
||||
for(unsigned int number = 0;number < sse_iters; number++)
|
||||
{
|
||||
x = _mm_loadu_si128((__m128i*)a);
|
||||
y = _mm_loadu_si128((__m128i*)b);
|
||||
|
||||
imagx = _mm_srli_si128 (x, 1);
|
||||
imagx = _mm_and_si128 (imagx, mult1);
|
||||
realx = _mm_and_si128 (x, mult1);
|
||||
|
||||
imagy = _mm_srli_si128 (y, 1);
|
||||
imagy = _mm_and_si128 (imagy, mult1);
|
||||
realy = _mm_and_si128 (y, mult1);
|
||||
|
||||
realx_mult_realy = _mm_mullo_epi16 (realx, realy);
|
||||
imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
|
||||
realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
|
||||
imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
|
||||
|
||||
realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
|
||||
realc = _mm_and_si128 (realc, mult1);
|
||||
imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
|
||||
imagc = _mm_and_si128 (imagc, mult1);
|
||||
imagc = _mm_slli_si128 (imagc, 1);
|
||||
|
||||
totalc = _mm_or_si128 (realc, imagc);
|
||||
|
||||
_mm_storeu_si128((__m128i*)c, totalc);
|
||||
|
||||
a += 8;
|
||||
b += 8;
|
||||
c += 8;
|
||||
}
|
||||
|
||||
for (unsigned int i = 0; i<(num_points % 8); ++i)
|
||||
{
|
||||
*c++ = (*a++) * (*b++);
|
||||
}
|
||||
{
|
||||
*c++ = (*a++) * (*b++);
|
||||
}
|
||||
}
|
||||
#endif /* LV_HAVE_SSE2 */
|
||||
|
||||
@ -109,54 +109,54 @@ static inline void volk_gnsssdr_8ic_x2_multiply_8ic_u_sse2(lv_8sc_t* cVector, co
|
||||
\param bVector One of the vectors to be multiplied
|
||||
\param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
|
||||
*/
|
||||
static inline void volk_gnsssdr_8ic_x2_multiply_8ic_u_sse4_1(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){
|
||||
|
||||
static inline void volk_gnsssdr_8ic_x2_multiply_8ic_u_sse4_1(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points)
|
||||
{
|
||||
const unsigned int sse_iters = num_points / 8;
|
||||
|
||||
|
||||
__m128i x, y, zero;
|
||||
__m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc;
|
||||
lv_8sc_t* c = cVector;
|
||||
const lv_8sc_t* a = aVector;
|
||||
const lv_8sc_t* b = bVector;
|
||||
|
||||
|
||||
zero = _mm_setzero_si128();
|
||||
mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
|
||||
|
||||
for(unsigned int number = 0;number < sse_iters; number++){
|
||||
|
||||
x = _mm_lddqu_si128((__m128i*)a);
|
||||
y = _mm_lddqu_si128((__m128i*)b);
|
||||
|
||||
imagx = _mm_srli_si128 (x, 1);
|
||||
imagx = _mm_and_si128 (imagx, mult1);
|
||||
realx = _mm_and_si128 (x, mult1);
|
||||
|
||||
imagy = _mm_srli_si128 (y, 1);
|
||||
imagy = _mm_and_si128 (imagy, mult1);
|
||||
realy = _mm_and_si128 (y, mult1);
|
||||
|
||||
realx_mult_realy = _mm_mullo_epi16 (realx, realy);
|
||||
imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
|
||||
realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
|
||||
imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
|
||||
|
||||
realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
|
||||
imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
|
||||
imagc = _mm_slli_si128 (imagc, 1);
|
||||
|
||||
totalc = _mm_blendv_epi8 (imagc, realc, mult1);
|
||||
|
||||
_mm_storeu_si128((__m128i*)c, totalc);
|
||||
|
||||
a += 8;
|
||||
b += 8;
|
||||
c += 8;
|
||||
}
|
||||
|
||||
|
||||
for(unsigned int number = 0;number < sse_iters; number++)
|
||||
{
|
||||
x = _mm_lddqu_si128((__m128i*)a);
|
||||
y = _mm_lddqu_si128((__m128i*)b);
|
||||
|
||||
imagx = _mm_srli_si128 (x, 1);
|
||||
imagx = _mm_and_si128 (imagx, mult1);
|
||||
realx = _mm_and_si128 (x, mult1);
|
||||
|
||||
imagy = _mm_srli_si128 (y, 1);
|
||||
imagy = _mm_and_si128 (imagy, mult1);
|
||||
realy = _mm_and_si128 (y, mult1);
|
||||
|
||||
realx_mult_realy = _mm_mullo_epi16 (realx, realy);
|
||||
imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
|
||||
realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
|
||||
imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
|
||||
|
||||
realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
|
||||
imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
|
||||
imagc = _mm_slli_si128 (imagc, 1);
|
||||
|
||||
totalc = _mm_blendv_epi8 (imagc, realc, mult1);
|
||||
|
||||
_mm_storeu_si128((__m128i*)c, totalc);
|
||||
|
||||
a += 8;
|
||||
b += 8;
|
||||
c += 8;
|
||||
}
|
||||
|
||||
for (unsigned int i = 0; i<(num_points % 8); ++i)
|
||||
{
|
||||
*c++ = (*a++) * (*b++);
|
||||
}
|
||||
{
|
||||
*c++ = (*a++) * (*b++);
|
||||
}
|
||||
}
|
||||
#endif /* LV_HAVE_SSE4_1 */
|
||||

@ -168,14 +168,16 @@ static inline void volk_gnsssdr_8ic_x2_multiply_8ic_u_sse4_1(lv_8sc_t* cVector,
 \param bVector One of the vectors to be multiplied
 \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
 */
static inline void volk_gnsssdr_8ic_x2_multiply_8ic_generic(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){
static inline void volk_gnsssdr_8ic_x2_multiply_8ic_generic(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points)
{
    lv_8sc_t* cPtr = cVector;
    const lv_8sc_t* aPtr = aVector;
    const lv_8sc_t* bPtr = bVector;

    for(unsigned int number = 0; number < num_points; number++){
        *cPtr++ = (*aPtr++) * (*bPtr++);
    }

    for(unsigned int number = 0; number < num_points; number++)
        {
            *cPtr++ = (*aPtr++) * (*bPtr++);
        }
}
#endif /* LV_HAVE_GENERIC */

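Editor's note: the SSE2 and SSE4.1 multiply kernels in this file differ only in how the separated real and imaginary product bytes are merged back into interleaved order. A minimal sketch of the two recombination idioms, assuming SSE4.1 is available; the helper names are illustrative only:

#include <smmintrin.h>   /* SSE4.1: _mm_blendv_epi8 */

/* mult1 is the 0x00FF... byte mask used throughout these kernels. */
static inline __m128i merge_real_imag_sse2(__m128i realc, __m128i imagc, __m128i mult1)
{
    realc = _mm_and_si128(realc, mult1);   /* keep the low byte of every real product      */
    imagc = _mm_and_si128(imagc, mult1);   /* keep the low byte of every imaginary product */
    imagc = _mm_slli_si128(imagc, 1);      /* move imaginary bytes to the odd positions    */
    return _mm_or_si128(realc, imagc);     /* interleave: re, im, re, im, ...              */
}

static inline __m128i merge_real_imag_sse4_1(__m128i realc, __m128i imagc, __m128i mult1)
{
    imagc = _mm_slli_si128(imagc, 1);              /* move imaginary bytes to the odd positions */
    return _mm_blendv_epi8(imagc, realc, mult1);   /* take real bytes where the mask is 0xFF    */
}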
@ -198,54 +200,54 @@ static inline void volk_gnsssdr_8ic_x2_multiply_8ic_generic(lv_8sc_t* cVector, c
|
||||
\param bVector One of the vectors to be multiplied
|
||||
\param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
|
||||
*/
|
||||
static inline void volk_gnsssdr_8ic_x2_multiply_8ic_a_sse2(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){
|
||||
|
||||
static inline void volk_gnsssdr_8ic_x2_multiply_8ic_a_sse2(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points)
|
||||
{
|
||||
const unsigned int sse_iters = num_points / 8;
|
||||
|
||||
|
||||
__m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc;
|
||||
lv_8sc_t* c = cVector;
|
||||
const lv_8sc_t* a = aVector;
|
||||
const lv_8sc_t* b = bVector;
|
||||
|
||||
|
||||
mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
|
||||
|
||||
for(unsigned int number = 0;number < sse_iters; number++){
|
||||
|
||||
x = _mm_load_si128((__m128i*)a);
|
||||
y = _mm_load_si128((__m128i*)b);
|
||||
|
||||
imagx = _mm_srli_si128 (x, 1);
|
||||
imagx = _mm_and_si128 (imagx, mult1);
|
||||
realx = _mm_and_si128 (x, mult1);
|
||||
|
||||
imagy = _mm_srli_si128 (y, 1);
|
||||
imagy = _mm_and_si128 (imagy, mult1);
|
||||
realy = _mm_and_si128 (y, mult1);
|
||||
|
||||
realx_mult_realy = _mm_mullo_epi16 (realx, realy);
|
||||
imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
|
||||
realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
|
||||
imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
|
||||
|
||||
realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
|
||||
realc = _mm_and_si128 (realc, mult1);
|
||||
imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
|
||||
imagc = _mm_and_si128 (imagc, mult1);
|
||||
imagc = _mm_slli_si128 (imagc, 1);
|
||||
|
||||
totalc = _mm_or_si128 (realc, imagc);
|
||||
|
||||
_mm_store_si128((__m128i*)c, totalc);
|
||||
|
||||
a += 8;
|
||||
b += 8;
|
||||
c += 8;
|
||||
}
|
||||
|
||||
|
||||
for(unsigned int number = 0;number < sse_iters; number++)
|
||||
{
|
||||
x = _mm_load_si128((__m128i*)a);
|
||||
y = _mm_load_si128((__m128i*)b);
|
||||
|
||||
imagx = _mm_srli_si128 (x, 1);
|
||||
imagx = _mm_and_si128 (imagx, mult1);
|
||||
realx = _mm_and_si128 (x, mult1);
|
||||
|
||||
imagy = _mm_srli_si128 (y, 1);
|
||||
imagy = _mm_and_si128 (imagy, mult1);
|
||||
realy = _mm_and_si128 (y, mult1);
|
||||
|
||||
realx_mult_realy = _mm_mullo_epi16 (realx, realy);
|
||||
imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
|
||||
realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
|
||||
imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
|
||||
|
||||
realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
|
||||
realc = _mm_and_si128 (realc, mult1);
|
||||
imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
|
||||
imagc = _mm_and_si128 (imagc, mult1);
|
||||
imagc = _mm_slli_si128 (imagc, 1);
|
||||
|
||||
totalc = _mm_or_si128 (realc, imagc);
|
||||
|
||||
_mm_store_si128((__m128i*)c, totalc);
|
||||
|
||||
a += 8;
|
||||
b += 8;
|
||||
c += 8;
|
||||
}
|
||||
|
||||
for (unsigned int i = 0; i<(num_points % 8); ++i)
|
||||
{
|
||||
*c++ = (*a++) * (*b++);
|
||||
}
|
||||
{
|
||||
*c++ = (*a++) * (*b++);
|
||||
}
|
||||
}
|
||||
#endif /* LV_HAVE_SSE2 */
|
||||
|
||||
@ -258,54 +260,54 @@ static inline void volk_gnsssdr_8ic_x2_multiply_8ic_a_sse2(lv_8sc_t* cVector, co
|
||||
\param bVector One of the vectors to be multiplied
|
||||
\param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
|
||||
*/
|
||||
static inline void volk_gnsssdr_8ic_x2_multiply_8ic_a_sse4_1(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){
|
||||
|
||||
static inline void volk_gnsssdr_8ic_x2_multiply_8ic_a_sse4_1(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points)
|
||||
{
|
||||
const unsigned int sse_iters = num_points / 8;
|
||||
|
||||
|
||||
__m128i x, y, zero;
|
||||
__m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc;
|
||||
lv_8sc_t* c = cVector;
|
||||
const lv_8sc_t* a = aVector;
|
||||
const lv_8sc_t* b = bVector;
|
||||
|
||||
|
||||
zero = _mm_setzero_si128();
|
||||
mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
|
||||
|
||||
for(unsigned int number = 0;number < sse_iters; number++){
|
||||
|
||||
x = _mm_load_si128((__m128i*)a);
|
||||
y = _mm_load_si128((__m128i*)b);
|
||||
|
||||
imagx = _mm_srli_si128 (x, 1);
|
||||
imagx = _mm_and_si128 (imagx, mult1);
|
||||
realx = _mm_and_si128 (x, mult1);
|
||||
|
||||
imagy = _mm_srli_si128 (y, 1);
|
||||
imagy = _mm_and_si128 (imagy, mult1);
|
||||
realy = _mm_and_si128 (y, mult1);
|
||||
|
||||
realx_mult_realy = _mm_mullo_epi16 (realx, realy);
|
||||
imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
|
||||
realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
|
||||
imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
|
||||
|
||||
realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
|
||||
imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
|
||||
imagc = _mm_slli_si128 (imagc, 1);
|
||||
|
||||
totalc = _mm_blendv_epi8 (imagc, realc, mult1);
|
||||
|
||||
_mm_store_si128((__m128i*)c, totalc);
|
||||
|
||||
a += 8;
|
||||
b += 8;
|
||||
c += 8;
|
||||
}
|
||||
|
||||
|
||||
for(unsigned int number = 0;number < sse_iters; number++)
|
||||
{
|
||||
x = _mm_load_si128((__m128i*)a);
|
||||
y = _mm_load_si128((__m128i*)b);
|
||||
|
||||
imagx = _mm_srli_si128 (x, 1);
|
||||
imagx = _mm_and_si128 (imagx, mult1);
|
||||
realx = _mm_and_si128 (x, mult1);
|
||||
|
||||
imagy = _mm_srli_si128 (y, 1);
|
||||
imagy = _mm_and_si128 (imagy, mult1);
|
||||
realy = _mm_and_si128 (y, mult1);
|
||||
|
||||
realx_mult_realy = _mm_mullo_epi16 (realx, realy);
|
||||
imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
|
||||
realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
|
||||
imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
|
||||
|
||||
realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
|
||||
imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
|
||||
imagc = _mm_slli_si128 (imagc, 1);
|
||||
|
||||
totalc = _mm_blendv_epi8 (imagc, realc, mult1);
|
||||
|
||||
_mm_store_si128((__m128i*)c, totalc);
|
||||
|
||||
a += 8;
|
||||
b += 8;
|
||||
c += 8;
|
||||
}
|
||||
|
||||
for (unsigned int i = 0; i<(num_points % 8); ++i)
|
||||
{
|
||||
*c++ = (*a++) * (*b++);
|
||||
}
|
||||
{
|
||||
*c++ = (*a++) * (*b++);
|
||||
}
|
||||
}
|
||||
#endif /* LV_HAVE_SSE4_1 */

@ -317,15 +319,17 @@ static inline void volk_gnsssdr_8ic_x2_multiply_8ic_a_sse4_1(lv_8sc_t* cVector,
 \param bVector One of the vectors to be multiplied
 \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
 */
static inline void volk_gnsssdr_8ic_x2_multiply_8ic_a_generic(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){
static inline void volk_gnsssdr_8ic_x2_multiply_8ic_a_generic(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points)
{
    lv_8sc_t* cPtr = cVector;
    const lv_8sc_t* aPtr = aVector;
    const lv_8sc_t* bPtr = bVector;

    for(unsigned int number = 0; number < num_points; number++){
        *cPtr++ = (*aPtr++) * (*bPtr++);
    }

    for(unsigned int number = 0; number < num_points; number++)
        {
            *cPtr++ = (*aPtr++) * (*bPtr++);
        }

}
#endif /* LV_HAVE_GENERIC */

@ -338,7 +342,8 @@ static inline void volk_gnsssdr_8ic_x2_multiply_8ic_a_generic(lv_8sc_t* cVector,
 \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
 */
extern void volk_gnsssdr_8ic_x2_multiply_8ic_a_orc_impl(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points);
static inline void volk_gnsssdr_8ic_x2_multiply_8ic_u_orc(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){
static inline void volk_gnsssdr_8ic_x2_multiply_8ic_u_orc(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points)
{
    volk_gnsssdr_8ic_x2_multiply_8ic_a_orc_impl(cVector, aVector, bVector, num_points);
}
#endif /* LV_HAVE_ORC */

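A minimal, hedged usage sketch for the element-wise multiply kernels collected above; the include path and buffer sizes are illustrative assumptions:

#include <volk_gnsssdr/volk_gnsssdr.h>   /* assumed top-level header of the volk_gnsssdr module */

static void example_vector_multiply(void)
{
    lv_8sc_t a[32], b[32], c[32];
    for (unsigned int i = 0; i < 32; i++)
        {
            a[i] = lv_cmake((char)i, (char)1);
            b[i] = lv_cmake((char)2, (char)(-1));
        }
    volk_gnsssdr_8ic_x2_multiply_8ic_generic(c, a, b, 32);   /* c[i] = a[i] * b[i], wrapping at 8 bits */
}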
@ -2,7 +2,7 @@
* \file volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3.h
* \brief Volk protokernel: performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation with 16 bits vectors, and accumulates the results into float32.
* \authors <ul>
* <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
* <li> Andres Cecilia, 2014. a.cecilia.luque(at)gmail.com
* </ul>
*
* Volk protokernel that performs the carrier wipe-off mixing and the
@ -31,7 +31,7 @@
* GNSS-SDR is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* at your option) any later version.
* (at your option) any later version.
*
* GNSS-SDR is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
@ -57,6 +57,7 @@
#include <smmintrin.h>
#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h"
#include "CommonMacros/CommonMacros.h"

/*!
 \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation
 \param input The input signal input
@ -72,101 +73,101 @@
|
||||
static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_u_sse4_1(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points)
|
||||
{
|
||||
const unsigned int sse_iters = num_points / 8;
|
||||
|
||||
|
||||
__m128i x, y, real_bb_signal_sample, imag_bb_signal_sample;
|
||||
__m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output;
|
||||
|
||||
|
||||
__m128 E_code_acc, P_code_acc, L_code_acc;
|
||||
__m128i input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2;
|
||||
__m128 output_ps;
|
||||
|
||||
|
||||
const lv_8sc_t* input_ptr = input;
|
||||
const lv_8sc_t* carrier_ptr = carrier;
|
||||
|
||||
|
||||
const lv_8sc_t* E_code_ptr = E_code;
|
||||
lv_32fc_t* E_out_ptr = E_out;
|
||||
const lv_8sc_t* L_code_ptr = L_code;
|
||||
lv_32fc_t* L_out_ptr = L_out;
|
||||
const lv_8sc_t* P_code_ptr = P_code;
|
||||
lv_32fc_t* P_out_ptr = P_out;
|
||||
|
||||
|
||||
*E_out_ptr = 0;
|
||||
*P_out_ptr = 0;
|
||||
*L_out_ptr = 0;
|
||||
|
||||
|
||||
mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
|
||||
|
||||
|
||||
E_code_acc = _mm_setzero_ps();
|
||||
L_code_acc = _mm_setzero_ps();
|
||||
P_code_acc = _mm_setzero_ps();
|
||||
|
||||
|
||||
if (sse_iters>0)
|
||||
{
|
||||
for(int number = 0;number < sse_iters; number++){
|
||||
|
||||
//Perform the carrier wipe-off
|
||||
x = _mm_lddqu_si128((__m128i*)input_ptr);
|
||||
y = _mm_lddqu_si128((__m128i*)carrier_ptr);
|
||||
|
||||
CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx)
|
||||
CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy)
|
||||
|
||||
CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
|
||||
|
||||
//Get early values
|
||||
y = _mm_lddqu_si128((__m128i*)E_code_ptr);
|
||||
|
||||
            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)

            E_code_acc = _mm_add_ps (E_code_acc, output_ps);

            //Get prompt values
            y = _mm_lddqu_si128((__m128i*)P_code_ptr);

            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)

            P_code_acc = _mm_add_ps (P_code_acc, output_ps);

            //Get late values
            y = _mm_lddqu_si128((__m128i*)L_code_ptr);

            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)

            L_code_acc = _mm_add_ps (L_code_acc, output_ps);

            input_ptr += 8;
            carrier_ptr += 8;
            E_code_ptr += 8;
            P_code_ptr += 8;
            L_code_ptr += 8;
        }

        __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2];
        __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2];
        __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2];

        _mm_storeu_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector
        _mm_storeu_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector
        _mm_storeu_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector

        for (int i = 0; i<2; ++i)
        {
            *E_out_ptr += E_dotProductVector[i];
            *P_out_ptr += P_dotProductVector[i];
            *L_out_ptr += L_dotProductVector[i];
        }
    }

    lv_8sc_t bb_signal_sample;
    for(int i=0; i < num_points%8; ++i)
    {
        //Perform the carrier wipe-off
        bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
        // Now get early, late, and prompt values for each
        *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
        *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
        *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
    }
}
#endif /* LV_HAVE_SSE4_1 */
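/* Editor's sketch (not part of the original header): the reduction used just above stores a
 * __m128 accumulator holding two interleaved complex lanes and sums the lanes in scalar code.
 * The helper name reduce_epl_acc is illustrative, and float complex stands in for lv_32fc_t,
 * which in VOLK is a single-precision complex type.
 */
#include <complex.h>
#include <xmmintrin.h>

static inline float complex reduce_epl_acc(__m128 acc)
{
    float lanes[4];
    _mm_storeu_ps(lanes, acc);                      /* lanes = { re0, im0, re1, im1 } */
    return (lanes[0] + lanes[2]) + (lanes[1] + lanes[3]) * I;  /* add the two complex lanes */
}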
@ -189,104 +190,104 @@ static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_u_sse4_1(lv_32fc_t* E
static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_u_sse2(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points)
{
    const unsigned int sse_iters = num_points / 8;

    __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample;
    __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output;

    __m128 E_code_acc, P_code_acc, L_code_acc;
    __m128i input_i_1, input_i_2, output_i32;
    __m128 output_ps_1, output_ps_2;

    const lv_8sc_t* input_ptr = input;
    const lv_8sc_t* carrier_ptr = carrier;

    const lv_8sc_t* E_code_ptr = E_code;
    lv_32fc_t* E_out_ptr = E_out;
    const lv_8sc_t* L_code_ptr = L_code;
    lv_32fc_t* L_out_ptr = L_out;
    const lv_8sc_t* P_code_ptr = P_code;
    lv_32fc_t* P_out_ptr = P_out;

    *E_out_ptr = 0;
    *P_out_ptr = 0;
    *L_out_ptr = 0;

    mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);

    E_code_acc = _mm_setzero_ps();
    L_code_acc = _mm_setzero_ps();
    P_code_acc = _mm_setzero_ps();

    if (sse_iters>0)
    {
        for(unsigned int number = 0;number < sse_iters; number++)
        {
            //Perform the carrier wipe-off
            x = _mm_loadu_si128((__m128i*)input_ptr);
            y = _mm_loadu_si128((__m128i*)carrier_ptr);

            CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx)
            CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy)

            CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)

            //Get early values
            y = _mm_loadu_si128((__m128i*)E_code_ptr);

            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)

            E_code_acc = _mm_add_ps (E_code_acc, output_ps_1);
            E_code_acc = _mm_add_ps (E_code_acc, output_ps_2);

            //Get prompt values
            y = _mm_loadu_si128((__m128i*)P_code_ptr);

            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)

            P_code_acc = _mm_add_ps (P_code_acc, output_ps_1);
            P_code_acc = _mm_add_ps (P_code_acc, output_ps_2);

            //Get late values
            y = _mm_loadu_si128((__m128i*)L_code_ptr);

            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)

            L_code_acc = _mm_add_ps (L_code_acc, output_ps_1);
            L_code_acc = _mm_add_ps (L_code_acc, output_ps_2);

            input_ptr += 8;
            carrier_ptr += 8;
            E_code_ptr += 8;
            P_code_ptr += 8;
            L_code_ptr += 8;
        }

        __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2];
        __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2];
        __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2];

        _mm_storeu_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector
        _mm_storeu_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector
        _mm_storeu_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector

        for (unsigned int i = 0; i<2; ++i)
        {
            *E_out_ptr += E_dotProductVector[i];
            *P_out_ptr += P_dotProductVector[i];
            *L_out_ptr += L_dotProductVector[i];
        }
    }

    lv_8sc_t bb_signal_sample;
    for(unsigned int i=0; i < num_points%8; ++i)
    {
        //Perform the carrier wipe-off
        bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
        // Now get early, late, and prompt values for each
        *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
        *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
        *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
    }
}
#endif /* LV_HAVE_SSE2 */
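/* Editor's sketch (not from the original file): the SSE2 correlation macro above yields two
 * float accumulators (output_ps_1 / output_ps_2) because eight 16-bit partial products cannot
 * be converted to float inside a single __m128. Assuming the CommonMacros implementation
 * follows the usual SSE2 pattern, the sign-extension and conversion step looks like this;
 * the helper name int16x8_to_two_ps is illustrative.
 */
#include <emmintrin.h>

static inline void int16x8_to_two_ps(__m128i v, __m128* lo_ps, __m128* hi_ps)
{
    __m128i lo = _mm_srai_epi32(_mm_unpacklo_epi16(v, v), 16); /* sign-extend lanes 0..3 to int32 */
    __m128i hi = _mm_srai_epi32(_mm_unpackhi_epi16(v, v), 16); /* sign-extend lanes 4..7 to int32 */
    *lo_ps = _mm_cvtepi32_ps(lo);
    *hi_ps = _mm_cvtepi32_ps(hi);
}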
@ -306,22 +307,22 @@ static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_u_sse2(lv_32fc_t* E_o
static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_generic(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points)
{
    lv_8sc_t bb_signal_sample;

    bb_signal_sample = lv_cmake(0, 0);

    *E_out = 0;
    *P_out = 0;
    *L_out = 0;
    // perform Early, Prompt and Late correlation
    for(unsigned int i=0; i < num_points; ++i)
    {
        //Perform the carrier wipe-off
        bb_signal_sample = input[i] * carrier[i];
        // Now get early, late, and prompt values for each
        *E_out += (lv_32fc_t) (bb_signal_sample * E_code[i]);
        *P_out += (lv_32fc_t) (bb_signal_sample * P_code[i]);
        *L_out += (lv_32fc_t) (bb_signal_sample * L_code[i]);
    }
}

#endif /* LV_HAVE_GENERIC */
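/* Editor's sketch (not part of the original header): how a caller might drive the generic
 * kernel above. Buffer contents and sizes are arbitrary examples, and the include path for
 * lv_8sc_t / lv_32fc_t / lv_cmake is an assumption.
 */
#include "volk_gnsssdr/volk_gnsssdr_complex.h"   /* assumed location of the lv_* complex types */

static void example_epl_generic(void)
{
    enum { N = 16 };
    lv_8sc_t input[N], carrier[N], e_code[N], p_code[N], l_code[N];
    for (unsigned int i = 0; i < N; ++i)
    {
        input[i]   = lv_cmake(1, 1);    /* small values, well inside the int8 range */
        carrier[i] = lv_cmake(1, -1);
        e_code[i]  = lv_cmake(1, 0);
        p_code[i]  = lv_cmake(1, 0);
        l_code[i]  = lv_cmake(1, 0);
    }
    lv_32fc_t E, P, L;
    volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_generic(&E, &P, &L, input, carrier, e_code, p_code, l_code, N);
}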
@ -357,101 +358,101 @@ static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_generic(lv_32fc_t* E_
|
||||
static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_a_sse4_1(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points)
|
||||
{
|
||||
const unsigned int sse_iters = num_points / 8;
|
||||
|
||||
|
||||
__m128i x, y, real_bb_signal_sample, imag_bb_signal_sample;
|
||||
__m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output;
|
||||
|
||||
|
||||
__m128 E_code_acc, P_code_acc, L_code_acc;
|
||||
__m128i input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2;
|
||||
__m128 output_ps;
|
||||
|
||||
|
||||
const lv_8sc_t* input_ptr = input;
|
||||
const lv_8sc_t* carrier_ptr = carrier;
|
||||
|
||||
|
||||
const lv_8sc_t* E_code_ptr = E_code;
|
||||
lv_32fc_t* E_out_ptr = E_out;
|
||||
const lv_8sc_t* L_code_ptr = L_code;
|
||||
lv_32fc_t* L_out_ptr = L_out;
|
||||
const lv_8sc_t* P_code_ptr = P_code;
|
||||
lv_32fc_t* P_out_ptr = P_out;
|
||||
|
||||
|
||||
*E_out_ptr = 0;
|
||||
*P_out_ptr = 0;
|
||||
*L_out_ptr = 0;
|
||||
|
||||
|
||||
mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
|
||||
|
||||
|
||||
E_code_acc = _mm_setzero_ps();
|
||||
L_code_acc = _mm_setzero_ps();
|
||||
P_code_acc = _mm_setzero_ps();
|
||||
|
||||
|
||||
if (sse_iters>0)
|
||||
{
|
||||
for(int number = 0;number < sse_iters; number++){
|
||||
|
||||
//Perform the carrier wipe-off
|
||||
x = _mm_load_si128((__m128i*)input_ptr);
|
||||
y = _mm_load_si128((__m128i*)carrier_ptr);
|
||||
|
||||
CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx)
|
||||
CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy)
|
||||
|
||||
CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
|
||||
|
||||
//Get early values
|
||||
y = _mm_load_si128((__m128i*)E_code_ptr);
|
||||
|
||||
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
|
||||
|
||||
E_code_acc = _mm_add_ps (E_code_acc, output_ps);
|
||||
|
||||
//Get prompt values
|
||||
y = _mm_load_si128((__m128i*)P_code_ptr);
|
||||
|
||||
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
|
||||
|
||||
P_code_acc = _mm_add_ps (P_code_acc, output_ps);
|
||||
|
||||
//Get late values
|
||||
y = _mm_load_si128((__m128i*)L_code_ptr);
|
||||
|
||||
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
|
||||
|
||||
L_code_acc = _mm_add_ps (L_code_acc, output_ps);
|
||||
|
||||
input_ptr += 8;
|
||||
carrier_ptr += 8;
|
||||
E_code_ptr += 8;
|
||||
P_code_ptr += 8;
|
||||
L_code_ptr += 8;
|
||||
}
|
||||
|
||||
__VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2];
|
||||
__VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2];
|
||||
__VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2];
|
||||
|
||||
_mm_store_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector
|
||||
_mm_store_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector
|
||||
_mm_store_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector
|
||||
|
||||
for (int i = 0; i<2; ++i)
|
||||
{
|
||||
*E_out_ptr += E_dotProductVector[i];
|
||||
*P_out_ptr += P_dotProductVector[i];
|
||||
*L_out_ptr += L_dotProductVector[i];
|
||||
for(int number = 0;number < sse_iters; number++)
|
||||
{
|
||||
//Perform the carrier wipe-off
|
||||
x = _mm_load_si128((__m128i*)input_ptr);
|
||||
y = _mm_load_si128((__m128i*)carrier_ptr);
|
||||
|
||||
CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx)
|
||||
CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy)
|
||||
|
||||
CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
|
||||
|
||||
//Get early values
|
||||
y = _mm_load_si128((__m128i*)E_code_ptr);
|
||||
|
||||
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
|
||||
|
||||
E_code_acc = _mm_add_ps (E_code_acc, output_ps);
|
||||
|
||||
//Get prompt values
|
||||
y = _mm_load_si128((__m128i*)P_code_ptr);
|
||||
|
||||
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
|
||||
|
||||
P_code_acc = _mm_add_ps (P_code_acc, output_ps);
|
||||
|
||||
//Get late values
|
||||
y = _mm_load_si128((__m128i*)L_code_ptr);
|
||||
|
||||
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
|
||||
|
||||
L_code_acc = _mm_add_ps (L_code_acc, output_ps);
|
||||
|
||||
input_ptr += 8;
|
||||
carrier_ptr += 8;
|
||||
E_code_ptr += 8;
|
||||
P_code_ptr += 8;
|
||||
L_code_ptr += 8;
|
||||
}
|
||||
|
||||
__VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2];
|
||||
__VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2];
|
||||
__VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2];
|
||||
|
||||
_mm_store_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector
|
||||
_mm_store_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector
|
||||
_mm_store_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector
|
||||
|
||||
for (int i = 0; i<2; ++i)
|
||||
{
|
||||
*E_out_ptr += E_dotProductVector[i];
|
||||
*P_out_ptr += P_dotProductVector[i];
|
||||
*L_out_ptr += L_dotProductVector[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
lv_8sc_t bb_signal_sample;
|
||||
for(int i=0; i < num_points%8; ++i)
|
||||
{
|
||||
//Perform the carrier wipe-off
|
||||
bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
|
||||
// Now get early, late, and prompt values for each
|
||||
*E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
|
||||
*P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
|
||||
*L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
|
||||
}
|
||||
{
|
||||
//Perform the carrier wipe-off
|
||||
bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
|
||||
// Now get early, late, and prompt values for each
|
||||
*E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
|
||||
*P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
|
||||
*L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
|
||||
}
|
||||
}
|
||||
#endif /* LV_HAVE_SSE4_1 */
|
||||
|
||||
@ -474,104 +475,104 @@ static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_a_sse4_1(lv_32fc_t* E
|
||||
static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_a_sse2(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points)
|
||||
{
|
||||
const unsigned int sse_iters = num_points / 8;
|
||||
|
||||
|
||||
__m128i x, y, real_bb_signal_sample, imag_bb_signal_sample;
|
||||
__m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output;
|
||||
|
||||
|
||||
__m128 E_code_acc, P_code_acc, L_code_acc;
|
||||
__m128i input_i_1, input_i_2, output_i32;
|
||||
__m128 output_ps_1, output_ps_2;
|
||||
|
||||
|
||||
const lv_8sc_t* input_ptr = input;
|
||||
const lv_8sc_t* carrier_ptr = carrier;
|
||||
|
||||
|
||||
const lv_8sc_t* E_code_ptr = E_code;
|
||||
lv_32fc_t* E_out_ptr = E_out;
|
||||
const lv_8sc_t* L_code_ptr = L_code;
|
||||
lv_32fc_t* L_out_ptr = L_out;
|
||||
const lv_8sc_t* P_code_ptr = P_code;
|
||||
lv_32fc_t* P_out_ptr = P_out;
|
||||
|
||||
|
||||
*E_out_ptr = 0;
|
||||
*P_out_ptr = 0;
|
||||
*L_out_ptr = 0;
|
||||
|
||||
|
||||
mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
|
||||
|
||||
|
||||
E_code_acc = _mm_setzero_ps();
|
||||
L_code_acc = _mm_setzero_ps();
|
||||
P_code_acc = _mm_setzero_ps();
|
||||
|
||||
|
||||
if (sse_iters>0)
|
||||
{
|
||||
for(unsigned int number = 0;number < sse_iters; number++){
|
||||
|
||||
//Perform the carrier wipe-off
|
||||
x = _mm_load_si128((__m128i*)input_ptr);
|
||||
y = _mm_load_si128((__m128i*)carrier_ptr);
|
||||
|
||||
CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx)
|
||||
CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy)
|
||||
|
||||
CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
|
||||
|
||||
//Get early values
|
||||
y = _mm_load_si128((__m128i*)E_code_ptr);
|
||||
|
||||
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
|
||||
|
||||
E_code_acc = _mm_add_ps (E_code_acc, output_ps_1);
|
||||
E_code_acc = _mm_add_ps (E_code_acc, output_ps_2);
|
||||
|
||||
//Get prompt values
|
||||
y = _mm_load_si128((__m128i*)P_code_ptr);
|
||||
|
||||
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
|
||||
|
||||
P_code_acc = _mm_add_ps (P_code_acc, output_ps_1);
|
||||
P_code_acc = _mm_add_ps (P_code_acc, output_ps_2);
|
||||
|
||||
//Get late values
|
||||
y = _mm_load_si128((__m128i*)L_code_ptr);
|
||||
|
||||
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
|
||||
|
||||
L_code_acc = _mm_add_ps (L_code_acc, output_ps_1);
|
||||
L_code_acc = _mm_add_ps (L_code_acc, output_ps_2);
|
||||
|
||||
input_ptr += 8;
|
||||
carrier_ptr += 8;
|
||||
E_code_ptr += 8;
|
||||
P_code_ptr += 8;
|
||||
L_code_ptr += 8;
|
||||
}
|
||||
|
||||
__VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2];
|
||||
__VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2];
|
||||
__VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2];
|
||||
|
||||
_mm_store_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector
|
||||
_mm_store_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector
|
||||
_mm_store_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector
|
||||
|
||||
for (unsigned int i = 0; i<2; ++i)
|
||||
{
|
||||
*E_out_ptr += E_dotProductVector[i];
|
||||
*P_out_ptr += P_dotProductVector[i];
|
||||
*L_out_ptr += L_dotProductVector[i];
|
||||
for(unsigned int number = 0;number < sse_iters; number++)
|
||||
{
|
||||
//Perform the carrier wipe-off
|
||||
x = _mm_load_si128((__m128i*)input_ptr);
|
||||
y = _mm_load_si128((__m128i*)carrier_ptr);
|
||||
|
||||
CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx)
|
||||
CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy)
|
||||
|
||||
CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
|
||||
|
||||
//Get early values
|
||||
y = _mm_load_si128((__m128i*)E_code_ptr);
|
||||
|
||||
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
|
||||
|
||||
E_code_acc = _mm_add_ps (E_code_acc, output_ps_1);
|
||||
E_code_acc = _mm_add_ps (E_code_acc, output_ps_2);
|
||||
|
||||
//Get prompt values
|
||||
y = _mm_load_si128((__m128i*)P_code_ptr);
|
||||
|
||||
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
|
||||
|
||||
P_code_acc = _mm_add_ps (P_code_acc, output_ps_1);
|
||||
P_code_acc = _mm_add_ps (P_code_acc, output_ps_2);
|
||||
|
||||
//Get late values
|
||||
y = _mm_load_si128((__m128i*)L_code_ptr);
|
||||
|
||||
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
|
||||
|
||||
L_code_acc = _mm_add_ps (L_code_acc, output_ps_1);
|
||||
L_code_acc = _mm_add_ps (L_code_acc, output_ps_2);
|
||||
|
||||
input_ptr += 8;
|
||||
carrier_ptr += 8;
|
||||
E_code_ptr += 8;
|
||||
P_code_ptr += 8;
|
||||
L_code_ptr += 8;
|
||||
}
|
||||
|
||||
__VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2];
|
||||
__VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2];
|
||||
__VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2];
|
||||
|
||||
_mm_store_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector
|
||||
_mm_store_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector
|
||||
_mm_store_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector
|
||||
|
||||
for (unsigned int i = 0; i<2; ++i)
|
||||
{
|
||||
*E_out_ptr += E_dotProductVector[i];
|
||||
*P_out_ptr += P_dotProductVector[i];
|
||||
*L_out_ptr += L_dotProductVector[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
lv_8sc_t bb_signal_sample;
|
||||
for(unsigned int i=0; i < num_points%8; ++i)
|
||||
{
|
||||
//Perform the carrier wipe-off
|
||||
bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
|
||||
// Now get early, late, and prompt values for each
|
||||
*E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
|
||||
*P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
|
||||
*L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
|
||||
}
|
||||
{
|
||||
//Perform the carrier wipe-off
|
||||
bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
|
||||
// Now get early, late, and prompt values for each
|
||||
*E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
|
||||
*P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
|
||||
*L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
|
||||
}
|
||||
}
|
||||
#endif /* LV_HAVE_SSE2 */
|
||||
|
||||
@ -591,22 +592,22 @@ static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_a_sse2(lv_32fc_t* E_o
static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_a_generic(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points)
{
    lv_8sc_t bb_signal_sample;

    bb_signal_sample = lv_cmake(0, 0);

    *E_out = 0;
    *P_out = 0;
    *L_out = 0;
    // perform Early, Prompt and Late correlation
    for(unsigned int i=0; i < num_points; ++i)
    {
        //Perform the carrier wipe-off
        bb_signal_sample = input[i] * carrier[i];
        // Now get early, late, and prompt values for each
        *E_out += (lv_32fc_t) (bb_signal_sample * E_code[i]);
        *P_out += (lv_32fc_t) (bb_signal_sample * P_code[i]);
        *L_out += (lv_32fc_t) (bb_signal_sample * L_code[i]);
    }
}

#endif /* LV_HAVE_GENERIC */
@ -2,7 +2,7 @@
 * \file volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5.h
 * \brief Volk protokernel: performs the carrier wipe-off mixing and the Very early, Early, Prompt, Late and very late correlation with 16 bits vectors, and accumulates the results into float32. This protokernel is called "unsafe" because it does NOT check when the inputs have a -128 value. If you introduce a -128 value the protokernel will NOT operate properly (generic implementation will have different results than volk implementation). In order to avoid overflow, "input" and "carrier" must be values between -7 and 7 and "XX_code inputs" must be values between -127 and 127.
 * \authors <ul>
 *          <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
 *          <li> Andres Cecilia, 2014. a.cecilia.luque(at)gmail.com
 *          </ul>
 *
 * Volk protokernel that performs the carrier wipe-off mixing and the
@ -47,7 +47,7 @@
 * GNSS-SDR is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * at your option) any later version.
 * (at your option) any later version.
 *
 * GNSS-SDR is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
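/* Editor's sketch (not part of the original header): a caller-side guard for the "unsafe"
 * kernels described in the \brief above. It rejects -128 samples and enforces the documented
 * ranges (input/carrier components within +-7, code components within +-127), treating each
 * lv_8sc_t buffer as interleaved signed char components. Function name and placement are
 * hypothetical.
 */
#include <stdbool.h>
#include <stdlib.h>

static bool vepl_unsafe_inputs_ok(const signed char* iq, unsigned int n_components, int max_abs)
{
    for (unsigned int i = 0; i < n_components; ++i)
    {
        if (iq[i] == -128 || abs(iq[i]) > max_abs) return false;   /* violates the kernel's preconditions */
    }
    return true;
}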
@ -92,7 +92,7 @@
static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_u_sse4_1(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
{
    const unsigned int sse_iters = num_points / 8;

    __m128i x, x_abs, y, y_aux, bb_signal_sample_aux, bb_signal_sample_aux_abs;;
    __m128i real_output, imag_output;
    __m128 real_VE_code_acc, imag_VE_code_acc, real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc, real_VL_code_acc, imag_VL_code_acc;
@ -102,10 +102,10 @@ static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_u_sse4_1(lv_3
    __m128i check_sign_sequence = _mm_set_epi8 (255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1);
    __m128i rearrange_sequence = _mm_set_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
    __m128i mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);

    const lv_8sc_t* input_ptr = input;
    const lv_8sc_t* carrier_ptr = carrier;

    const lv_8sc_t* VE_code_ptr = VE_code;
    lv_32fc_t* VE_out_ptr = VE_out;
    const lv_8sc_t* E_code_ptr = E_code;
@ -116,7 +116,7 @@ static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_u_sse4_1(lv_3
    lv_32fc_t* L_out_ptr = L_out;
    const lv_8sc_t* VL_code_ptr = VL_code;
    lv_32fc_t* VL_out_ptr = VL_out;

    float VE_out_real = 0;
    float VE_out_imag = 0;
    float E_out_real = 0;
@ -127,7 +127,7 @@ static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_u_sse4_1(lv_3
    float L_out_imag = 0;
    float VL_out_real = 0;
    float VL_out_imag = 0;

    real_VE_code_acc = _mm_setzero_ps();
    imag_VE_code_acc = _mm_setzero_ps();
    real_E_code_acc = _mm_setzero_ps();
@ -138,126 +138,126 @@ static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_u_sse4_1(lv_3
|
||||
imag_L_code_acc = _mm_setzero_ps();
|
||||
real_VL_code_acc = _mm_setzero_ps();
|
||||
imag_VL_code_acc = _mm_setzero_ps();
|
||||
|
||||
if (sse_iters>0)
|
||||
{
|
||||
for(int number = 0;number < sse_iters; number++){
|
||||
|
||||
//Perform the carrier wipe-off
|
||||
x = _mm_lddqu_si128((__m128i*)input_ptr);
|
||||
y = _mm_lddqu_si128((__m128i*)carrier_ptr);
|
||||
|
||||
x_abs = _mm_abs_epi8 (x);
|
||||
|
||||
CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3(y, x, check_sign_sequence, rearrange_sequence, y_aux, x_abs, real_output, imag_output)
|
||||
|
||||
imag_output = _mm_slli_si128 (imag_output, 1);
|
||||
bb_signal_sample_aux = _mm_blendv_epi8 (imag_output, real_output, mult1);
|
||||
bb_signal_sample_aux_abs = _mm_abs_epi8 (bb_signal_sample_aux);
|
||||
|
||||
//Get very early values
|
||||
y = _mm_lddqu_si128((__m128i*)VE_code_ptr);
|
||||
|
||||
CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
|
||||
|
||||
real_VE_code_acc = _mm_add_ps (real_VE_code_acc, real_output_ps);
|
||||
imag_VE_code_acc = _mm_add_ps (imag_VE_code_acc, imag_output_ps);
|
||||
|
||||
//Get early values
|
||||
y = _mm_lddqu_si128((__m128i*)E_code_ptr);
|
||||
|
||||
CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
|
||||
|
||||
real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps);
|
||||
imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps);
|
||||
|
||||
//Get prompt values
|
||||
y = _mm_lddqu_si128((__m128i*)P_code_ptr);
|
||||
|
||||
CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
|
||||
|
||||
real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps);
|
||||
imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps);
|
||||
|
||||
//Get late values
|
||||
y = _mm_lddqu_si128((__m128i*)L_code_ptr);
|
||||
|
||||
CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
|
||||
|
||||
real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps);
|
||||
imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps);
|
||||
|
||||
//Get very late values
|
||||
y = _mm_lddqu_si128((__m128i*)VL_code_ptr);
|
||||
|
||||
CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
|
||||
|
||||
real_VL_code_acc = _mm_add_ps (real_VL_code_acc, real_output_ps);
|
||||
imag_VL_code_acc = _mm_add_ps (imag_VL_code_acc, imag_output_ps);
|
||||
|
||||
input_ptr += 8;
|
||||
carrier_ptr += 8;
|
||||
VE_code_ptr += 8;
|
||||
E_code_ptr += 8;
|
||||
P_code_ptr += 8;
|
||||
L_code_ptr += 8;
|
||||
VL_code_ptr += 8;
|
||||
}
|
||||
|
||||
__VOLK_ATTR_ALIGNED(16) float real_VE_dotProductVector[4];
|
||||
__VOLK_ATTR_ALIGNED(16) float imag_VE_dotProductVector[4];
|
||||
__VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4];
|
||||
__VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4];
|
||||
__VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4];
|
||||
__VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4];
|
||||
__VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4];
|
||||
__VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4];
|
||||
__VOLK_ATTR_ALIGNED(16) float real_VL_dotProductVector[4];
|
||||
__VOLK_ATTR_ALIGNED(16) float imag_VL_dotProductVector[4];
|
||||
|
||||
_mm_storeu_ps((float*)real_VE_dotProductVector,real_VE_code_acc); // Store the results back into the dot product vector
|
||||
_mm_storeu_ps((float*)imag_VE_dotProductVector,imag_VE_code_acc); // Store the results back into the dot product vector
|
||||
_mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector
|
||||
_mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector
|
||||
_mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector
|
||||
_mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector
|
||||
_mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector
|
||||
_mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector
|
||||
_mm_storeu_ps((float*)real_VL_dotProductVector,real_VL_code_acc); // Store the results back into the dot product vector
|
||||
_mm_storeu_ps((float*)imag_VL_dotProductVector,imag_VL_code_acc); // Store the results back into the dot product vector
|
||||
|
||||
for (int i = 0; i<4; ++i)
|
||||
if (sse_iters>0)
|
||||
{
|
||||
VE_out_real += real_VE_dotProductVector[i];
|
||||
VE_out_imag += imag_VE_dotProductVector[i];
|
||||
E_out_real += real_E_dotProductVector[i];
|
||||
E_out_imag += imag_E_dotProductVector[i];
|
||||
P_out_real += real_P_dotProductVector[i];
|
||||
P_out_imag += imag_P_dotProductVector[i];
|
||||
L_out_real += real_L_dotProductVector[i];
|
||||
L_out_imag += imag_L_dotProductVector[i];
|
||||
VL_out_real += real_VL_dotProductVector[i];
|
||||
VL_out_imag += imag_VL_dotProductVector[i];
|
||||
for(int number = 0;number < sse_iters; number++)
|
||||
{
|
||||
//Perform the carrier wipe-off
|
||||
x = _mm_lddqu_si128((__m128i*)input_ptr);
|
||||
y = _mm_lddqu_si128((__m128i*)carrier_ptr);
|
||||
|
||||
x_abs = _mm_abs_epi8 (x);
|
||||
|
||||
CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3(y, x, check_sign_sequence, rearrange_sequence, y_aux, x_abs, real_output, imag_output)
|
||||
|
||||
imag_output = _mm_slli_si128 (imag_output, 1);
|
||||
bb_signal_sample_aux = _mm_blendv_epi8 (imag_output, real_output, mult1);
|
||||
bb_signal_sample_aux_abs = _mm_abs_epi8 (bb_signal_sample_aux);
|
||||
|
||||
//Get very early values
|
||||
y = _mm_lddqu_si128((__m128i*)VE_code_ptr);
|
||||
|
||||
CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
|
||||
|
||||
real_VE_code_acc = _mm_add_ps (real_VE_code_acc, real_output_ps);
|
||||
imag_VE_code_acc = _mm_add_ps (imag_VE_code_acc, imag_output_ps);
|
||||
|
||||
//Get early values
|
||||
y = _mm_lddqu_si128((__m128i*)E_code_ptr);
|
||||
|
||||
CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
|
||||
|
||||
real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps);
|
||||
imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps);
|
||||
|
||||
//Get prompt values
|
||||
y = _mm_lddqu_si128((__m128i*)P_code_ptr);
|
||||
|
||||
CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
|
||||
|
||||
real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps);
|
||||
imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps);
|
||||
|
||||
//Get late values
|
||||
y = _mm_lddqu_si128((__m128i*)L_code_ptr);
|
||||
|
||||
CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
|
||||
|
||||
real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps);
|
||||
imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps);
|
||||
|
||||
//Get very late values
|
||||
y = _mm_lddqu_si128((__m128i*)VL_code_ptr);
|
||||
|
||||
CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
|
||||
|
||||
real_VL_code_acc = _mm_add_ps (real_VL_code_acc, real_output_ps);
|
||||
imag_VL_code_acc = _mm_add_ps (imag_VL_code_acc, imag_output_ps);
|
||||
|
||||
input_ptr += 8;
|
||||
carrier_ptr += 8;
|
||||
VE_code_ptr += 8;
|
||||
E_code_ptr += 8;
|
||||
P_code_ptr += 8;
|
||||
L_code_ptr += 8;
|
||||
VL_code_ptr += 8;
|
||||
}
|
||||
|
||||
__VOLK_ATTR_ALIGNED(16) float real_VE_dotProductVector[4];
|
||||
__VOLK_ATTR_ALIGNED(16) float imag_VE_dotProductVector[4];
|
||||
__VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4];
|
||||
__VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4];
|
||||
__VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4];
|
||||
__VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4];
|
||||
__VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4];
|
||||
__VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4];
|
||||
__VOLK_ATTR_ALIGNED(16) float real_VL_dotProductVector[4];
|
||||
__VOLK_ATTR_ALIGNED(16) float imag_VL_dotProductVector[4];
|
||||
|
||||
_mm_storeu_ps((float*)real_VE_dotProductVector,real_VE_code_acc); // Store the results back into the dot product vector
|
||||
_mm_storeu_ps((float*)imag_VE_dotProductVector,imag_VE_code_acc); // Store the results back into the dot product vector
|
||||
_mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector
|
||||
_mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector
|
||||
_mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector
|
||||
_mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector
|
||||
_mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector
|
||||
_mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector
|
||||
_mm_storeu_ps((float*)real_VL_dotProductVector,real_VL_code_acc); // Store the results back into the dot product vector
|
||||
_mm_storeu_ps((float*)imag_VL_dotProductVector,imag_VL_code_acc); // Store the results back into the dot product vector
|
||||
|
||||
for (int i = 0; i<4; ++i)
|
||||
{
|
||||
VE_out_real += real_VE_dotProductVector[i];
|
||||
VE_out_imag += imag_VE_dotProductVector[i];
|
||||
E_out_real += real_E_dotProductVector[i];
|
||||
E_out_imag += imag_E_dotProductVector[i];
|
||||
P_out_real += real_P_dotProductVector[i];
|
||||
P_out_imag += imag_P_dotProductVector[i];
|
||||
L_out_real += real_L_dotProductVector[i];
|
||||
L_out_imag += imag_L_dotProductVector[i];
|
||||
VL_out_real += real_VL_dotProductVector[i];
|
||||
VL_out_imag += imag_VL_dotProductVector[i];
|
||||
}
|
||||
*VE_out_ptr = lv_cmake(VE_out_real, VE_out_imag);
|
||||
*E_out_ptr = lv_cmake(E_out_real, E_out_imag);
|
||||
*P_out_ptr = lv_cmake(P_out_real, P_out_imag);
|
||||
*L_out_ptr = lv_cmake(L_out_real, L_out_imag);
|
||||
*VL_out_ptr = lv_cmake(VL_out_real, VL_out_imag);
|
||||
}
|
||||
*VE_out_ptr = lv_cmake(VE_out_real, VE_out_imag);
|
||||
*E_out_ptr = lv_cmake(E_out_real, E_out_imag);
|
||||
*P_out_ptr = lv_cmake(P_out_real, P_out_imag);
|
||||
*L_out_ptr = lv_cmake(L_out_real, L_out_imag);
|
||||
*VL_out_ptr = lv_cmake(VL_out_real, VL_out_imag);
|
||||
}
|
||||
|
||||
|
||||
lv_16sc_t bb_signal_sample;
|
||||
for(int i=0; i < num_points%8; ++i)
|
||||
{
|
||||
//Perform the carrier wipe-off
|
||||
bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
|
||||
// Now get very early, early, prompt, late and very late values for each
|
||||
*VE_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VE_code_ptr++));
|
||||
*E_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*E_code_ptr++));
|
||||
*P_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*P_code_ptr++));
|
||||
*L_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*L_code_ptr++));
|
||||
*VL_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VL_code_ptr++));
|
||||
}
|
||||
{
|
||||
//Perform the carrier wipe-off
|
||||
bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
|
||||
// Now get very early, early, prompt, late and very late values for each
|
||||
*VE_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VE_code_ptr++));
|
||||
*E_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*E_code_ptr++));
|
||||
*P_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*P_code_ptr++));
|
||||
*L_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*L_code_ptr++));
|
||||
*VL_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VL_code_ptr++));
|
||||
}
|
||||
}
|
||||
#endif /* LV_HAVE_SSE4_1 */
|
||||
|
||||
@ -288,18 +288,18 @@ static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_generic(lv_32
    *VL_out = 0;

    lv_16sc_t bb_signal_sample;

    for(unsigned int i=0; i < num_points; ++i)
    {
        //Perform the carrier wipe-off
        bb_signal_sample = input[i] * carrier[i];
        // Now get very early, early, prompt, late and very late values for each
        *VE_out += (lv_32fc_t) (bb_signal_sample * VE_code[i]);
        *E_out += (lv_32fc_t) (bb_signal_sample * E_code[i]);
        *P_out += (lv_32fc_t) (bb_signal_sample * P_code[i]);
        *L_out += (lv_32fc_t) (bb_signal_sample * L_code[i]);
        *VL_out += (lv_32fc_t) (bb_signal_sample * VL_code[i]);
    }
}
#endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_u_H */
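/* Editor's note (not part of the original header): a worked bound showing why the generic path
 * above widens the wiped-off sample to lv_16sc_t before multiplying by the code, under the
 * documented limits (|re|, |im| <= 7 for input/carrier and <= 127 for the codes):
 *
 *   |re(bb)| <= |re(in)*re(carr)| + |im(in)*im(carr)| <= 7*7 + 7*7 = 98
 *   |re(bb * code)| <= 98*127 + 98*127 = 24892 < 32767
 *
 * so the intermediate 16-bit complex product cannot overflow before it is accumulated into the
 * float32 outputs.
 */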
@ -337,20 +337,20 @@ static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_generic(lv_32
|
||||
static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_a_sse4_1(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
|
||||
{
|
||||
const unsigned int sse_iters = num_points / 8;
|
||||
|
||||
|
||||
__m128i x, x_abs, y, y_aux, bb_signal_sample_aux, bb_signal_sample_aux_abs;;
|
||||
__m128i real_output, imag_output;
|
||||
__m128 real_VE_code_acc, imag_VE_code_acc, real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc, real_VL_code_acc, imag_VL_code_acc;
|
||||
__m128i input_i_1, input_i_2, output_i32;
|
||||
__m128 real_output_ps, imag_output_ps;
|
||||
|
||||
|
||||
__m128i check_sign_sequence = _mm_set_epi8 (255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1);
|
||||
__m128i rearrange_sequence = _mm_set_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
|
||||
__m128i mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
|
||||
|
||||
|
||||
const lv_8sc_t* input_ptr = input;
|
||||
const lv_8sc_t* carrier_ptr = carrier;
|
||||
|
||||
|
||||
const lv_8sc_t* VE_code_ptr = VE_code;
|
||||
lv_32fc_t* VE_out_ptr = VE_out;
|
||||
const lv_8sc_t* E_code_ptr = E_code;
|
||||
@ -361,7 +361,7 @@ static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_a_sse4_1(lv_3
|
||||
lv_32fc_t* L_out_ptr = L_out;
|
||||
const lv_8sc_t* VL_code_ptr = VL_code;
|
||||
lv_32fc_t* VL_out_ptr = VL_out;
|
||||
|
||||
|
||||
float VE_out_real = 0;
|
||||
float VE_out_imag = 0;
|
||||
float E_out_real = 0;
|
||||
@ -372,7 +372,7 @@ static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_a_sse4_1(lv_3
|
||||
float L_out_imag = 0;
|
||||
float VL_out_real = 0;
|
||||
float VL_out_imag = 0;
|
||||
|
||||
|
||||
real_VE_code_acc = _mm_setzero_ps();
|
||||
imag_VE_code_acc = _mm_setzero_ps();
|
||||
real_E_code_acc = _mm_setzero_ps();
|
||||
@ -383,126 +383,126 @@ static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_a_sse4_1(lv_3
|
||||
imag_L_code_acc = _mm_setzero_ps();
|
||||
real_VL_code_acc = _mm_setzero_ps();
|
||||
imag_VL_code_acc = _mm_setzero_ps();
|
||||
|
||||
|
||||
if (sse_iters>0)
|
||||
{
|
||||
for(int number = 0;number < sse_iters; number++){
|
||||
|
||||
//Perform the carrier wipe-off
|
||||
x = _mm_load_si128((__m128i*)input_ptr);
|
||||
y = _mm_load_si128((__m128i*)carrier_ptr);
|
||||
|
||||
x_abs = _mm_abs_epi8 (x);
|
||||
|
||||
CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3(y, x, check_sign_sequence, rearrange_sequence, y_aux, x_abs, real_output, imag_output)
|
||||
|
||||
imag_output = _mm_slli_si128 (imag_output, 1);
|
||||
bb_signal_sample_aux = _mm_blendv_epi8 (imag_output, real_output, mult1);
|
||||
bb_signal_sample_aux_abs = _mm_abs_epi8 (bb_signal_sample_aux);
|
||||
|
||||
//Get very early values
|
||||
y = _mm_load_si128((__m128i*)VE_code_ptr);
|
||||
|
||||
CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
|
||||
|
||||
real_VE_code_acc = _mm_add_ps (real_VE_code_acc, real_output_ps);
|
||||
imag_VE_code_acc = _mm_add_ps (imag_VE_code_acc, imag_output_ps);
|
||||
|
||||
//Get early values
|
||||
y = _mm_load_si128((__m128i*)E_code_ptr);
|
||||
|
||||
CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
|
||||
|
||||
real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps);
|
||||
imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps);
|
||||
|
||||
//Get prompt values
|
||||
y = _mm_load_si128((__m128i*)P_code_ptr);
|
||||
|
||||
CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
|
||||
|
||||
real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps);
|
||||
imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps);
|
||||
|
||||
//Get late values
|
||||
y = _mm_load_si128((__m128i*)L_code_ptr);
|
||||
|
||||
CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
|
||||
|
||||
real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps);
|
||||
imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps);
|
||||
|
||||
//Get very late values
|
||||
y = _mm_load_si128((__m128i*)VL_code_ptr);
|
||||
|
||||
CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
|
||||
|
||||
real_VL_code_acc = _mm_add_ps (real_VL_code_acc, real_output_ps);
|
||||
imag_VL_code_acc = _mm_add_ps (imag_VL_code_acc, imag_output_ps);
|
||||
|
||||
input_ptr += 8;
|
||||
carrier_ptr += 8;
|
||||
VE_code_ptr += 8;
|
||||
E_code_ptr += 8;
|
||||
P_code_ptr += 8;
|
||||
L_code_ptr += 8;
|
||||
VL_code_ptr += 8;
|
||||
}
|
||||
|
||||
__VOLK_ATTR_ALIGNED(16) float real_VE_dotProductVector[4];
|
||||
__VOLK_ATTR_ALIGNED(16) float imag_VE_dotProductVector[4];
|
||||
__VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4];
|
||||
__VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4];
|
||||
__VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4];
|
||||
__VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4];
|
||||
__VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4];
|
||||
__VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4];
|
||||
__VOLK_ATTR_ALIGNED(16) float real_VL_dotProductVector[4];
|
||||
__VOLK_ATTR_ALIGNED(16) float imag_VL_dotProductVector[4];
|
||||
|
||||
_mm_store_ps((float*)real_VE_dotProductVector,real_VE_code_acc); // Store the results back into the dot product vector
|
||||
_mm_store_ps((float*)imag_VE_dotProductVector,imag_VE_code_acc); // Store the results back into the dot product vector
|
||||
_mm_store_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector
|
||||
_mm_store_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector
|
||||
_mm_store_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector
|
||||
_mm_store_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector
|
||||
_mm_store_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector
|
||||
_mm_store_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector
|
||||
_mm_store_ps((float*)real_VL_dotProductVector,real_VL_code_acc); // Store the results back into the dot product vector
|
||||
_mm_store_ps((float*)imag_VL_dotProductVector,imag_VL_code_acc); // Store the results back into the dot product vector
|
||||
|
||||
for (int i = 0; i<4; ++i)
|
||||
{
|
||||
VE_out_real += real_VE_dotProductVector[i];
|
||||
VE_out_imag += imag_VE_dotProductVector[i];
|
||||
E_out_real += real_E_dotProductVector[i];
|
||||
E_out_imag += imag_E_dotProductVector[i];
|
||||
P_out_real += real_P_dotProductVector[i];
|
||||
P_out_imag += imag_P_dotProductVector[i];
|
||||
L_out_real += real_L_dotProductVector[i];
|
||||
L_out_imag += imag_L_dotProductVector[i];
|
||||
VL_out_real += real_VL_dotProductVector[i];
|
||||
VL_out_imag += imag_VL_dotProductVector[i];
|
||||
for(int number = 0;number < sse_iters; number++)
{
//Perform the carrier wipe-off
x = _mm_load_si128((__m128i*)input_ptr);
y = _mm_load_si128((__m128i*)carrier_ptr);

x_abs = _mm_abs_epi8 (x);

CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3(y, x, check_sign_sequence, rearrange_sequence, y_aux, x_abs, real_output, imag_output)

imag_output = _mm_slli_si128 (imag_output, 1);
bb_signal_sample_aux = _mm_blendv_epi8 (imag_output, real_output, mult1);
bb_signal_sample_aux_abs = _mm_abs_epi8 (bb_signal_sample_aux);

//Get very early values
y = _mm_load_si128((__m128i*)VE_code_ptr);

CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)

real_VE_code_acc = _mm_add_ps (real_VE_code_acc, real_output_ps);
imag_VE_code_acc = _mm_add_ps (imag_VE_code_acc, imag_output_ps);

//Get early values
y = _mm_load_si128((__m128i*)E_code_ptr);

CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)

real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps);
imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps);

//Get prompt values
y = _mm_load_si128((__m128i*)P_code_ptr);

CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)

real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps);
imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps);

//Get late values
y = _mm_load_si128((__m128i*)L_code_ptr);

CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)

real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps);
imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps);

//Get very late values
y = _mm_load_si128((__m128i*)VL_code_ptr);

CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)

real_VL_code_acc = _mm_add_ps (real_VL_code_acc, real_output_ps);
imag_VL_code_acc = _mm_add_ps (imag_VL_code_acc, imag_output_ps);

input_ptr += 8;
carrier_ptr += 8;
VE_code_ptr += 8;
E_code_ptr += 8;
P_code_ptr += 8;
L_code_ptr += 8;
VL_code_ptr += 8;
}

__VOLK_ATTR_ALIGNED(16) float real_VE_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_VE_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float real_VL_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_VL_dotProductVector[4];

_mm_store_ps((float*)real_VE_dotProductVector,real_VE_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)imag_VE_dotProductVector,imag_VE_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)real_VL_dotProductVector,real_VL_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)imag_VL_dotProductVector,imag_VL_code_acc); // Store the results back into the dot product vector

for (int i = 0; i<4; ++i)
{
VE_out_real += real_VE_dotProductVector[i];
VE_out_imag += imag_VE_dotProductVector[i];
E_out_real += real_E_dotProductVector[i];
E_out_imag += imag_E_dotProductVector[i];
P_out_real += real_P_dotProductVector[i];
P_out_imag += imag_P_dotProductVector[i];
L_out_real += real_L_dotProductVector[i];
L_out_imag += imag_L_dotProductVector[i];
VL_out_real += real_VL_dotProductVector[i];
VL_out_imag += imag_VL_dotProductVector[i];
}
*VE_out_ptr = lv_cmake(VE_out_real, VE_out_imag);
*E_out_ptr = lv_cmake(E_out_real, E_out_imag);
*P_out_ptr = lv_cmake(P_out_real, P_out_imag);
*L_out_ptr = lv_cmake(L_out_real, L_out_imag);
*VL_out_ptr = lv_cmake(VL_out_real, VL_out_imag);
}
*VE_out_ptr = lv_cmake(VE_out_real, VE_out_imag);
*E_out_ptr = lv_cmake(E_out_real, E_out_imag);
*P_out_ptr = lv_cmake(P_out_real, P_out_imag);
*L_out_ptr = lv_cmake(L_out_real, L_out_imag);
*VL_out_ptr = lv_cmake(VL_out_real, VL_out_imag);
}

lv_16sc_t bb_signal_sample;
for(int i=0; i < num_points%8; ++i)
{
//Perform the carrier wipe-off
bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
// Now get very early, early, prompt, late and very late values for each
*VE_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VE_code_ptr++));
*E_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*E_code_ptr++));
*P_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*P_code_ptr++));
*L_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*L_code_ptr++));
*VL_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VL_code_ptr++));
}
{
//Perform the carrier wipe-off
bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
// Now get very early, early, prompt, late and very late values for each
*VE_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VE_code_ptr++));
*E_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*E_code_ptr++));
*P_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*P_code_ptr++));
*L_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*L_code_ptr++));
*VL_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VL_code_ptr++));
}
}
#endif /* LV_HAVE_SSE4_1 */
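
Each of the ten accumulators above (the real and imaginary parts of VE, E, P, L and VL) is a four-lane __m128 that the kernel reduces after the main loop by storing it into a 16-byte-aligned float[4] and summing the four lanes. A minimal sketch of that horizontal reduction for a single accumulator; the helper name is hypothetical and not part of the kernel, and an unaligned store is used so the sketch needs no alignment attribute:

/* Illustration only: the horizontal reduction pattern used by the kernel above. */
#include <xmmintrin.h>

static inline float horizontal_sum_m128(__m128 acc)
{
    float lanes[4];
    _mm_storeu_ps(lanes, acc);  /* dump the four lanes to memory */
    return lanes[0] + lanes[1] + lanes[2] + lanes[3];
}

/* e.g., E_out_real would be horizontal_sum_m128(real_E_code_acc). */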

@ -531,20 +531,20 @@ static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_a_generic(lv_
*P_out = 0;
*L_out = 0;
*VL_out = 0;

lv_16sc_t bb_signal_sample;

for(unsigned int i=0; i < num_points; ++i)
{
//Perform the carrier wipe-off
bb_signal_sample = input[i] * carrier[i];
// Now get very early, early, prompt, late and very late values for each
*VE_out += (lv_32fc_t) (bb_signal_sample * VE_code[i]);
*E_out += (lv_32fc_t) (bb_signal_sample * E_code[i]);
*P_out += (lv_32fc_t) (bb_signal_sample * P_code[i]);
*L_out += (lv_32fc_t) (bb_signal_sample * L_code[i]);
*VL_out += (lv_32fc_t) (bb_signal_sample * VL_code[i]);
}
{
//Perform the carrier wipe-off
bb_signal_sample = input[i] * carrier[i];
// Now get very early, early, prompt, late and very late values for each
*VE_out += (lv_32fc_t) (bb_signal_sample * VE_code[i]);
*E_out += (lv_32fc_t) (bb_signal_sample * E_code[i]);
*P_out += (lv_32fc_t) (bb_signal_sample * P_code[i]);
*L_out += (lv_32fc_t) (bb_signal_sample * L_code[i]);
*VL_out += (lv_32fc_t) (bb_signal_sample * VL_code[i]);
}
}
#endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_a_H */

@ -2,7 +2,7 @@
* \file volk_gnsssdr_8u_x2_multiply_8u.h
* \brief Volk protokernel: multiplies unsigned char values
* \authors <ul>
* <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
* <li> Andres Cecilia, 2014. a.cecilia.luque(at)gmail.com
* </ul>
*
* Volk protokernel that multiplies unsigned char values (8 bits data)
@ -19,7 +19,7 @@
* GNSS-SDR is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* at your option) any later version.
* (at your option) any later version.
*
* GNSS-SDR is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
@ -41,53 +41,54 @@
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
/*!
\brief Multiplies the two input unsigned char values and stores their results in the third unisgned char
\brief Multiplies the two input unsigned char values and stores their results in the third unsigned char
\param cChar The unsigned char where the results will be stored
\param aChar One of the unsigned char to be multiplied
\param bChar One of the unsigned char to be multiplied
\param num_points The number of unsigned char values in aChar and bChar to be multiplied together and stored into cChar
*/
static inline void volk_gnsssdr_8u_x2_multiply_8u_u_sse3(unsigned char* cChar, const unsigned char* aChar, const unsigned char* bChar, unsigned int num_points){

static inline void volk_gnsssdr_8u_x2_multiply_8u_u_sse3(unsigned char* cChar, const unsigned char* aChar, const unsigned char* bChar, unsigned int num_points)
{
const unsigned int sse_iters = num_points / 16;

__m128i x, y, x1, x2, y1, y2, mult1, x1_mult_y1, x2_mult_y2, tmp, tmp1, tmp2, totalc;
unsigned char* c = cChar;
const unsigned char* a = aChar;
const unsigned char* b = bChar;

for(unsigned int number = 0;number < sse_iters; number++){
x = _mm_lddqu_si128((__m128i*)a);
y = _mm_lddqu_si128((__m128i*)b);

mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
x1 = _mm_srli_si128 (x, 1);
x1 = _mm_and_si128 (x1, mult1);
x2 = _mm_and_si128 (x, mult1);

y1 = _mm_srli_si128 (y, 1);
y1 = _mm_and_si128 (y1, mult1);
y2 = _mm_and_si128 (y, mult1);

x1_mult_y1 = _mm_mullo_epi16 (x1, y1);
x2_mult_y2 = _mm_mullo_epi16 (x2, y2);

tmp = _mm_and_si128 (x1_mult_y1, mult1);
tmp1 = _mm_slli_si128 (tmp, 1);
tmp2 = _mm_and_si128 (x2_mult_y2, mult1);
totalc = _mm_or_si128 (tmp1, tmp2);

_mm_storeu_si128((__m128i*)c, totalc);

a += 16;
b += 16;
c += 16;
}

for(unsigned int number = 0;number < sse_iters; number++)
{
x = _mm_lddqu_si128((__m128i*)a);
y = _mm_lddqu_si128((__m128i*)b);

mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
x1 = _mm_srli_si128 (x, 1);
x1 = _mm_and_si128 (x1, mult1);
x2 = _mm_and_si128 (x, mult1);

y1 = _mm_srli_si128 (y, 1);
y1 = _mm_and_si128 (y1, mult1);
y2 = _mm_and_si128 (y, mult1);

x1_mult_y1 = _mm_mullo_epi16 (x1, y1);
x2_mult_y2 = _mm_mullo_epi16 (x2, y2);

tmp = _mm_and_si128 (x1_mult_y1, mult1);
tmp1 = _mm_slli_si128 (tmp, 1);
tmp2 = _mm_and_si128 (x2_mult_y2, mult1);
totalc = _mm_or_si128 (tmp1, tmp2);

_mm_storeu_si128((__m128i*)c, totalc);

a += 16;
b += 16;
c += 16;
}

for (unsigned int i = 0; i<(num_points % 16); ++i)
{
*c++ = (*a++) * (*b++);
}
{
*c++ = (*a++) * (*b++);
}
}
#endif /* LV_HAVE_SSE3 */
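
SSE provides no packed 8-bit multiply, so the kernel above builds one: mult1 keeps the even-indexed bytes (the low byte of every 16-bit lane), a one-byte right shift brings the odd-indexed bytes into those lanes, both halves are multiplied as 16-bit lanes with _mm_mullo_epi16, only the low byte of each product is kept, and the two halves are re-interleaved with a one-byte left shift and an OR. A scalar sketch of the same even/odd decomposition, for illustration only; it produces exactly c[i] = (unsigned char)(a[i] * b[i]):

/* Scalar sketch (illustration only) of the even/odd lane trick used above. */
#include <stddef.h>

static void multiply_8u_even_odd_sketch(unsigned char* c, const unsigned char* a,
                                        const unsigned char* b, size_t n)
{
    size_t i;
    for (i = 0; i + 1 < n; i += 2)
    {
        unsigned int even = (unsigned int)a[i] * (unsigned int)b[i];         /* lanes kept by mult1 */
        unsigned int odd  = (unsigned int)a[i + 1] * (unsigned int)b[i + 1]; /* lanes reached via _mm_srli_si128 */
        c[i]     = (unsigned char)(even & 0xFF); /* keep only the low byte, as the final masks do */
        c[i + 1] = (unsigned char)(odd  & 0xFF);
    }
    if (n & 1)
    {
        c[n - 1] = (unsigned char)(a[n - 1] * b[n - 1]);
    }
}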

@ -99,14 +100,16 @@ static inline void volk_gnsssdr_8u_x2_multiply_8u_u_sse3(unsigned char* cChar, c
\param bChar One of the unsigned char to be multiplied
\param num_points The number of unsigned char values in aChar and bChar to be multiplied together and stored into cChar
*/
static inline void volk_gnsssdr_8u_x2_multiply_8u_generic(unsigned char* cChar, const unsigned char* aChar, const unsigned char* bChar, unsigned int num_points){
static inline void volk_gnsssdr_8u_x2_multiply_8u_generic(unsigned char* cChar, const unsigned char* aChar, const unsigned char* bChar, unsigned int num_points)
{
unsigned char* cPtr = cChar;
const unsigned char* aPtr = aChar;
const unsigned char* bPtr = bChar;

for(unsigned int number = 0; number < num_points; number++){
*cPtr++ = (*aPtr++) * (*bPtr++);
}

for(unsigned int number = 0; number < num_points; number++)
{
*cPtr++ = (*aPtr++) * (*bPtr++);
}
}
#endif /* LV_HAVE_GENERIC */
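
A minimal usage sketch for the generic kernel above (illustration only). It assumes the header volk_gnsssdr_8u_x2_multiply_8u.h is on the include path and that LV_HAVE_GENERIC is defined by the build, as the volk_gnsssdr machinery normally arranges. Note that each product is stored in an unsigned char, so it is truncated to its low 8 bits, just like the SIMD paths:

/* Usage sketch; include path and LV_HAVE_GENERIC are assumed to come from the build. */
#include <stdio.h>
#include "volk_gnsssdr_8u_x2_multiply_8u.h"

int main(void)
{
    unsigned char a[4] = { 2, 3, 16, 200 };
    unsigned char b[4] = { 5, 7, 16, 2 };
    unsigned char c[4];

    volk_gnsssdr_8u_x2_multiply_8u_generic(c, a, b, 4);

    /* 16*16 = 256 wraps to 0 and 200*2 = 400 wraps to 144: prints 10 21 0 144 */
    for (unsigned int i = 0; i < 4; ++i)
    {
        printf("%u ", (unsigned int)c[i]);
    }
    printf("\n");
    return 0;
}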

@ -129,47 +132,48 @@ static inline void volk_gnsssdr_8u_x2_multiply_8u_generic(unsigned char* cChar,
\param bChar One of the unsigned char to be multiplied
\param num_points The number of unsigned char values in aChar and bChar to be multiplied together and stored into cChar
*/
static inline void volk_gnsssdr_8u_x2_multiply_8u_a_sse3(unsigned char* cChar, const unsigned char* aChar, const unsigned char* bChar, unsigned int num_points){

static inline void volk_gnsssdr_8u_x2_multiply_8u_a_sse3(unsigned char* cChar, const unsigned char* aChar, const unsigned char* bChar, unsigned int num_points)
{
const unsigned int sse_iters = num_points / 16;

__m128i x, y, x1, x2, y1, y2, mult1, x1_mult_y1, x2_mult_y2, tmp, tmp1, tmp2, totalc;
unsigned char* c = cChar;
const unsigned char* a = aChar;
const unsigned char* b = bChar;

for(unsigned int number = 0;number < sse_iters; number++){
x = _mm_load_si128((__m128i*)a);
y = _mm_load_si128((__m128i*)b);

mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
x1 = _mm_srli_si128 (x, 1);
x1 = _mm_and_si128 (x1, mult1);
x2 = _mm_and_si128 (x, mult1);

y1 = _mm_srli_si128 (y, 1);
y1 = _mm_and_si128 (y1, mult1);
y2 = _mm_and_si128 (y, mult1);

x1_mult_y1 = _mm_mullo_epi16 (x1, y1);
x2_mult_y2 = _mm_mullo_epi16 (x2, y2);

tmp = _mm_and_si128 (x1_mult_y1, mult1);
tmp1 = _mm_slli_si128 (tmp, 1);
tmp2 = _mm_and_si128 (x2_mult_y2, mult1);
totalc = _mm_or_si128 (tmp1, tmp2);

_mm_store_si128((__m128i*)c, totalc);

a += 16;
b += 16;
c += 16;
}

for(unsigned int number = 0;number < sse_iters; number++)
{
x = _mm_load_si128((__m128i*)a);
y = _mm_load_si128((__m128i*)b);

mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
x1 = _mm_srli_si128 (x, 1);
x1 = _mm_and_si128 (x1, mult1);
x2 = _mm_and_si128 (x, mult1);

y1 = _mm_srli_si128 (y, 1);
y1 = _mm_and_si128 (y1, mult1);
y2 = _mm_and_si128 (y, mult1);

x1_mult_y1 = _mm_mullo_epi16 (x1, y1);
x2_mult_y2 = _mm_mullo_epi16 (x2, y2);

tmp = _mm_and_si128 (x1_mult_y1, mult1);
tmp1 = _mm_slli_si128 (tmp, 1);
tmp2 = _mm_and_si128 (x2_mult_y2, mult1);
totalc = _mm_or_si128 (tmp1, tmp2);

_mm_store_si128((__m128i*)c, totalc);

a += 16;
b += 16;
c += 16;
}

for (unsigned int i = 0; i<(num_points % 16); ++i)
{
*c++ = (*a++) * (*b++);
}
{
*c++ = (*a++) * (*b++);
}
}
#endif /* LV_HAVE_SSE */
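
The _a_sse3 version above differs from the unaligned _u_sse3 one only in using _mm_load_si128/_mm_store_si128, which require cChar, aChar and bChar to be 16-byte aligned; passing unaligned pointers to it can fault. A hedged sketch of obtaining suitably aligned buffers with POSIX posix_memalign (the library may provide its own aligned allocator, but this sketch stays with standard calls; the helper name is hypothetical):

/* Sketch only: allocate 16-byte-aligned buffers before calling the aligned kernel. */
#include <stdlib.h>

static int alloc_aligned_buffers(unsigned char** a, unsigned char** b, unsigned char** c,
                                 unsigned int num_points)
{
    if (posix_memalign((void**)a, 16, num_points) != 0) return -1;
    if (posix_memalign((void**)b, 16, num_points) != 0) { free(*a); return -1; }
    if (posix_memalign((void**)c, 16, num_points) != 0) { free(*a); free(*b); return -1; }
    return 0;
    /* ...fill *a and *b, then call:
     * volk_gnsssdr_8u_x2_multiply_8u_a_sse3(*c, *a, *b, num_points); */
}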

@ -181,14 +185,16 @@ static inline void volk_gnsssdr_8u_x2_multiply_8u_a_sse3(unsigned char* cChar, c
\param bChar One of the unsigned char to be multiplied
\param num_points The number of unsigned char values in aChar and bChar to be multiplied together and stored into cChar
*/
static inline void volk_gnsssdr_8u_x2_multiply_8u_a_generic(unsigned char* cChar, const unsigned char* aChar, const unsigned char* bChar, unsigned int num_points){
static inline void volk_gnsssdr_8u_x2_multiply_8u_a_generic(unsigned char* cChar, const unsigned char* aChar, const unsigned char* bChar, unsigned int num_points)
{
unsigned char* cPtr = cChar;
const unsigned char* aPtr = aChar;
const unsigned char* bPtr = bChar;

for(unsigned int number = 0; number < num_points; number++){
*cPtr++ = (*aPtr++) * (*bPtr++);
}

for(unsigned int number = 0; number < num_points; number++)
{
*cPtr++ = (*aPtr++) * (*bPtr++);
}
}
#endif /* LV_HAVE_GENERIC */

@ -201,7 +207,8 @@ static inline void volk_gnsssdr_8u_x2_multiply_8u_a_generic(unsigned char* cChar
\param num_points The number of unsigned char values in aChar and bChar to be multiplied together and stored into cChar
*/
extern void volk_gnsssdr_8u_x2_multiply_8u_a_orc_impl(unsigned char* cVector, const unsigned char* aVector, const unsigned char* bVector, unsigned int num_points);
static inline void volk_gnsssdr_8u_x2_multiply_8u_u_orc(unsigned char* cVector, const unsigned char* aVector, const unsigned char* bVector, unsigned int num_points){
static inline void volk_gnsssdr_8u_x2_multiply_8u_u_orc(unsigned char* cVector, const unsigned char* aVector, const unsigned char* bVector, unsigned int num_points)
{
volk_gnsssdr_8u_x2_multiply_8u_a_orc_impl(cVector, aVector, bVector, num_points);
}
#endif /* LV_HAVE_ORC */

File diff suppressed because it is too large