1
0
mirror of https://github.com/gnss-sdr/gnss-sdr synced 2025-01-16 20:23:02 +00:00

Merge branch 'c98' into next

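Now VOLK_GNSSSDR is built with the compiler's default C standard instead of forcing C11: the -std=c11 flag is dropped and loop counters are declared before the loops ("c98" presumably refers to C89/C90, since there is no C98 standard).
This allows the use of more compilers.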
This commit is contained in:
Carles Fernandez 2016-05-31 22:46:02 +02:00
commit d490191dbc
32 changed files with 844 additions and 674 deletions

View File

@ -27,8 +27,8 @@ enable_language(CXX)
enable_language(C)
enable_testing()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=c11 -Wall")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall")
option(ENABLE_STRIP "Create a stripped volk_gnsssdr_profile binary (without shared libraries)" OFF)

View File

@ -62,7 +62,8 @@
static inline void volk_gnsssdr_16ic_convert_32fc_generic(lv_32fc_t* outputVector, const lv_16sc_t* inputVector, unsigned int num_points)
{
for(unsigned int i = 0; i < num_points; i++)
unsigned int i;
for(i = 0; i < num_points; i++)
{
outputVector[i] = lv_cmake((float)lv_creal(inputVector[i]), (float)lv_cimag(inputVector[i]));
}
@ -76,22 +77,19 @@ static inline void volk_gnsssdr_16ic_convert_32fc_generic(lv_32fc_t* outputVecto
static inline void volk_gnsssdr_16ic_convert_32fc_a_sse2(lv_32fc_t* outputVector, const lv_16sc_t* inputVector, unsigned int num_points)
{
const unsigned int sse_iters = num_points / 2;
unsigned int i;
const lv_16sc_t* _in = inputVector;
lv_32fc_t* _out = outputVector;
__m128 a;
for(unsigned int number = 0; number < sse_iters; number++)
for(i = 0; i < sse_iters; i++)
{
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
_mm_store_ps((float*)_out, a);
_in += 2;
_out += 2;
//*_out++ = lv_cmake((float)lv_creal(*_in),(float)lv_cimag(*_in));
//_in++;
//*_out++ = lv_cmake((float)lv_creal(*_in),(float)lv_cimag(*_in));
//_in++;
}
for (unsigned int i = 0; i < (num_points % 2); ++i)
for (i = 0; i < (num_points % 2); ++i)
{
*_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in));
_in++;
@ -106,18 +104,19 @@ static inline void volk_gnsssdr_16ic_convert_32fc_a_sse2(lv_32fc_t* outputVector
static inline void volk_gnsssdr_16ic_convert_32fc_u_sse2(lv_32fc_t* outputVector, const lv_16sc_t* inputVector, unsigned int num_points)
{
const unsigned int sse_iters = num_points / 2;
unsigned int i;
const lv_16sc_t* _in = inputVector;
lv_32fc_t* _out = outputVector;
__m128 a;
for(unsigned int number = 0; number < sse_iters; number++)
for(i = 0; i < sse_iters; i++)
{
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
_mm_storeu_ps((float*)_out, a);
_in += 2;
_out += 2;
}
for (unsigned int i = 0; i < (num_points % 2); ++i)
for (i = 0; i < (num_points % 2); ++i)
{
*_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in));
_in++;
@ -132,11 +131,12 @@ static inline void volk_gnsssdr_16ic_convert_32fc_u_sse2(lv_32fc_t* outputVector
static inline void volk_gnsssdr_16ic_convert_32fc_u_axv(lv_32fc_t* outputVector, const lv_16sc_t* inputVector, unsigned int num_points)
{
const unsigned int sse_iters = num_points / 4;
unsigned int i;
const lv_16sc_t* _in = inputVector;
lv_32fc_t* _out = outputVector;
__m256 a;
for(unsigned int number = 0; number < sse_iters; number++)
for(i = 0; i < sse_iters; i++)
{
a = _mm256_set_ps((float)(lv_cimag(_in[3])), (float)(lv_creal(_in[3])), (float)(lv_cimag(_in[2])), (float)(lv_creal(_in[2])), (float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
_mm256_storeu_ps((float*)_out, a);
@ -144,7 +144,7 @@ static inline void volk_gnsssdr_16ic_convert_32fc_u_axv(lv_32fc_t* outputVector,
_out += 4;
}
_mm256_zeroupper();
for (unsigned int i = 0; i < (num_points % 4); ++i)
for(i = 0; i < (num_points % 4); ++i)
{
*_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in));
_in++;
@ -158,11 +158,12 @@ static inline void volk_gnsssdr_16ic_convert_32fc_u_axv(lv_32fc_t* outputVector,
static inline void volk_gnsssdr_16ic_convert_32fc_a_axv(lv_32fc_t* outputVector, const lv_16sc_t* inputVector, unsigned int num_points)
{
const unsigned int sse_iters = num_points / 4;
unsigned int i;
const lv_16sc_t* _in = inputVector;
lv_32fc_t* _out = outputVector;
__m256 a;
for(unsigned int number = 0; number < sse_iters; number++)
for(i = 0; i < sse_iters; i++)
{
a = _mm256_set_ps((float)(lv_cimag(_in[3])), (float)(lv_creal(_in[3])), (float)(lv_cimag(_in[2])), (float)(lv_creal(_in[2])), (float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
_mm256_store_ps((float*)_out, a);
@ -170,7 +171,7 @@ static inline void volk_gnsssdr_16ic_convert_32fc_a_axv(lv_32fc_t* outputVector,
_out += 4;
}
_mm256_zeroupper();
for (unsigned int i = 0; i < (num_points % 4); ++i)
for(i = 0; i < (num_points % 4); ++i)
{
*_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in));
_in++;
@ -185,7 +186,7 @@ static inline void volk_gnsssdr_16ic_convert_32fc_a_axv(lv_32fc_t* outputVector,
static inline void volk_gnsssdr_16ic_convert_32fc_neon(lv_32fc_t* outputVector, const lv_16sc_t* inputVector, unsigned int num_points)
{
const unsigned int sse_iters = num_points / 2;
unsigned int i;
const lv_16sc_t* _in = inputVector;
lv_32fc_t* _out = outputVector;
@ -193,7 +194,7 @@ static inline void volk_gnsssdr_16ic_convert_32fc_neon(lv_32fc_t* outputVector,
int32x4_t a32x4;
float32x4_t f32x4;
for(unsigned int number = 0; number < sse_iters; number++)
for(i = 0; i < sse_iters; i++)
{
a16x4 = vld1_s16((const int16_t*)_in);
__builtin_prefetch(_in + 4);
@ -203,7 +204,7 @@ static inline void volk_gnsssdr_16ic_convert_32fc_neon(lv_32fc_t* outputVector,
_in += 2;
_out += 2;
}
for (unsigned int i = 0; i < (num_points % 2); ++i)
for (i = 0; i < (num_points % 2); ++i)
{
*_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in));
_in++;

View File

@ -71,8 +71,9 @@
static inline void volk_gnsssdr_16ic_resampler_fast_16ic_generic(lv_16sc_t* result, const lv_16sc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, int code_length_chips, unsigned int num_output_samples)
{
int local_code_chip_index;
unsigned int n;
//fesetround(FE_TONEAREST);
for (unsigned int n = 0; n < num_output_samples; n++)
for (n = 0; n < num_output_samples; n++)
{
// resample code for current tap
local_code_chip_index = round(code_phase_step_chips * (float)n + rem_code_phase_chips - 0.5f);

View File

@ -47,10 +47,11 @@ static inline void volk_gnsssdr_16ic_resamplerfastxnpuppet_16ic_generic(lv_16sc_
float code_phase_step_chips = 0.1;
int code_length_chips = 2046;
int num_out_vectors = 3;
unsigned int n;
float* rem_code_phase_chips = (float*)volk_gnsssdr_malloc(sizeof(float) * num_out_vectors, volk_gnsssdr_get_alignment());
lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
for(unsigned int n = 0; n < num_out_vectors; n++)
for(n = 0; n < num_out_vectors; n++)
{
rem_code_phase_chips[n] = -0.234;
result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
@ -59,7 +60,7 @@ static inline void volk_gnsssdr_16ic_resamplerfastxnpuppet_16ic_generic(lv_16sc_
memcpy((lv_16sc_t*)result, (lv_16sc_t*)result_aux[0], sizeof(lv_16sc_t) * num_points);
volk_gnsssdr_free(rem_code_phase_chips);
for(unsigned int n = 0; n < num_out_vectors; n++)
for(n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
@ -75,9 +76,11 @@ static inline void volk_gnsssdr_16ic_resamplerfastxnpuppet_16ic_a_sse2(lv_16sc_t
float code_phase_step_chips = 0.1;
int code_length_chips = 2046;
int num_out_vectors = 3;
unsigned int n;
float * rem_code_phase_chips = (float*)volk_gnsssdr_malloc(sizeof(float) * num_out_vectors, volk_gnsssdr_get_alignment());
lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
for(unsigned int n = 0; n < num_out_vectors; n++)
for(n = 0; n < num_out_vectors; n++)
{
rem_code_phase_chips[n] = -0.234;
result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
@ -86,7 +89,7 @@ static inline void volk_gnsssdr_16ic_resamplerfastxnpuppet_16ic_a_sse2(lv_16sc_t
memcpy(result, result_aux[0], sizeof(lv_16sc_t) * num_points);
volk_gnsssdr_free(rem_code_phase_chips);
for(unsigned int n = 0; n < num_out_vectors; n++)
for(n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
@ -102,9 +105,11 @@ static inline void volk_gnsssdr_16ic_resamplerfastxnpuppet_16ic_u_sse2(lv_16sc_t
float code_phase_step_chips = 0.1;
int code_length_chips = 2046;
int num_out_vectors = 3;
unsigned int n;
float * rem_code_phase_chips = (float*)volk_gnsssdr_malloc(sizeof(float) * num_out_vectors, volk_gnsssdr_get_alignment());
lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
for(unsigned int n = 0; n < num_out_vectors; n++)
for(n = 0; n < num_out_vectors; n++)
{
rem_code_phase_chips[n] = -0.234;
result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
@ -113,7 +118,7 @@ static inline void volk_gnsssdr_16ic_resamplerfastxnpuppet_16ic_u_sse2(lv_16sc_t
memcpy(result, result_aux[0], sizeof(lv_16sc_t) * num_points);
volk_gnsssdr_free(rem_code_phase_chips);
for(unsigned int n = 0; n < num_out_vectors; n++)
for(n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
@ -129,9 +134,11 @@ static inline void volk_gnsssdr_16ic_resamplerfastxnpuppet_16ic_neon(lv_16sc_t*
float code_phase_step_chips = 0.1;
int code_length_chips = 2046;
int num_out_vectors = 3;
unsigned int n;
float * rem_code_phase_chips = (float*)volk_gnsssdr_malloc(sizeof(float) * num_out_vectors, volk_gnsssdr_get_alignment());
lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
for(unsigned int n = 0; n < num_out_vectors; n++)
for(n = 0; n < num_out_vectors; n++)
{
rem_code_phase_chips[n] = -0.234;
result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
@ -140,7 +147,7 @@ static inline void volk_gnsssdr_16ic_resamplerfastxnpuppet_16ic_neon(lv_16sc_t*
memcpy(result, result_aux[0], sizeof(lv_16sc_t) * num_points);
volk_gnsssdr_free(rem_code_phase_chips);
for(unsigned int n = 0; n < num_out_vectors; n++)
for(n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}

View File

@ -47,12 +47,12 @@ static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_generic(lv_16sc_t* r
float code_phase_step_chips = -0.6;
int code_length_chips = 2046;
int num_out_vectors = 3;
unsigned int n;
float rem_code_phase_chips = -0.234;
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
for(unsigned int n = 0; n < num_out_vectors; n++)
for(n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
}
@ -61,7 +61,7 @@ static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_generic(lv_16sc_t* r
memcpy((lv_16sc_t*)result, (lv_16sc_t*)result_aux[0], sizeof(lv_16sc_t) * num_points);
for(unsigned int n = 0; n < num_out_vectors; n++)
for(n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
@ -78,11 +78,11 @@ static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_a_sse3(lv_16sc_t* re
int code_length_chips = 2046;
int num_out_vectors = 3;
float rem_code_phase_chips = -0.234;
unsigned int n;
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
for(unsigned int n = 0; n < num_out_vectors; n++)
for(n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
}
@ -91,7 +91,7 @@ static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_a_sse3(lv_16sc_t* re
memcpy((lv_16sc_t*)result, (lv_16sc_t*)result_aux[0], sizeof(lv_16sc_t) * num_points);
for(unsigned int n = 0; n < num_out_vectors; n++)
for(n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
@ -107,11 +107,11 @@ static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_u_sse3(lv_16sc_t* re
int code_length_chips = 2046;
int num_out_vectors = 3;
float rem_code_phase_chips = -0.234;
unsigned int n;
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
for(unsigned int n = 0; n < num_out_vectors; n++)
for(n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
}
@ -120,7 +120,7 @@ static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_u_sse3(lv_16sc_t* re
memcpy((lv_16sc_t*)result, (lv_16sc_t*)result_aux[0], sizeof(lv_16sc_t) * num_points);
for(unsigned int n = 0; n < num_out_vectors; n++)
for(n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
@ -137,11 +137,11 @@ static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_u_sse4_1(lv_16sc_t*
int code_length_chips = 2046;
int num_out_vectors = 3;
float rem_code_phase_chips = -0.234;
unsigned int n;
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
for(unsigned int n = 0; n < num_out_vectors; n++)
for(n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
}
@ -150,7 +150,7 @@ static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_u_sse4_1(lv_16sc_t*
memcpy((lv_16sc_t*)result, (lv_16sc_t*)result_aux[0], sizeof(lv_16sc_t) * num_points);
for(unsigned int n = 0; n < num_out_vectors; n++)
for(n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
@ -167,11 +167,11 @@ static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_a_sse4_1(lv_16sc_t*
int code_length_chips = 2046;
int num_out_vectors = 3;
float rem_code_phase_chips = -0.234;
unsigned int n;
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
for(unsigned int n = 0; n < num_out_vectors; n++)
for(n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
}
@ -180,7 +180,7 @@ static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_a_sse4_1(lv_16sc_t*
memcpy((lv_16sc_t*)result, (lv_16sc_t*)result_aux[0], sizeof(lv_16sc_t) * num_points);
for(unsigned int n = 0; n < num_out_vectors; n++)
for(n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
@ -197,11 +197,11 @@ static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_u_avx(lv_16sc_t* res
int code_length_chips = 2046;
int num_out_vectors = 3;
float rem_code_phase_chips = -0.234;
unsigned int n;
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
for(unsigned int n = 0; n < num_out_vectors; n++)
for(n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
}
@ -210,7 +210,7 @@ static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_u_avx(lv_16sc_t* res
memcpy((lv_16sc_t*)result, (lv_16sc_t*)result_aux[0], sizeof(lv_16sc_t) * num_points);
for(unsigned int n = 0; n < num_out_vectors; n++)
for(n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
@ -227,11 +227,11 @@ static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_a_avx(lv_16sc_t* res
int code_length_chips = 2046;
int num_out_vectors = 3;
float rem_code_phase_chips = -0.234;
unsigned int n;
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
for(unsigned int n = 0; n < num_out_vectors; n++)
for(n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
}
@ -240,7 +240,7 @@ static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_a_avx(lv_16sc_t* res
memcpy((lv_16sc_t*)result, (lv_16sc_t*)result_aux[0], sizeof(lv_16sc_t) * num_points);
for(unsigned int n = 0; n < num_out_vectors; n++)
for(n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
@ -257,11 +257,11 @@ static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_neon(lv_16sc_t* resu
int code_length_chips = 2046;
int num_out_vectors = 3;
float rem_code_phase_chips = -0.234;
unsigned int n;
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
for(unsigned int n = 0; n < num_out_vectors; n++)
for(n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
}
@ -270,7 +270,7 @@ static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_neon(lv_16sc_t* resu
memcpy((lv_16sc_t*)result, (lv_16sc_t*)result_aux[0], sizeof(lv_16sc_t) * num_points);
for(unsigned int n = 0; n < num_out_vectors; n++)
for(n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}

View File

@ -139,6 +139,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_generic_reload(lv_16s
static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3(lv_16sc_t* outVector, const lv_16sc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points)
{
const unsigned int sse_iters = num_points / 4;
unsigned int number;
__m128 a, b, two_phase_acc_reg, two_phase_inc_reg;
__m128i c1, c2, result;
__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_inc[2];
@ -151,14 +152,13 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3(lv_16sc_t* out
two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc);
const lv_16sc_t* _in = inVector;
lv_16sc_t* _out = outVector;
__m128 yl, yh, tmp1, tmp2, tmp3;
lv_16sc_t tmp16;
lv_32fc_t tmp32;
for(unsigned int number = 0; number < sse_iters; number++)
for(number = 0; number < sse_iters; number++)
{
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
//complex 32fc multiplication b=a*two_phase_acc_reg
@ -221,7 +221,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3(lv_16sc_t* out
_mm_store_ps((float*)two_phase_acc, two_phase_acc_reg);
(*phase) = two_phase_acc[0];
for (unsigned int i = sse_iters * 4; i < num_points; ++i)
for (number = sse_iters * 4; number < num_points; ++number)
{
tmp16 = *_in++;
tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase);
@ -241,6 +241,8 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3_reload(lv_16sc
{
const unsigned int sse_iters = num_points / 4;
const unsigned int ROTATOR_RELOAD = 512;
unsigned int n;
unsigned int j;
__m128 a, b, two_phase_acc_reg, two_phase_inc_reg;
__m128i c1, c2, result;
__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_inc[2];
@ -260,9 +262,9 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3_reload(lv_16sc
lv_16sc_t tmp16;
lv_32fc_t tmp32;
for (unsigned int n = 0; n < sse_iters / ROTATOR_RELOAD; n++)
for (n = 0; n < sse_iters / ROTATOR_RELOAD; n++)
{
for (unsigned int j = 0; j < ROTATOR_RELOAD; j++)
for (j = 0; j < ROTATOR_RELOAD; j++)
{
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
//complex 32fc multiplication b=a*two_phase_acc_reg
@ -319,7 +321,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3_reload(lv_16sc
two_phase_acc_reg = _mm_div_ps(two_phase_acc_reg, tmp2);
}
for (unsigned int j = 0; j < sse_iters % ROTATOR_RELOAD; j++)
for (j = 0; j < sse_iters % ROTATOR_RELOAD; j++)
{
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
//complex 32fc multiplication b=a*two_phase_acc_reg
@ -372,7 +374,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3_reload(lv_16sc
_mm_store_ps((float*)two_phase_acc, two_phase_acc_reg);
(*phase) = two_phase_acc[0];
for (unsigned int i = sse_iters * 4; i < num_points; ++i)
for (n = sse_iters * 4; n < num_points; ++n)
{
tmp16 = *_in++;
tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase);
@ -391,6 +393,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3_reload(lv_16sc
static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3(lv_16sc_t* outVector, const lv_16sc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points)
{
const unsigned int sse_iters = num_points / 4;
unsigned int number;
__m128 a, b, two_phase_acc_reg, two_phase_inc_reg;
__m128i c1, c2, result;
__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_inc[2];
@ -410,7 +413,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3(lv_16sc_t* out
lv_16sc_t tmp16;
lv_32fc_t tmp32;
for(unsigned int number = 0; number < sse_iters; number++)
for(number = 0; number < sse_iters; number++)
{
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
//complex 32fc multiplication b=a*two_phase_acc_reg
@ -473,7 +476,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3(lv_16sc_t* out
_mm_store_ps((float*)two_phase_acc, two_phase_acc_reg);
(*phase) = two_phase_acc[0];
for (unsigned int i = sse_iters * 4; i < num_points; ++i)
for (number = sse_iters * 4; number < num_points; ++number)
{
tmp16 = *_in++;
tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase);
@ -492,6 +495,8 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3_reload(lv_16sc
{
const unsigned int sse_iters = num_points / 4;
unsigned int ROTATOR_RELOAD = 512;
unsigned int n;
unsigned int j;
__m128 a, b, two_phase_acc_reg, two_phase_inc_reg;
__m128i c1, c2, result;
__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_inc[2];
@ -511,9 +516,9 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3_reload(lv_16sc
lv_16sc_t tmp16;
lv_32fc_t tmp32;
for (unsigned int n = 0; n < sse_iters / ROTATOR_RELOAD; n++)
for (n = 0; n < sse_iters / ROTATOR_RELOAD; n++)
{
for (unsigned int j = 0; j < ROTATOR_RELOAD; j++)
for (j = 0; j < ROTATOR_RELOAD; j++)
{
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
//complex 32fc multiplication b=a*two_phase_acc_reg
@ -570,7 +575,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3_reload(lv_16sc
two_phase_acc_reg = _mm_div_ps(two_phase_acc_reg, tmp2);
}
for (unsigned int j = 0; j < sse_iters % ROTATOR_RELOAD; j++)
for (j = 0; j < sse_iters % ROTATOR_RELOAD; j++)
{
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
//complex 32fc multiplication b=a*two_phase_acc_reg
@ -623,7 +628,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3_reload(lv_16sc
_mm_store_ps((float*)two_phase_acc, two_phase_acc_reg);
(*phase) = two_phase_acc[0];
for (unsigned int i = sse_iters * 4; i < num_points; ++i)
for (n = sse_iters * 4; n < num_points; ++n)
{
tmp16 = *_in++;
tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase);
@ -773,6 +778,8 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_neon_reload(lv_16sc_t
unsigned int i = 0;
const unsigned int neon_iters = num_points / 4;
const unsigned int ROTATOR_RELOAD = 512;
unsigned int n;
unsigned int j;
lv_16sc_t tmp16_;
lv_32fc_t tmp32_;
@ -809,9 +816,9 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_neon_reload(lv_16sc_t
if (neon_iters > 0)
{
for (unsigned int n = 0; n < neon_iters / ROTATOR_RELOAD; n++)
for (n = 0; n < neon_iters / ROTATOR_RELOAD; n++)
{
for (unsigned int j = 0; j < ROTATOR_RELOAD; j++)
for (j = 0; j < ROTATOR_RELOAD; j++)
{
/* load 4 complex numbers (int 16 bits each component) */
tmp16 = vld2_s16((int16_t*)_in);
@ -880,7 +887,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_neon_reload(lv_16sc_t
_phase_imag = vld1q_f32(____phase_imag);
}
for (unsigned int j = 0; j < neon_iters % ROTATOR_RELOAD; j++)
for (j = 0; j < neon_iters % ROTATOR_RELOAD; j++)
{
/* load 4 complex numbers (int 16 bits each component) */
tmp16 = vld2_s16((int16_t*)_in);

View File

@ -69,7 +69,8 @@
static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_generic(lv_16sc_t* result, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points)
{
result[0] = lv_cmake((int16_t)0, (int16_t)0);
for (unsigned int n = 0; n < num_points; n++)
unsigned int n;
for (n = 0; n < num_points; n++)
{
lv_16sc_t tmp = in_a[n] * in_b[n];
result[0] = lv_cmake(sat_adds16i(lv_creal(result[0]), lv_creal(tmp)), sat_adds16i(lv_cimag(result[0]), lv_cimag(tmp) ));
@ -87,7 +88,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_a_sse2(lv_16sc_t* out, con
lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0);
const unsigned int sse_iters = num_points / 4;
unsigned int number;
const lv_16sc_t* _in_a = in_a;
const lv_16sc_t* _in_b = in_b;
lv_16sc_t* _out = out;
@ -103,7 +104,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_a_sse2(lv_16sc_t* out, con
mask_imag = _mm_set_epi8(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0);
mask_real = _mm_set_epi8(0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255);
for(unsigned int number = 0; number < sse_iters; number++)
for(number = 0; number < sse_iters; number++)
{
// a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
a = _mm_load_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
@ -137,13 +138,13 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_a_sse2(lv_16sc_t* out, con
_mm_store_si128((__m128i*)dotProductVector, a); // Store the results back into the dot product vector
for (int i = 0; i < 4; ++i)
for (number = 0; number < 4; ++number)
{
dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[i])), sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i])));
dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])), sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number])));
}
}
for (unsigned int i = 0; i < (num_points % 4); ++i)
for (number = 0; number < (num_points % 4); ++number)
{
lv_16sc_t tmp = (*_in_a++) * (*_in_b++);
dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)), sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp)));
@ -168,6 +169,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_u_sse2(lv_16sc_t* out, con
const lv_16sc_t* _in_b = in_b;
lv_16sc_t* _out = out;
unsigned int i;
unsigned int number;
if (sse_iters > 0)
{
@ -180,7 +182,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_u_sse2(lv_16sc_t* out, con
mask_imag = _mm_set_epi8(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0);
mask_real = _mm_set_epi8(0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255);
for(unsigned int number = 0; number < sse_iters; number++)
for(number = 0; number < sse_iters; number++)
{
//std::complex<T> memory structure: real part -> reinterpret_cast<cv T*>(a)[2*i]
//imaginary part -> reinterpret_cast<cv T*>(a)[2*i + 1]
@ -246,6 +248,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_u_axv2(lv_16sc_t* out, con
const lv_16sc_t* _in_b = in_b;
lv_16sc_t* _out = out;
unsigned int i;
unsigned int number;
if (avx_iters > 0)
{
@ -258,7 +261,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_u_axv2(lv_16sc_t* out, con
mask_imag = _mm256_set_epi8(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0);
mask_real = _mm256_set_epi8(0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255);
for(unsigned int number = 0; number < avx_iters; number++)
for(number = 0; number < avx_iters; number++)
{
a = _mm256_loadu_si256((__m256i*)_in_a);
__builtin_prefetch(_in_a + 16);
@ -322,6 +325,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_a_axv2(lv_16sc_t* out, con
const lv_16sc_t* _in_b = in_b;
lv_16sc_t* _out = out;
unsigned int i;
unsigned int number;
if (avx_iters > 0)
{
@ -334,7 +338,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_a_axv2(lv_16sc_t* out, con
mask_imag = _mm256_set_epi8(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0);
mask_real = _mm256_set_epi8(0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255);
for(unsigned int number = 0; number < avx_iters; number++)
for(number = 0; number < avx_iters; number++)
{
a = _mm256_load_si256((__m256i*)_in_a);
__builtin_prefetch(_in_a + 16);
@ -438,9 +442,9 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_neon(lv_16sc_t* out, const
}
vst2_s16((int16_t*)accum_result, accumulator);
for (unsigned int i = 0; i < 4; ++i)
for (number = 0; number < 4; ++number)
{
dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(accum_result[i])), sat_adds16i(lv_cimag(dotProduct), lv_cimag(accum_result[i])));
dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(accum_result[number])), sat_adds16i(lv_cimag(dotProduct), lv_cimag(accum_result[number])));
}
*out = dotProduct;

View File

@ -70,10 +70,12 @@
static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_generic(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points)
{
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
int n_vec;
unsigned int n;
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
result[n_vec] = lv_cmake(0,0);
for (unsigned int n = 0; n < num_points; n++)
for (n = 0; n < num_points; n++)
{
//r*a.r - i*a.i, i*a.r + r*a.i
//result[n_vec]+=in_common[n]*in_a[n_vec][n];
@ -90,10 +92,12 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_generic(lv_16sc_t* resu
static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_generic_sat(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points)
{
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
int n_vec;
unsigned int n;
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
result[n_vec] = lv_cmake(0,0);
for (unsigned int n = 0; n < num_points; n++)
for (n = 0; n < num_points; n++)
{
lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(in_common[n]), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(in_common[n]), lv_cimag(in_a[n_vec][n]))),
sat_adds16i(sat_muls16i(lv_creal(in_common[n]), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(in_common[n]), lv_creal(in_a[n_vec][n]))));
@ -111,7 +115,8 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_generic_sat(lv_16sc_t*
static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_sse2(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points)
{
lv_16sc_t dotProduct = lv_cmake(0,0);
int n_vec;
unsigned int index;
const unsigned int sse_iters = num_points / 4;
const lv_16sc_t** _in_a = in_a;
@ -125,7 +130,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_sse2(lv_16sc_t* resul
__m128i* realcacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment());
__m128i* imagcacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment());
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
realcacc[n_vec] = _mm_setzero_si128();
imagcacc[n_vec] = _mm_setzero_si128();
@ -136,14 +141,14 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_sse2(lv_16sc_t* resul
mask_imag = _mm_set_epi8(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0);
mask_real = _mm_set_epi8(0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255);
for(unsigned int number = 0; number < sse_iters; number++)
for(index = 0; index < sse_iters; index++)
{
// b[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
b = _mm_load_si128((__m128i*)_in_common); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
__builtin_prefetch(_in_common + 8);
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
a = _mm_load_si128((__m128i*)&(_in_a[n_vec][number*4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
a = _mm_load_si128((__m128i*)&(_in_a[n_vec][index*4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....
@ -160,12 +165,11 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_sse2(lv_16sc_t* resul
realcacc[n_vec] = _mm_adds_epi16(realcacc[n_vec], real);
imagcacc[n_vec] = _mm_adds_epi16(imagcacc[n_vec], imag);
}
_in_common += 4;
}
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
realcacc[n_vec] = _mm_and_si128(realcacc[n_vec], mask_real);
imagcacc[n_vec] = _mm_and_si128(imagcacc[n_vec], mask_imag);
@ -174,10 +178,10 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_sse2(lv_16sc_t* resul
_mm_store_si128((__m128i*)dotProductVector, a); // Store the results back into the dot product vector
dotProduct = lv_cmake(0,0);
for (int i = 0; i < 4; ++i)
for (index = 0; index < 4; ++index)
{
dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[i])),
sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i])));
dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[index])),
sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[index])));
}
_out[n_vec] = dotProduct;
}
@ -185,11 +189,11 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_sse2(lv_16sc_t* resul
volk_gnsssdr_free(imagcacc);
}
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
for(unsigned int n = sse_iters * 4; n < num_points; n++)
for(index = sse_iters * 4; index < num_points; index++)
{
lv_16sc_t tmp = in_common[n] * in_a[n_vec][n];
lv_16sc_t tmp = in_common[index] * in_a[n_vec][index];
_out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)),
sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp)));
@ -205,7 +209,8 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_sse2(lv_16sc_t* resul
static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_sse2(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points)
{
lv_16sc_t dotProduct = lv_cmake(0,0);
int n_vec;
unsigned int index;
const unsigned int sse_iters = num_points / 4;
const lv_16sc_t** _in_a = in_a;
@ -219,7 +224,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_sse2(lv_16sc_t* resul
__m128i* realcacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment());
__m128i* imagcacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment());
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
realcacc[n_vec] = _mm_setzero_si128();
imagcacc[n_vec] = _mm_setzero_si128();
@ -230,14 +235,14 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_sse2(lv_16sc_t* resul
mask_imag = _mm_set_epi8(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0);
mask_real = _mm_set_epi8(0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255);
for(unsigned int number = 0; number < sse_iters; number++)
for(index = 0; index < sse_iters; index++)
{
// b[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
b = _mm_loadu_si128((__m128i*)_in_common); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
__builtin_prefetch(_in_common + 8);
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
a = _mm_loadu_si128((__m128i*)&(_in_a[n_vec][number*4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
a = _mm_loadu_si128((__m128i*)&(_in_a[n_vec][index*4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....
@ -258,7 +263,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_sse2(lv_16sc_t* resul
_in_common += 4;
}
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
realcacc[n_vec] = _mm_and_si128(realcacc[n_vec], mask_real);
imagcacc[n_vec] = _mm_and_si128(imagcacc[n_vec], mask_imag);
@ -267,10 +272,10 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_sse2(lv_16sc_t* resul
_mm_store_si128((__m128i*)dotProductVector, a); // Store the results back into the dot product vector
dotProduct = lv_cmake(0,0);
for (int i = 0; i < 4; ++i)
for (index = 0; index < 4; ++index)
{
dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[i])),
sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i])));
dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[index])),
sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[index])));
}
_out[n_vec] = dotProduct;
}
@ -278,11 +283,11 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_sse2(lv_16sc_t* resul
volk_gnsssdr_free(imagcacc);
}
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
for(unsigned int n = sse_iters * 4; n < num_points; n++)
for(index = sse_iters * 4; index < num_points; index++)
{
lv_16sc_t tmp = in_common[n] * in_a[n_vec][n];
lv_16sc_t tmp = in_common[index] * in_a[n_vec][index];
_out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)),
sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp)));
@ -298,7 +303,8 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_sse2(lv_16sc_t* resul
static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_avx2(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points)
{
lv_16sc_t dotProduct = lv_cmake(0,0);
int n_vec;
unsigned int index;
const unsigned int sse_iters = num_points / 8;
const lv_16sc_t** _in_a = in_a;
@ -312,7 +318,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_avx2(lv_16sc_t* resul
__m256i* realcacc = (__m256i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256i), volk_gnsssdr_get_alignment());
__m256i* imagcacc = (__m256i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256i), volk_gnsssdr_get_alignment());
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
realcacc[n_vec] = _mm256_setzero_si256();
imagcacc[n_vec] = _mm256_setzero_si256();
@ -323,13 +329,13 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_avx2(lv_16sc_t* resul
mask_imag = _mm256_set_epi8(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0);
mask_real = _mm256_set_epi8(0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255);
for(unsigned int number = 0; number < sse_iters; number++)
for(index = 0; index < sse_iters; index++)
{
b = _mm256_load_si256((__m256i*)_in_common);
__builtin_prefetch(_in_common + 16);
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
a = _mm256_load_si256((__m256i*)&(_in_a[n_vec][number*8]));
a = _mm256_load_si256((__m256i*)&(_in_a[n_vec][index*8]));
c = _mm256_mullo_epi16(a, b);
@ -350,7 +356,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_avx2(lv_16sc_t* resul
_in_common += 8;
}
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
realcacc[n_vec] = _mm256_and_si256(realcacc[n_vec], mask_real);
imagcacc[n_vec] = _mm256_and_si256(imagcacc[n_vec], mask_imag);
@ -359,10 +365,10 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_avx2(lv_16sc_t* resul
_mm256_store_si256((__m256i*)dotProductVector, a); // Store the results back into the dot product vector
dotProduct = lv_cmake(0,0);
for (int i = 0; i < 8; ++i)
for (index = 0; index < 8; ++index)
{
dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[i])),
sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i])));
dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[index])),
sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[index])));
}
_out[n_vec] = dotProduct;
}
@ -371,11 +377,11 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_avx2(lv_16sc_t* resul
}
_mm256_zeroupper();
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
for(unsigned int n = sse_iters * 8; n < num_points; n++)
for(index = sse_iters * 8; index < num_points; index++)
{
lv_16sc_t tmp = in_common[n] * in_a[n_vec][n];
lv_16sc_t tmp = in_common[index] * in_a[n_vec][index];
_out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)),
sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp)));
@ -393,7 +399,8 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_avx2(lv_16sc_t* resul
lv_16sc_t dotProduct = lv_cmake(0,0);
const unsigned int sse_iters = num_points / 8;
int n_vec;
unsigned int index;
const lv_16sc_t** _in_a = in_a;
const lv_16sc_t* _in_common = in_common;
lv_16sc_t* _out = result;
@ -405,7 +412,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_avx2(lv_16sc_t* resul
__m256i* realcacc = (__m256i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256i), volk_gnsssdr_get_alignment());
__m256i* imagcacc = (__m256i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256i), volk_gnsssdr_get_alignment());
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
realcacc[n_vec] = _mm256_setzero_si256();
imagcacc[n_vec] = _mm256_setzero_si256();
@ -416,13 +423,13 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_avx2(lv_16sc_t* resul
mask_imag = _mm256_set_epi8(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0);
mask_real = _mm256_set_epi8(0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255);
for(unsigned int number = 0; number < sse_iters; number++)
for(index = 0; index < sse_iters; index++)
{
b = _mm256_loadu_si256((__m256i*)_in_common);
__builtin_prefetch(_in_common + 16);
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
a = _mm256_loadu_si256((__m256i*)&(_in_a[n_vec][number*8]));
a = _mm256_loadu_si256((__m256i*)&(_in_a[n_vec][index*8]));
c = _mm256_mullo_epi16(a, b);
@ -443,7 +450,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_avx2(lv_16sc_t* resul
_in_common += 8;
}
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
realcacc[n_vec] = _mm256_and_si256(realcacc[n_vec], mask_real);
imagcacc[n_vec] = _mm256_and_si256(imagcacc[n_vec], mask_imag);
@ -452,10 +459,10 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_avx2(lv_16sc_t* resul
_mm256_store_si256((__m256i*)dotProductVector, a); // Store the results back into the dot product vector
dotProduct = lv_cmake(0,0);
for (int i = 0; i < 8; ++i)
for (index = 0; index < 8; ++index)
{
dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[i])),
sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i])));
dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[index])),
sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[index])));
}
_out[n_vec] = dotProduct;
}
@ -464,11 +471,11 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_avx2(lv_16sc_t* resul
}
_mm256_zeroupper();
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
for(unsigned int n = sse_iters * 8; n < num_points; n++)
for(index = sse_iters * 8; index < num_points; index++)
{
lv_16sc_t tmp = in_common[n] * in_a[n_vec][n];
lv_16sc_t tmp = in_common[index] * in_a[n_vec][index];
_out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)),
sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp)));
@ -484,7 +491,8 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_avx2(lv_16sc_t* resul
static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points)
{
lv_16sc_t dotProduct = lv_cmake(0,0);
int n_vec;
unsigned int index;
const unsigned int neon_iters = num_points / 4;
const lv_16sc_t** _in_a = in_a;
@ -501,20 +509,20 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon(lv_16sc_t* result,
int16x4x2_t tmp_real, tmp_imag;
for(int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for(n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
accumulator[n_vec].val[0] = vdup_n_s16(0);
accumulator[n_vec].val[1] = vdup_n_s16(0);
}
for(unsigned int number = 0; number < neon_iters; number++)
for(index = 0; index < neon_iters; index++)
{
b_val = vld2_s16((int16_t*)_in_common); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
__builtin_prefetch(_in_common + 8);
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
a_val = vld2_s16((int16_t*)&(_in_a[n_vec][number*4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
//__builtin_prefetch(&_in_a[n_vec][number*4] + 8);
a_val = vld2_s16((int16_t*)&(_in_a[n_vec][index*4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
//__builtin_prefetch(&_in_a[n_vec][index*4] + 8);
// multiply the real*real and imag*imag to get real result
// a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
@ -537,25 +545,25 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon(lv_16sc_t* result,
_in_common += 4;
}
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
vst2_s16((int16_t*)dotProductVector, accumulator[n_vec]); // Store the results back into the dot product vector
dotProduct = lv_cmake(0,0);
for (int i = 0; i < 4; ++i)
for (index = 0; index < 4; ++index)
{
dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[i])),
sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i])));
dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[index])),
sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[index])));
}
_out[n_vec] = dotProduct;
}
volk_gnsssdr_free(accumulator);
}
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
for(unsigned int n = neon_iters * 4; n < num_points; n++)
for(index = neon_iters * 4; index < num_points; index++)
{
lv_16sc_t tmp = in_common[n] * in_a[n_vec][n];
lv_16sc_t tmp = in_common[index] * in_a[n_vec][index];
_out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)),
sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp)));
@ -573,7 +581,8 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_vma(lv_16sc_t* res
lv_16sc_t dotProduct = lv_cmake(0,0);
const unsigned int neon_iters = num_points / 4;
int n_vec;
unsigned int index;
const lv_16sc_t** _in_a = in_a;
const lv_16sc_t* _in_common = in_common;
lv_16sc_t* _out = result;
@ -586,19 +595,19 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_vma(lv_16sc_t* res
int16x4x2_t* accumulator = (int16x4x2_t*)volk_gnsssdr_malloc(num_a_vectors * sizeof(int16x4x2_t), volk_gnsssdr_get_alignment());
for(int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for(n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
accumulator[n_vec].val[0] = vdup_n_s16(0);
accumulator[n_vec].val[1] = vdup_n_s16(0);
}
for(unsigned int number = 0; number < neon_iters; number++)
for(index = 0; index < neon_iters; index++)
{
b_val = vld2_s16((int16_t*)_in_common); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
__builtin_prefetch(_in_common + 8);
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
a_val = vld2_s16((int16_t*)&(_in_a[n_vec][number*4]));
a_val = vld2_s16((int16_t*)&(_in_a[n_vec][index*4]));
tmp.val[0] = vmul_s16(a_val.val[0], b_val.val[0]);
tmp.val[1] = vmul_s16(a_val.val[1], b_val.val[0]);
@ -613,25 +622,25 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_vma(lv_16sc_t* res
_in_common += 4;
}
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
vst2_s16((int16_t*)dotProductVector, accumulator[n_vec]); // Store the results back into the dot product vector
dotProduct = lv_cmake(0,0);
for (int i = 0; i < 4; ++i)
for (index = 0; index < 4; ++index)
{
dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[i])),
sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i])));
dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[index])),
sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[index])));
}
_out[n_vec] = dotProduct;
}
volk_gnsssdr_free(accumulator);
}
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
for(unsigned int n = neon_iters * 4; n < num_points; n++)
for(index = neon_iters * 4; index < num_points; index++)
{
lv_16sc_t tmp = in_common[n] * in_a[n_vec][n];
lv_16sc_t tmp = in_common[index] * in_a[n_vec][index];
_out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)),
sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp)));
@ -649,7 +658,8 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_optvma(lv_16sc_t*
lv_16sc_t dotProduct = lv_cmake(0,0);
const unsigned int neon_iters = num_points / 4;
int n_vec;
unsigned int index;
const lv_16sc_t** _in_a = in_a;
const lv_16sc_t* _in_common = in_common;
lv_16sc_t* _out = result;
@ -663,7 +673,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_optvma(lv_16sc_t*
int16x4x2_t* accumulator1 = (int16x4x2_t*)volk_gnsssdr_malloc(num_a_vectors * sizeof(int16x4x2_t), volk_gnsssdr_get_alignment());
int16x4x2_t* accumulator2 = (int16x4x2_t*)volk_gnsssdr_malloc(num_a_vectors * sizeof(int16x4x2_t), volk_gnsssdr_get_alignment());
for(int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for(n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
accumulator1[n_vec].val[0] = vdup_n_s16(0);
accumulator1[n_vec].val[1] = vdup_n_s16(0);
@ -671,13 +681,13 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_optvma(lv_16sc_t*
accumulator2[n_vec].val[1] = vdup_n_s16(0);
}
for(unsigned int number = 0; number < neon_iters; number++)
for(index = 0; index < neon_iters; index++)
{
b_val = vld2_s16((int16_t*)_in_common); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
__builtin_prefetch(_in_common + 8);
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
a_val = vld2_s16((int16_t*)&(_in_a[n_vec][number*4]));
a_val = vld2_s16((int16_t*)&(_in_a[n_vec][index*4]));
accumulator1[n_vec].val[0] = vmla_s16(accumulator1[n_vec].val[0], a_val.val[0], b_val.val[0]);
accumulator1[n_vec].val[1] = vmla_s16(accumulator1[n_vec].val[1], a_val.val[0], b_val.val[1]);
@ -687,20 +697,20 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_optvma(lv_16sc_t*
_in_common += 4;
}
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
accumulator1[n_vec].val[0] = vqadd_s16(accumulator1[n_vec].val[0], accumulator2[n_vec].val[0]);
accumulator1[n_vec].val[1] = vqadd_s16(accumulator1[n_vec].val[1], accumulator2[n_vec].val[1]);
}
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
vst2_s16((int16_t*)dotProductVector, accumulator1[n_vec]); // Store the results back into the dot product vector
dotProduct = lv_cmake(0,0);
for (int i = 0; i < 4; ++i)
for (index = 0; index < 4; ++index)
{
dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[i])),
sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i])));
dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[index])),
sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[index])));
}
_out[n_vec] = dotProduct;
}
@ -708,11 +718,11 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_optvma(lv_16sc_t*
volk_gnsssdr_free(accumulator2);
}
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
for(unsigned int n = neon_iters * 4; n < num_points; n++)
for(index = neon_iters * 4; index < num_points; index++)
{
lv_16sc_t tmp = in_common[n] * in_a[n_vec][n];
lv_16sc_t tmp = in_common[index] * in_a[n_vec][index];
_out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)),
sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp)));

View File

@ -46,7 +46,8 @@ static inline void volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_generic(lv_16sc_t*
{
int num_a_vectors = 3;
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
for(unsigned int n = 0; n < num_a_vectors; n++)
unsigned int n;
for(n = 0; n < num_a_vectors; n++)
{
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
@ -54,7 +55,7 @@ static inline void volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_generic(lv_16sc_t*
volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_generic(result, local_code, (const lv_16sc_t**) in_a, num_a_vectors, num_points);
for(unsigned int n = 0; n < num_a_vectors; n++)
for(n = 0; n < num_a_vectors; n++)
{
volk_gnsssdr_free(in_a[n]);
}
@ -69,7 +70,8 @@ static inline void volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_generic_sat(lv_16sc
{
int num_a_vectors = 3;
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
for(unsigned int n = 0; n < num_a_vectors; n++)
unsigned int n;
for(n = 0; n < num_a_vectors; n++)
{
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
@ -77,7 +79,7 @@ static inline void volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_generic_sat(lv_16sc
volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_generic_sat(result, local_code, (const lv_16sc_t**) in_a, num_a_vectors, num_points);
for(unsigned int n = 0; n < num_a_vectors; n++)
for(n = 0; n < num_a_vectors; n++)
{
volk_gnsssdr_free(in_a[n]);
}
@ -92,7 +94,8 @@ static inline void volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_a_sse2(lv_16sc_t* r
{
int num_a_vectors = 3;
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
for(unsigned int n = 0; n < num_a_vectors; n++)
unsigned int n;
for(n = 0; n < num_a_vectors; n++)
{
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
@ -100,7 +103,7 @@ static inline void volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_a_sse2(lv_16sc_t* r
volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_sse2(result, local_code, (const lv_16sc_t**) in_a, num_a_vectors, num_points);
for(unsigned int n = 0; n < num_a_vectors; n++)
for(n = 0; n < num_a_vectors; n++)
{
volk_gnsssdr_free(in_a[n]);
}
@ -116,7 +119,8 @@ static inline void volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_u_sse2(lv_16sc_t* r
{
int num_a_vectors = 3;
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
for(unsigned int n = 0; n < num_a_vectors; n++)
unsigned int n;
for(n = 0; n < num_a_vectors; n++)
{
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t)*num_points, volk_gnsssdr_get_alignment());
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t)*num_points);
@ -124,7 +128,7 @@ static inline void volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_u_sse2(lv_16sc_t* r
volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_sse2(result, local_code, (const lv_16sc_t**) in_a, num_a_vectors, num_points);
for(unsigned int n = 0; n < num_a_vectors; n++)
for(n = 0; n < num_a_vectors; n++)
{
volk_gnsssdr_free(in_a[n]);
}
@ -140,7 +144,8 @@ static inline void volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_a_avx2(lv_16sc_t* r
{
int num_a_vectors = 3;
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
for(unsigned int n = 0; n < num_a_vectors; n++)
unsigned int n;
for(n = 0; n < num_a_vectors; n++)
{
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t)*num_points, volk_gnsssdr_get_alignment());
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t)*num_points);
@ -148,7 +153,7 @@ static inline void volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_a_avx2(lv_16sc_t* r
volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_avx2(result, local_code, (const lv_16sc_t**) in_a, num_a_vectors, num_points);
for(unsigned int n = 0; n < num_a_vectors; n++)
for(n = 0; n < num_a_vectors; n++)
{
volk_gnsssdr_free(in_a[n]);
}
@ -164,7 +169,8 @@ static inline void volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_u_avx2(lv_16sc_t* r
{
int num_a_vectors = 3;
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
for(unsigned int n = 0; n < num_a_vectors; n++)
unsigned int n;
for(n = 0; n < num_a_vectors; n++)
{
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t)*num_points, volk_gnsssdr_get_alignment());
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t)*num_points);
@ -172,7 +178,7 @@ static inline void volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_u_avx2(lv_16sc_t* r
volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_avx2(result, local_code, (const lv_16sc_t**) in_a, num_a_vectors, num_points);
for(unsigned int n = 0; n < num_a_vectors; n++)
for(n = 0; n < num_a_vectors; n++)
{
volk_gnsssdr_free(in_a[n]);
}
@ -188,7 +194,8 @@ static inline void volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_neon(lv_16sc_t* res
{
int num_a_vectors = 3;
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
for(unsigned int n = 0; n < num_a_vectors; n++)
unsigned int n;
for(n = 0; n < num_a_vectors; n++)
{
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t)*num_points, volk_gnsssdr_get_alignment());
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t)*num_points);
@ -196,7 +203,7 @@ static inline void volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_neon(lv_16sc_t* res
volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon(result, local_code, (const lv_16sc_t**) in_a, num_a_vectors, num_points);
for(unsigned int n = 0; n < num_a_vectors; n++)
for(n = 0; n < num_a_vectors; n++)
{
volk_gnsssdr_free(in_a[n]);
}
@ -212,7 +219,8 @@ static inline void volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_neon_vma(lv_16sc_t*
{
int num_a_vectors = 3;
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
for(unsigned int n = 0; n < num_a_vectors; n++)
unsigned int n;
for(n = 0; n < num_a_vectors; n++)
{
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t)*num_points, volk_gnsssdr_get_alignment());
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t)*num_points);
@ -220,7 +228,7 @@ static inline void volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_neon_vma(lv_16sc_t*
volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_vma(result, local_code, (const lv_16sc_t**) in_a, num_a_vectors, num_points);
for(unsigned int n = 0; n < num_a_vectors; n++)
for(n = 0; n < num_a_vectors; n++)
{
volk_gnsssdr_free(in_a[n]);
}
@ -235,7 +243,8 @@ static inline void volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_neon_optvma(lv_16sc
{
int num_a_vectors = 3;
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
for(unsigned int n = 0; n < num_a_vectors; n++)
unsigned int n;
for(n = 0; n < num_a_vectors; n++)
{
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t)*num_points, volk_gnsssdr_get_alignment());
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t)*num_points);
@ -243,7 +252,7 @@ static inline void volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_neon_optvma(lv_16sc
volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_optvma(result, local_code, (const lv_16sc_t**) in_a, num_a_vectors, num_points);
for(unsigned int n = 0; n < num_a_vectors; n++)
for(n = 0; n < num_a_vectors; n++)
{
volk_gnsssdr_free(in_a[n]);
}

View File

@ -65,7 +65,8 @@
static inline void volk_gnsssdr_16ic_x2_multiply_16ic_generic(lv_16sc_t* result, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points)
{
for (unsigned int n = 0; n < num_points; n++)
unsigned int n;
for (n = 0; n < num_points; n++)
{
//r*a.r - i*a.i, i*a.r + r*a.i
result[n] = in_a[n] * in_b[n];
@ -81,6 +82,7 @@ static inline void volk_gnsssdr_16ic_x2_multiply_16ic_generic(lv_16sc_t* result,
static inline void volk_gnsssdr_16ic_x2_multiply_16ic_a_sse2(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points)
{
const unsigned int sse_iters = num_points / 4;
unsigned int number;
__m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, result;
mask_imag = _mm_set_epi8(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0);
@ -89,7 +91,7 @@ static inline void volk_gnsssdr_16ic_x2_multiply_16ic_a_sse2(lv_16sc_t* out, con
const lv_16sc_t* _in_a = in_a;
const lv_16sc_t* _in_b = in_b;
lv_16sc_t* _out = out;
for(unsigned int number = 0; number < sse_iters; number++)
for(number = 0; number < sse_iters; number++)
{
//std::complex<T> memory structure: real part -> reinterpret_cast<cv T*>(a)[2*i]
//imaginary part -> reinterpret_cast<cv T*>(a)[2*i + 1]
@ -120,7 +122,7 @@ static inline void volk_gnsssdr_16ic_x2_multiply_16ic_a_sse2(lv_16sc_t* out, con
_out += 4;
}
for (unsigned int i = sse_iters * 4; i < num_points; ++i)
for (number = sse_iters * 4; number < num_points; ++number)
{
*_out++ = (*_in_a++) * (*_in_b++);
}
@ -134,6 +136,7 @@ static inline void volk_gnsssdr_16ic_x2_multiply_16ic_a_sse2(lv_16sc_t* out, con
static inline void volk_gnsssdr_16ic_x2_multiply_16ic_u_sse2(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points)
{
const unsigned int sse_iters = num_points / 4;
unsigned int number;
__m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1,imag2, b_sl, a_sl, result;
mask_imag = _mm_set_epi8(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0);
@ -142,7 +145,7 @@ static inline void volk_gnsssdr_16ic_x2_multiply_16ic_u_sse2(lv_16sc_t* out, con
const lv_16sc_t* _in_a = in_a;
const lv_16sc_t* _in_b = in_b;
lv_16sc_t* _out = out;
for(unsigned int number = 0; number < sse_iters; number++)
for(number = 0; number < sse_iters; number++)
{
//std::complex<T> memory structure: real part -> reinterpret_cast<cv T*>(a)[2*i]
//imaginary part -> reinterpret_cast<cv T*>(a)[2*i + 1]
@ -173,7 +176,7 @@ static inline void volk_gnsssdr_16ic_x2_multiply_16ic_u_sse2(lv_16sc_t* out, con
_out += 4;
}
for (unsigned int i = sse_iters * 4; i < num_points; ++i)
for (number = sse_iters * 4; number < num_points; ++number)
{
*_out++ = (*_in_a++) * (*_in_b++);
}

View File

@ -82,11 +82,13 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_generic(lv_16sc
{
lv_16sc_t tmp16;
lv_32fc_t tmp32;
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
int n_vec;
unsigned int n;
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
result[n_vec] = lv_cmake(0,0);
}
for (unsigned int n = 0; n < num_points; n++)
for (n = 0; n < num_points; n++)
{
tmp16 = *in_common++; //if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase));
tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase);
@ -105,7 +107,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_generic(lv_16sc
}
(*phase) *= phase_inc;
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
lv_16sc_t tmp = tmp16 * in_a[n_vec][n];
//lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(tmp16), lv_cimag(in_a[n_vec][n]))) , sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(tmp16), lv_creal(in_a[n_vec][n]))));
@ -123,21 +125,24 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_generic_reload(
{
lv_16sc_t tmp16;
lv_32fc_t tmp32;
int n_vec;
unsigned int n;
unsigned int j;
const unsigned int ROTATOR_RELOAD = 256;
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
result[n_vec] = lv_cmake(0,0);
}
for (unsigned int n = 0; n < num_points / ROTATOR_RELOAD; n++)
for (n = 0; n < num_points / ROTATOR_RELOAD; n++)
{
for (unsigned int j = 0; j < ROTATOR_RELOAD; j++)
for (j = 0; j < ROTATOR_RELOAD; j++)
{
tmp16 = *in_common++; //if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase));
tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase);
tmp16 = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32)));
(*phase) *= phase_inc;
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
lv_16sc_t tmp = tmp16 * in_a[n_vec][n * ROTATOR_RELOAD + j];
//lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(tmp16), lv_cimag(in_a[n_vec][n]))) , sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(tmp16), lv_creal(in_a[n_vec][n]))));
@ -153,13 +158,13 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_generic_reload(
#endif
}
for (unsigned int j = 0; j < num_points % ROTATOR_RELOAD; j++)
for (j = 0; j < num_points % ROTATOR_RELOAD; j++)
{
tmp16 = *in_common++; //if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase));
tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase);
tmp16 = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32)));
(*phase) *= phase_inc;
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
lv_16sc_t tmp = tmp16 * in_a[n_vec][ (num_points / ROTATOR_RELOAD) * ROTATOR_RELOAD + j ];
//lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(tmp16), lv_cimag(in_a[n_vec][n]))) , sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(tmp16), lv_creal(in_a[n_vec][n]))));
@ -179,7 +184,10 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc_
lv_16sc_t dotProduct = lv_cmake(0,0);
const unsigned int sse_iters = num_points / 4;
int n_vec;
int i;
unsigned int number;
unsigned int n;
const lv_16sc_t** _in_a = in_a;
const lv_16sc_t* _in_common = in_common;
lv_16sc_t* _out = result;
@ -189,7 +197,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc_
__m128i* realcacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment());
__m128i* imagcacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment());
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
realcacc[n_vec] = _mm_setzero_si128();
imagcacc[n_vec] = _mm_setzero_si128();
@ -215,7 +223,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc_
lv_16sc_t tmp16;
lv_32fc_t tmp32;
for(unsigned int number = 0; number < sse_iters; number++)
for(number = 0; number < sse_iters; number++)
{
// Phase rotation on operand in_common starts here:
//printf("generic phase %i: %f,%f\n", n*4,lv_creal(*phase),lv_cimag(*phase));
@ -264,7 +272,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc_
//next two samples
_in_common += 2;
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
a = _mm_load_si128((__m128i*)&(_in_a[n_vec][number*4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
@ -295,7 +303,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc_
}
}
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
realcacc[n_vec] = _mm_and_si128(realcacc[n_vec], mask_real);
imagcacc[n_vec] = _mm_and_si128(imagcacc[n_vec], mask_imag);
@ -304,7 +312,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc_
_mm_store_si128((__m128i*)dotProductVector, a); // Store the results back into the dot product vector
dotProduct = lv_cmake(0,0);
for (int i = 0; i < 4; ++i)
for (i = 0; i < 4; ++i)
{
dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[i])),
sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i])));
@ -324,14 +332,14 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc_
//(*phase) = lv_cmake((float*)two_phase_acc[0], (float*)two_phase_acc[1]);
(*phase) = two_phase_acc[0];
for(unsigned int n = sse_iters * 4; n < num_points; n++)
for(n = sse_iters * 4; n < num_points; n++)
{
tmp16 = in_common[n]; //printf("a_sse phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase));
tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase);
tmp16 = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32)));
(*phase) *= phase_inc;
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
lv_16sc_t tmp = tmp16 * in_a[n_vec][n];
//lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(tmp16), lv_cimag(in_a[n_vec][n]))) , sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(tmp16), lv_creal(in_a[n_vec][n]))));
@ -352,6 +360,11 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(l
const unsigned int sse_iters = num_points / 4;
const unsigned int ROTATOR_RELOAD = 128;
int n_vec;
int i;
unsigned int number;
unsigned int j;
unsigned int n;
const lv_16sc_t** _in_a = in_a;
const lv_16sc_t* _in_common = in_common;
@ -362,7 +375,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(l
__m128i* realcacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment());
__m128i* imagcacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment());
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
realcacc[n_vec] = _mm_setzero_si128();
imagcacc[n_vec] = _mm_setzero_si128();
@ -388,9 +401,9 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(l
lv_16sc_t tmp16;
lv_32fc_t tmp32;
for (unsigned int number = 0; number < sse_iters / ROTATOR_RELOAD; ++number)
for (number = 0; number < sse_iters / ROTATOR_RELOAD; ++number)
{
for (unsigned int j = 0; j < ROTATOR_RELOAD; j++)
for (j = 0; j < ROTATOR_RELOAD; j++)
{
// Phase rotation on operand in_common starts here:
//printf("generic phase %i: %f,%f\n", n*4,lv_creal(*phase),lv_cimag(*phase));
@ -439,7 +452,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(l
//next two samples
_in_common += 2;
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
a = _mm_load_si128((__m128i*)&(_in_a[n_vec][(number * ROTATOR_RELOAD + j) * 4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
@ -468,7 +481,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(l
two_phase_acc_reg = _mm_div_ps(two_phase_acc_reg, tmp2);
}
for (unsigned int j = 0; j < sse_iters % ROTATOR_RELOAD; j++)
for (j = 0; j < sse_iters % ROTATOR_RELOAD; j++)
{
pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
//complex 32fc multiplication b=a*two_phase_acc_reg
@ -515,7 +528,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(l
//next two samples
_in_common += 2;
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
a = _mm_load_si128((__m128i*)&(_in_a[n_vec][((sse_iters / ROTATOR_RELOAD) * ROTATOR_RELOAD + j) * 4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
@ -537,7 +550,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(l
}
}
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
realcacc[n_vec] = _mm_and_si128(realcacc[n_vec], mask_real);
imagcacc[n_vec] = _mm_and_si128(imagcacc[n_vec], mask_imag);
@ -546,7 +559,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(l
_mm_store_si128((__m128i*)dotProductVector, a); // Store the results back into the dot product vector
dotProduct = lv_cmake(0,0);
for (int i = 0; i < 4; ++i)
for (i = 0; i < 4; ++i)
{
dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[i])),
sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i])));
@ -567,14 +580,14 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(l
//(*phase) = lv_cmake((float*)two_phase_acc[0], (float*)two_phase_acc[1]);
(*phase) = two_phase_acc[0];
for(unsigned int n = sse_iters * 4; n < num_points; n++)
for(n = sse_iters * 4; n < num_points; n++)
{
tmp16 = in_common[n]; //printf("a_sse phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase));
tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase);
tmp16 = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32)));
(*phase) *= phase_inc;
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
lv_16sc_t tmp = tmp16 * in_a[n_vec][n];
//lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(tmp16), lv_cimag(in_a[n_vec][n]))) , sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(tmp16), lv_creal(in_a[n_vec][n]))));
@ -595,7 +608,10 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc_
lv_16sc_t dotProduct = lv_cmake(0,0);
const unsigned int sse_iters = num_points / 4;
int n_vec;
unsigned int number;
unsigned int j;
unsigned int n;
const lv_16sc_t** _in_a = in_a;
const lv_16sc_t* _in_common = in_common;
@ -605,7 +621,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc_
__m128i* realcacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment());
__m128i* imagcacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment());
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
realcacc[n_vec] = _mm_setzero_si128();
imagcacc[n_vec] = _mm_setzero_si128();
@ -631,7 +647,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc_
lv_16sc_t tmp16;
lv_32fc_t tmp32;
for(unsigned int number = 0; number < sse_iters; number++)
for(number = 0; number < sse_iters; number++)
{
// Phase rotation on operand in_common starts here:
pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
@ -680,7 +696,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc_
//next two samples
_in_common += 2;
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
a = _mm_loadu_si128((__m128i*)&(_in_a[n_vec][number*4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
@ -711,7 +727,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc_
}
}
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
realcacc[n_vec] = _mm_and_si128(realcacc[n_vec], mask_real);
imagcacc[n_vec] = _mm_and_si128(imagcacc[n_vec], mask_imag);
@ -720,10 +736,10 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc_
_mm_store_si128((__m128i*)dotProductVector, a); // Store the results back into the dot product vector
dotProduct = lv_cmake(0,0);
for (int i = 0; i < 4; ++i)
for (j = 0; j < 4; ++j)
{
dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[i])),
sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i])));
dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[j])),
sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[j])));
}
_out[n_vec] = dotProduct;
}
@ -733,13 +749,13 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc_
_mm_store_ps((float*)two_phase_acc, two_phase_acc_reg);
(*phase) = two_phase_acc[0];
for(unsigned int n = sse_iters * 4; n < num_points; n++)
for(n = sse_iters * 4; n < num_points; n++)
{
tmp16 = in_common[n];
tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase);
tmp16 = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32)));
(*phase) *= phase_inc;
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
lv_16sc_t tmp = tmp16 * in_a[n_vec][n];
_out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)),
@ -759,6 +775,9 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc_
const lv_16sc_t** _in_a = in_a;
const lv_16sc_t* _in_common = in_common;
lv_16sc_t* _out = result;
int n_vec;
unsigned int number;
unsigned int n;
lv_16sc_t tmp16;
lv_32fc_t tmp32;
@ -769,7 +788,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc_
__m256i* realcacc = (__m256i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256i), volk_gnsssdr_get_alignment());
__m256i* imagcacc = (__m256i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256i), volk_gnsssdr_get_alignment());
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
realcacc[n_vec] = _mm256_setzero_si256();
imagcacc[n_vec] = _mm256_setzero_si256();
@ -793,7 +812,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc_
__m128 yl, yh, tmp1, tmp2, tmp3;
for(unsigned int number = 0; number < avx2_iters; number++)
for(number = 0; number < avx2_iters; number++)
{
a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
//complex 32fc multiplication b=a*two_phase_acc_reg
@ -880,7 +899,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc_
result2 = _mm_packs_epi32(c1, c2);// convert from 32ic to 16ic
_in_common += 2;
b2 = _mm256_insertf128_si256(_mm256_castsi128_si256(result1), (result2), 1);
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
a2 = _mm256_load_si256((__m256i*)&(_in_a[n_vec][number * 8]));
@ -911,7 +930,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc_
}
}
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
realcacc[n_vec] = _mm256_and_si256(realcacc[n_vec], mask_real);
imagcacc[n_vec] = _mm256_and_si256(imagcacc[n_vec], mask_imag);
@ -920,10 +939,10 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc_
_mm256_store_si256((__m256i*)dotProductVector, a2); // Store the results back into the dot product vector
dotProduct = lv_cmake(0,0);
for (int i = 0; i < 8; ++i)
for (number = 0; number < 8; ++number)
{
dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[i])),
sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i])));
dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])),
sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number])));
}
_out[n_vec] = dotProduct;
}
@ -935,13 +954,13 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc_
_mm_store_ps((float*)two_phase_acc, two_phase_acc_reg);
(*phase) = two_phase_acc[0];
for(unsigned int n = avx2_iters * 8; n < num_points; n++)
for(n = avx2_iters * 8; n < num_points; n++)
{
tmp16 = in_common[n];
tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase);
tmp16 = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32)));
(*phase) *= phase_inc;
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
lv_16sc_t tmp = tmp16 * in_a[n_vec][n];
_out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)),
@ -964,7 +983,10 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l
const lv_16sc_t** _in_a = in_a;
const lv_16sc_t* _in_common = in_common;
lv_16sc_t* _out = result;
int n_vec;
unsigned int number;
unsigned int n;
unsigned int j;
lv_16sc_t tmp16;
lv_32fc_t tmp32;
@ -974,7 +996,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l
__m256i* realcacc = (__m256i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256i), volk_gnsssdr_get_alignment());
__m256i* imagcacc = (__m256i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256i), volk_gnsssdr_get_alignment());
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
realcacc[n_vec] = _mm256_setzero_si256();
imagcacc[n_vec] = _mm256_setzero_si256();
@ -998,9 +1020,9 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l
__m128 yl, yh, tmp1, tmp2, tmp3;
for (unsigned int number = 0; number < avx2_iters / ROTATOR_RELOAD; ++number)
for (number = 0; number < avx2_iters / ROTATOR_RELOAD; ++number)
{
for (unsigned int j = 0; j < ROTATOR_RELOAD; j++)
for (j = 0; j < ROTATOR_RELOAD; j++)
{
a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
//complex 32fc multiplication b=a*two_phase_acc_reg
@ -1087,7 +1109,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l
result2 = _mm_packs_epi32(c1, c2);// convert from 32ic to 16ic
_in_common += 2;
b2 = _mm256_insertf128_si256(_mm256_castsi128_si256(result1), (result2), 1);
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
a2 = _mm256_load_si256((__m256i*)&(_in_a[n_vec][(number * ROTATOR_RELOAD + j) * 8]));
@ -1116,7 +1138,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l
two_phase_acc_reg = _mm_div_ps(two_phase_acc_reg, tmp2);
}
for (unsigned int j = 0; j < avx2_iters % ROTATOR_RELOAD; j++)
for (j = 0; j < avx2_iters % ROTATOR_RELOAD; j++)
{
a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
//complex 32fc multiplication b=a*two_phase_acc_reg
@ -1203,7 +1225,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l
result2 = _mm_packs_epi32(c1, c2);// convert from 32ic to 16ic
_in_common += 2;
b2 = _mm256_insertf128_si256(_mm256_castsi128_si256(result1), (result2), 1);
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
a2 = _mm256_load_si256((__m256i*)&(_in_a[n_vec][((avx2_iters / ROTATOR_RELOAD) * ROTATOR_RELOAD + j) * 8]));
@ -1225,7 +1247,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l
}
}
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
realcacc[n_vec] = _mm256_and_si256(realcacc[n_vec], mask_real);
imagcacc[n_vec] = _mm256_and_si256(imagcacc[n_vec], mask_imag);
@ -1234,10 +1256,10 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l
_mm256_store_si256((__m256i*)dotProductVector, a2); // Store the results back into the dot product vector
dotProduct = lv_cmake(0,0);
for (int i = 0; i < 8; ++i)
for (j = 0; j < 8; ++j)
{
dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[i])),
sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i])));
dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[j])),
sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[j])));
}
_out[n_vec] = dotProduct;
}
@ -1248,13 +1270,13 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l
_mm_store_ps((float*)two_phase_acc, two_phase_acc_reg);
(*phase) = two_phase_acc[0];
for(unsigned int n = avx2_iters * 8; n < num_points; n++)
for(n = avx2_iters * 8; n < num_points; n++)
{
tmp16 = in_common[n];
tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase);
tmp16 = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32)));
(*phase) *= phase_inc;
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
lv_16sc_t tmp = tmp16 * in_a[n_vec][n];
_out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)),
@ -1276,7 +1298,10 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon(lv_16sc_t*
const lv_16sc_t** _in_a = in_a;
const lv_16sc_t* _in_common = in_common;
lv_16sc_t* _out = result;
int n_vec;
int i;
unsigned int number;
unsigned int n;
lv_16sc_t tmp16_, tmp;
lv_32fc_t tmp32_;
@ -1315,13 +1340,13 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon(lv_16sc_t*
int16x4x2_t* accumulator = (int16x4x2_t*)volk_gnsssdr_malloc(num_a_vectors * sizeof(int16x4x2_t), volk_gnsssdr_get_alignment());
for(int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for(n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
accumulator[n_vec].val[0] = vdup_n_s16(0);
accumulator[n_vec].val[1] = vdup_n_s16(0);
}
for(unsigned int number = 0; number < neon_iters; number++)
for(number = 0; number < neon_iters; number++)
{
/* load 4 complex numbers (int 16 bits each component) */
tmp16 = vld2_s16((int16_t*)_in_common);
@ -1370,7 +1395,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon(lv_16sc_t*
_phase_real = vsubq_f32(tmp32_real.val[0], tmp32_real.val[1]);
_phase_imag = vaddq_f32(tmp32_imag.val[0], tmp32_imag.val[1]);
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
a_val = vld2_s16((int16_t*)&(_in_a[n_vec][number*4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
//__builtin_prefetch(&_in_a[n_vec][number*4] + 8);
@ -1410,11 +1435,11 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon(lv_16sc_t*
}
}
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
vst2_s16((int16_t*)dotProductVector, accumulator[n_vec]); // Store the results back into the dot product vector
dotProduct = lv_cmake(0,0);
for (int i = 0; i < 4; ++i)
for (i = 0; i < 4; ++i)
{
dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[i])),
sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i])));
@ -1428,13 +1453,13 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon(lv_16sc_t*
(*phase) = lv_cmake((float32_t)__phase_real[0], (float32_t)__phase_imag[0]);
}
for (unsigned int n = neon_iters * 4; n < num_points; n++)
for (n = neon_iters * 4; n < num_points; n++)
{
tmp16_ = in_common[n]; //printf("neon phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase));
tmp32_ = lv_cmake((float32_t)lv_creal(tmp16_), (float32_t)lv_cimag(tmp16_)) * (*phase);
tmp16_ = lv_cmake((int16_t)rintf(lv_creal(tmp32_)), (int16_t)rintf(lv_cimag(tmp32_)));
(*phase) *= phase_inc;
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
tmp = tmp16_ * in_a[n_vec][n];
_out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp)));
@ -1456,7 +1481,10 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_vma(lv_16s
const lv_16sc_t** _in_a = in_a;
const lv_16sc_t* _in_common = in_common;
lv_16sc_t* _out = result;
int n_vec;
int i;
unsigned int number;
unsigned int n;
lv_16sc_t tmp16_, tmp;
lv_32fc_t tmp32_;
@ -1495,13 +1523,13 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_vma(lv_16s
int16x4x2_t* accumulator = (int16x4x2_t*)volk_gnsssdr_malloc(num_a_vectors * sizeof(int16x4x2_t), volk_gnsssdr_get_alignment());
for(int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for(n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
accumulator[n_vec].val[0] = vdup_n_s16(0);
accumulator[n_vec].val[1] = vdup_n_s16(0);
}
for(unsigned int number = 0; number < neon_iters; number++)
for(number = 0; number < neon_iters; number++)
{
/* load 4 complex numbers (int 16 bits each component) */
tmp16 = vld2_s16((int16_t*)_in_common);
@ -1581,7 +1609,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_vma(lv_16s
}
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
a_val = vld2_s16((int16_t*)&(_in_a[n_vec][number*4]));
@ -1597,11 +1625,11 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_vma(lv_16s
}
}
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
vst2_s16((int16_t*)dotProductVector, accumulator[n_vec]); // Store the results back into the dot product vector
dotProduct = lv_cmake(0,0);
for (int i = 0; i < 4; ++i)
for (i = 0; i < 4; ++i)
{
dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[i])),
sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i])));
@ -1616,13 +1644,13 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_vma(lv_16s
(*phase) = lv_cmake((float32_t)__phase_real[0], (float32_t)__phase_imag[0]);
}
for (unsigned int n = neon_iters * 4; n < num_points; n++)
for (n = neon_iters * 4; n < num_points; n++)
{
tmp16_ = in_common[n]; //printf("neon phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase));
tmp32_ = lv_cmake((float32_t)lv_creal(tmp16_), (float32_t)lv_cimag(tmp16_)) * (*phase);
tmp16_ = lv_cmake((int16_t)rintf(lv_creal(tmp32_)), (int16_t)rintf(lv_cimag(tmp32_)));
(*phase) *= phase_inc;
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
tmp = tmp16_ * in_a[n_vec][n];
_out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp)));
@ -1644,7 +1672,10 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_optvma(lv_
const lv_16sc_t** _in_a = in_a;
const lv_16sc_t* _in_common = in_common;
lv_16sc_t* _out = result;
int n_vec;
int i;
unsigned int number;
unsigned int n;
lv_16sc_t tmp16_, tmp;
lv_32fc_t tmp32_;
@ -1683,7 +1714,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_optvma(lv_
int16x4x2_t* accumulator1 = (int16x4x2_t*)volk_gnsssdr_malloc(num_a_vectors * sizeof(int16x4x2_t), volk_gnsssdr_get_alignment());
int16x4x2_t* accumulator2 = (int16x4x2_t*)volk_gnsssdr_malloc(num_a_vectors * sizeof(int16x4x2_t), volk_gnsssdr_get_alignment());
for(int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for(n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
accumulator1[n_vec].val[0] = vdup_n_s16(0);
accumulator1[n_vec].val[1] = vdup_n_s16(0);
@ -1691,7 +1722,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_optvma(lv_
accumulator2[n_vec].val[1] = vdup_n_s16(0);
}
for(unsigned int number = 0; number < neon_iters; number++)
for(number = 0; number < neon_iters; number++)
{
/* load 4 complex numbers (int 16 bits each component) */
b_val = vld2_s16((int16_t*)_in_common);
@ -1759,7 +1790,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_optvma(lv_
_phase_imag = vld1q_f32(____phase_imag);
}
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
a_val = vld2_s16((int16_t*)&(_in_a[n_vec][number*4]));
@ -1770,16 +1801,16 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_optvma(lv_
accumulator2[n_vec].val[1] = vmla_s16(accumulator2[n_vec].val[1], a_val.val[1], b_val.val[0]);
}
}
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
accumulator1[n_vec].val[0] = vqadd_s16(accumulator1[n_vec].val[0], accumulator2[n_vec].val[0]);
accumulator1[n_vec].val[1] = vqadd_s16(accumulator1[n_vec].val[1], accumulator2[n_vec].val[1]);
}
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
vst2_s16((int16_t*)dotProductVector, accumulator1[n_vec]); // Store the results back into the dot product vector
dotProduct = lv_cmake(0,0);
for (int i = 0; i < 4; ++i)
for (i = 0; i < 4; ++i)
{
dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[i])),
sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i])));
@ -1795,13 +1826,13 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_optvma(lv_
(*phase) = lv_cmake((float32_t)__phase_real[0], (float32_t)__phase_imag[0]);
}
for (unsigned int n = neon_iters * 4; n < num_points; n++)
for (n = neon_iters * 4; n < num_points; n++)
{
tmp16_ = in_common[n]; //printf("neon phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase));
tmp32_ = lv_cmake((float32_t)lv_creal(tmp16_), (float32_t)lv_cimag(tmp16_)) * (*phase);
tmp16_ = lv_cmake((int16_t)rintf(lv_creal(tmp32_)), (int16_t)rintf(lv_cimag(tmp32_)));
(*phase) *= phase_inc;
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
tmp = tmp16_ * in_a[n_vec][n];
_out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp)));

View File

@ -50,17 +50,17 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_generic(lv_
phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad));
lv_32fc_t phase_inc[1];
phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad));
unsigned int n;
int num_a_vectors = 3;
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
for(unsigned int n = 0; n < num_a_vectors; n++)
for(n = 0; n < num_a_vectors; n++)
{
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
}
volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_generic(result, local_code, phase_inc[0], phase,(const lv_16sc_t**) in_a, num_a_vectors, num_points);
for(unsigned int n = 0; n < num_a_vectors; n++)
for(n = 0; n < num_a_vectors; n++)
{
volk_gnsssdr_free(in_a[n]);
}
@ -80,17 +80,17 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_generic_rel
phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad));
lv_32fc_t phase_inc[1];
phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad));
unsigned int n;
int num_a_vectors = 3;
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
for(unsigned int n = 0; n < num_a_vectors; n++)
for(n = 0; n < num_a_vectors; n++)
{
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
}
volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_generic_reload(result, local_code, phase_inc[0], phase,(const lv_16sc_t**) in_a, num_a_vectors, num_points);
for(unsigned int n = 0; n < num_a_vectors; n++)
for(n = 0; n < num_a_vectors; n++)
{
volk_gnsssdr_free(in_a[n]);
}
@ -110,10 +110,10 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_a_sse3(lv_1
phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad));
lv_32fc_t phase_inc[1];
phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad));
unsigned int n;
int num_a_vectors = 3;
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
for(unsigned int n = 0; n < num_a_vectors; n++)
for(n = 0; n < num_a_vectors; n++)
{
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
@ -121,7 +121,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_a_sse3(lv_1
volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3(result, local_code, phase_inc[0], phase, (const lv_16sc_t**) in_a, num_a_vectors, num_points);
for(unsigned int n = 0; n < num_a_vectors; n++)
for(n = 0; n < num_a_vectors; n++)
{
volk_gnsssdr_free(in_a[n]);
}
@ -141,10 +141,10 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_a_sse3_relo
phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad));
lv_32fc_t phase_inc[1];
phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad));
unsigned int n;
int num_a_vectors = 3;
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
for(unsigned int n = 0; n < num_a_vectors; n++)
for(n = 0; n < num_a_vectors; n++)
{
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
@ -152,7 +152,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_a_sse3_relo
volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(result, local_code, phase_inc[0], phase, (const lv_16sc_t**) in_a, num_a_vectors, num_points);
for(unsigned int n = 0; n < num_a_vectors; n++)
for(n = 0; n < num_a_vectors; n++)
{
volk_gnsssdr_free(in_a[n]);
}
@ -172,10 +172,10 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_u_sse3(lv_1
phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad));
lv_32fc_t phase_inc[1];
phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad));
unsigned int n;
int num_a_vectors = 3;
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
for(unsigned int n = 0; n < num_a_vectors; n++)
for(n = 0; n < num_a_vectors; n++)
{
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
@ -183,7 +183,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_u_sse3(lv_1
volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_u_sse3(result, local_code, phase_inc[0], phase, (const lv_16sc_t**) in_a, num_a_vectors, num_points);
for(unsigned int n = 0; n < num_a_vectors; n++)
for(n = 0; n < num_a_vectors; n++)
{
volk_gnsssdr_free(in_a[n]);
}
@ -203,10 +203,10 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_a_avx2(lv_1
phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad));
lv_32fc_t phase_inc[1];
phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad));
unsigned int n;
int num_a_vectors = 3;
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
for(unsigned int n = 0; n < num_a_vectors; n++)
for(n = 0; n < num_a_vectors; n++)
{
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
@ -214,7 +214,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_a_avx2(lv_1
volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2(result, local_code, phase_inc[0], phase, (const lv_16sc_t**) in_a, num_a_vectors, num_points);
for(unsigned int n = 0; n < num_a_vectors; n++)
for(n = 0; n < num_a_vectors; n++)
{
volk_gnsssdr_free(in_a[n]);
}
@ -234,10 +234,10 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_a_avx2_relo
phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad));
lv_32fc_t phase_inc[1];
phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad));
unsigned int n;
int num_a_vectors = 3;
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
for(unsigned int n = 0; n < num_a_vectors; n++)
for(n = 0; n < num_a_vectors; n++)
{
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
@ -245,7 +245,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_a_avx2_relo
volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(result, local_code, phase_inc[0], phase, (const lv_16sc_t**) in_a, num_a_vectors, num_points);
for(unsigned int n = 0; n < num_a_vectors; n++)
for(n = 0; n < num_a_vectors; n++)
{
volk_gnsssdr_free(in_a[n]);
}
@ -265,10 +265,10 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_u_avx2(lv_1
phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad));
lv_32fc_t phase_inc[1];
phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad));
unsigned int n;
int num_a_vectors = 3;
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
for(unsigned int n = 0; n < num_a_vectors; n++)
for(n = 0; n < num_a_vectors; n++)
{
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
@ -276,7 +276,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_u_avx2(lv_1
volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2(result, local_code, phase_inc[0], phase, (const lv_16sc_t**) in_a, num_a_vectors, num_points);
for(unsigned int n = 0; n < num_a_vectors; n++)
for(n = 0; n < num_a_vectors; n++)
{
volk_gnsssdr_free(in_a[n]);
}
@ -296,10 +296,10 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_u_avx2_relo
phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad));
lv_32fc_t phase_inc[1];
phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad));
unsigned int n;
int num_a_vectors = 3;
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
for(unsigned int n = 0; n < num_a_vectors; n++)
for(n = 0; n < num_a_vectors; n++)
{
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
@ -307,7 +307,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_u_avx2_relo
volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(result, local_code, phase_inc[0], phase, (const lv_16sc_t**) in_a, num_a_vectors, num_points);
for(unsigned int n = 0; n < num_a_vectors; n++)
for(n = 0; n < num_a_vectors; n++)
{
volk_gnsssdr_free(in_a[n]);
}
@ -327,10 +327,10 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_neon(lv_16s
phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad));
lv_32fc_t phase_inc[1];
phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad));
unsigned int n;
int num_a_vectors = 3;
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
for(unsigned int n = 0; n < num_a_vectors; n++)
for(n = 0; n < num_a_vectors; n++)
{
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
@ -338,7 +338,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_neon(lv_16s
volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon(result, local_code, phase_inc[0], phase, (const lv_16sc_t**) in_a, num_a_vectors, num_points);
for(unsigned int n = 0; n < num_a_vectors; n++)
for(n = 0; n < num_a_vectors; n++)
{
volk_gnsssdr_free(in_a[n]);
}
@ -358,10 +358,10 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_neon_vma(lv
phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad));
lv_32fc_t phase_inc[1];
phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad));
unsigned int n;
int num_a_vectors = 3;
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
for(unsigned int n = 0; n < num_a_vectors; n++)
for(n = 0; n < num_a_vectors; n++)
{
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
@ -369,7 +369,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_neon_vma(lv
volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_vma(result, local_code, phase_inc[0], phase, (const lv_16sc_t**) in_a, num_a_vectors, num_points);
for(unsigned int n = 0; n < num_a_vectors; n++)
for(n = 0; n < num_a_vectors; n++)
{
volk_gnsssdr_free(in_a[n]);
}

View File

@ -74,9 +74,11 @@
static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_generic(lv_16sc_t** result, const lv_16sc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, float* shifts_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_points)
{
int local_code_chip_index;
for (int current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
int current_correlator_tap;
int n;
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
{
for (int n = 0; n < num_points; n++)
for (n = 0; n < num_points; n++)
{
// resample code for current tap
local_code_chip_index = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
@ -97,7 +99,9 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_sse4_1(lv_16sc_t** r
{
lv_16sc_t** _result = result;
const unsigned int quarterPoints = num_points / 4;
int current_correlator_tap;
unsigned int n;
unsigned int k;
const __m128 fours = _mm_set1_ps(4.0f);
const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);
@ -111,12 +115,12 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_sse4_1(lv_16sc_t** r
__m128i local_code_chip_index_reg, aux_i, negatives, i;
__m128 aux, aux2, shifts_chips_reg, c, cTrunc, base;
for (int current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
{
shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]);
aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
__m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
for(unsigned int n = 0; n < quarterPoints; n++)
for(n = 0; n < quarterPoints; n++)
{
aux = _mm_mul_ps(code_phase_step_chips_reg, indexn);
aux = _mm_add_ps(aux, aux2);
@ -134,13 +138,13 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_sse4_1(lv_16sc_t** r
aux_i = _mm_and_si128(code_length_chips_reg_i, negatives);
local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i);
_mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg);
for(unsigned int k = 0; k < 4; ++k)
for(k = 0; k < 4; ++k)
{
_result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
}
indexn = _mm_add_ps(indexn, fours);
}
for(unsigned int n = quarterPoints * 4; n < num_points; n++)
for(n = quarterPoints * 4; n < num_points; n++)
{
// resample code for current tap
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
@ -161,7 +165,9 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_sse4_1(lv_16sc_t** r
{
lv_16sc_t** _result = result;
const unsigned int quarterPoints = num_points / 4;
int current_correlator_tap;
unsigned int n;
unsigned int k;
const __m128 fours = _mm_set1_ps(4.0f);
const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);
@ -175,12 +181,12 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_sse4_1(lv_16sc_t** r
__m128i local_code_chip_index_reg, aux_i, negatives, i;
__m128 aux, aux2, shifts_chips_reg, c, cTrunc, base;
for (int current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
{
shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]);
aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
__m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
for(unsigned int n = 0; n < quarterPoints; n++)
for(n = 0; n < quarterPoints; n++)
{
aux = _mm_mul_ps(code_phase_step_chips_reg, indexn);
aux = _mm_add_ps(aux, aux2);
@ -198,13 +204,13 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_sse4_1(lv_16sc_t** r
aux_i = _mm_and_si128(code_length_chips_reg_i, negatives);
local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i);
_mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg);
for(unsigned int k = 0; k < 4; ++k)
for(k = 0; k < 4; ++k)
{
_result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
}
indexn = _mm_add_ps(indexn, fours);
}
for(unsigned int n = quarterPoints * 4; n < num_points; n++)
for(n = quarterPoints * 4; n < num_points; n++)
{
// resample code for current tap
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
@ -224,8 +230,10 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_sse4_1(lv_16sc_t** r
static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_sse3(lv_16sc_t** result, const lv_16sc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, float* shifts_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_points)
{
lv_16sc_t** _result = result;
const unsigned int quarterPoints = num_points / 4;
const unsigned int quarterPoints = num_points / 4;
int current_correlator_tap;
unsigned int n;
unsigned int k;
const __m128 ones = _mm_set1_ps(1.0f);
const __m128 fours = _mm_set1_ps(4.0f);
const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
@ -240,12 +248,12 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_sse3(lv_16sc_t** res
__m128i local_code_chip_index_reg, aux_i, negatives, i;
__m128 aux, aux2, shifts_chips_reg, fi, igx, j, c, cTrunc, base;
for (int current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
{
shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]);
aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
__m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
for(unsigned int n = 0; n < quarterPoints; n++)
for(n = 0; n < quarterPoints; n++)
{
aux = _mm_mul_ps(code_phase_step_chips_reg, indexn);
aux = _mm_add_ps(aux, aux2);
@ -266,13 +274,13 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_sse3(lv_16sc_t** res
aux_i = _mm_and_si128(code_length_chips_reg_i, negatives);
local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i);
_mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg);
for(unsigned int k = 0; k < 4; ++k)
for(k = 0; k < 4; ++k)
{
_result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
}
indexn = _mm_add_ps(indexn, fours);
}
for(unsigned int n = quarterPoints * 4; n < num_points; n++)
for(n = quarterPoints * 4; n < num_points; n++)
{
// resample code for current tap
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
@ -293,7 +301,9 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_sse3(lv_16sc_t** res
{
lv_16sc_t** _result = result;
const unsigned int quarterPoints = num_points / 4;
int current_correlator_tap;
unsigned int n;
unsigned int k;
const __m128 ones = _mm_set1_ps(1.0f);
const __m128 fours = _mm_set1_ps(4.0f);
const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
@ -308,12 +318,12 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_sse3(lv_16sc_t** res
__m128i local_code_chip_index_reg, aux_i, negatives, i;
__m128 aux, aux2, shifts_chips_reg, fi, igx, j, c, cTrunc, base;
for (int current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
{
shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]);
aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
__m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
for(unsigned int n = 0; n < quarterPoints; n++)
for(n = 0; n < quarterPoints; n++)
{
aux = _mm_mul_ps(code_phase_step_chips_reg, indexn);
aux = _mm_add_ps(aux, aux2);
@ -334,13 +344,13 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_sse3(lv_16sc_t** res
aux_i = _mm_and_si128(code_length_chips_reg_i, negatives);
local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i);
_mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg);
for(unsigned int k = 0; k < 4; ++k)
for(k = 0; k < 4; ++k)
{
_result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
}
indexn = _mm_add_ps(indexn, fours);
}
for(unsigned int n = quarterPoints * 4; n < num_points; n++)
for(n = quarterPoints * 4; n < num_points; n++)
{
// resample code for current tap
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
@ -360,8 +370,10 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_sse3(lv_16sc_t** res
static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_avx(lv_16sc_t** result, const lv_16sc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, float* shifts_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_points)
{
lv_16sc_t** _result = result;
const unsigned int avx_iters = num_points / 8;
const unsigned int avx_iters = num_points / 8;
int current_correlator_tap;
unsigned int n;
unsigned int k;
const __m256 eights = _mm256_set1_ps(8.0f);
const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips);
const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips);
@ -376,12 +388,12 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_avx(lv_16sc_t** resu
__m256i local_code_chip_index_reg, i;
__m256 aux, aux2, aux3, shifts_chips_reg, c, cTrunc, base, negatives, indexn;
for (int current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
{
shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]);
aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
indexn = n0;
for(unsigned int n = 0; n < avx_iters; n++)
for(n = 0; n < avx_iters; n++)
{
__builtin_prefetch(&_result[current_correlator_tap][8 * n + 7], 1, 0);
__builtin_prefetch(&local_code_chip_index[8], 1, 3);
@ -405,7 +417,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_avx(lv_16sc_t** resu
local_code_chip_index_reg = _mm256_cvttps_epi32(aux);
_mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg);
for(unsigned int k = 0; k < 8; ++k)
for(k = 0; k < 8; ++k)
{
_result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]];
}
@ -413,9 +425,9 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_avx(lv_16sc_t** resu
}
}
_mm256_zeroupper();
for (int current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
{
for(unsigned int n = avx_iters * 8; n < num_points; n++)
for(n = avx_iters * 8; n < num_points; n++)
{
// resample code for current tap
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
@ -436,7 +448,9 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_avx(lv_16sc_t** resu
{
lv_16sc_t** _result = result;
const unsigned int avx_iters = num_points / 8;
int current_correlator_tap;
unsigned int n;
unsigned int k;
const __m256 eights = _mm256_set1_ps(8.0f);
const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips);
const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips);
@ -451,12 +465,12 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_avx(lv_16sc_t** resu
__m256i local_code_chip_index_reg, i;
__m256 aux, aux2, aux3, shifts_chips_reg, c, cTrunc, base, negatives, indexn;
for (int current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
{
shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]);
aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
indexn = n0;
for(unsigned int n = 0; n < avx_iters; n++)
for(n = 0; n < avx_iters; n++)
{
__builtin_prefetch(&_result[current_correlator_tap][8 * n + 7], 1, 0);
__builtin_prefetch(&local_code_chip_index[8], 1, 3);
@ -480,7 +494,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_avx(lv_16sc_t** resu
local_code_chip_index_reg = _mm256_cvttps_epi32(aux);
_mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg);
for(unsigned int k = 0; k < 8; ++k)
for(k = 0; k < 8; ++k)
{
_result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]];
}
@ -488,9 +502,9 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_avx(lv_16sc_t** resu
}
}
_mm256_zeroupper();
for (int current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
{
for(unsigned int n = avx_iters * 8; n < num_points; n++)
for(n = avx_iters * 8; n < num_points; n++)
{
// resample code for current tap
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
@ -530,13 +544,15 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_neon(lv_16sc_t** resul
reciprocal = vmulq_f32(vrecpsq_f32(code_length_chips_reg_f, reciprocal), reciprocal);
reciprocal = vmulq_f32(vrecpsq_f32(code_length_chips_reg_f, reciprocal), reciprocal); // this refinement is required!
float32x4_t n0 = vld1q_f32((float*)vec);
for (int current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
int current_correlator_tap;
unsigned int n;
unsigned int k;
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
{
shifts_chips_reg = vdupq_n_f32((float)shifts_chips[current_correlator_tap]);
aux2 = vsubq_f32(shifts_chips_reg, rem_code_phase_chips_reg);
indexn = n0;
for(unsigned int n = 0; n < neon_iters; n++)
for(n = 0; n < neon_iters; n++)
{
__builtin_prefetch(&_result[current_correlator_tap][4 * n + 3], 1, 0);
__builtin_prefetch(&local_code_chip_index[4]);
@ -564,13 +580,13 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_neon(lv_16sc_t** resul
vst1q_s32((int32_t*)local_code_chip_index, local_code_chip_index_reg);
for(unsigned int k = 0; k < 4; ++k)
for(k = 0; k < 4; ++k)
{
_result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
}
indexn = vaddq_f32(indexn, fours);
}
for(unsigned int n = neon_iters * 4; n < num_points; n++)
for(n = neon_iters * 4; n < num_points; n++)
{
__builtin_prefetch(&_result[current_correlator_tap][n], 1, 0);
// resample code for current tap

View File

@ -74,9 +74,11 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_generic(lv_16sc_t
{
int local_code_chip_index;
//fesetround(FE_TONEAREST);
for (int current_vector = 0; current_vector < num_out_vectors; current_vector++)
int current_vector;
unsigned int n;
for (current_vector = 0; current_vector < num_out_vectors; current_vector++)
{
for (unsigned int n = 0; n < num_output_samples; n++)
for (n = 0; n < num_output_samples; n++)
{
// resample code for current tap
local_code_chip_index = round(code_phase_step_chips * (float)(n) + rem_code_phase_chips[current_vector] - 0.5f);

View File

@ -560,7 +560,8 @@ static inline void volk_gnsssdr_32f_sincos_32fc_u_sse2(lv_32fc_t* out, const flo
static inline void volk_gnsssdr_32f_sincos_32fc_generic(lv_32fc_t* out, const float* in, unsigned int num_points)
{
float _in;
for(unsigned int i = 0; i < num_points; i++)
unsigned int i;
for(i = 0; i < num_points; i++)
{
_in = *in++;
*out++ = lv_cmake((float)cos(_in), (float)sin(_in) );
@ -584,8 +585,8 @@ static inline void volk_gnsssdr_32f_sincos_32fc_generic_fxpt(lv_32fc_t* out, con
const int32_t Nbits = 10;
const int32_t diffbits = bitlength - Nbits;
uint32_t ux;
for(unsigned int i = 0; i < num_points; i++)
unsigned int i;
for(i = 0; i < num_points; i++)
{
_in = *in++;
d = (int32_t)floor(_in / TWO_PI + 0.5);

View File

@ -70,7 +70,7 @@ static inline void volk_gnsssdr_32fc_convert_16ic_u_sse2(lv_16sc_t* outputVector
float* inputVectorPtr = (float*)inputVector;
int16_t* outputVectorPtr = (int16_t*)outputVector;
float aux;
unsigned int i;
const float min_val = (float)SHRT_MIN;
const float max_val = (float)SHRT_MAX;
@ -80,7 +80,7 @@ static inline void volk_gnsssdr_32fc_convert_16ic_u_sse2(lv_16sc_t* outputVector
const __m128 vmin_val = _mm_set_ps1(min_val);
const __m128 vmax_val = _mm_set_ps1(max_val);
for(unsigned int i = 0; i < sse_iters; i++)
for(i = 0; i < sse_iters; i++)
{
inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
@ -99,7 +99,7 @@ static inline void volk_gnsssdr_32fc_convert_16ic_u_sse2(lv_16sc_t* outputVector
outputVectorPtr += 8;
}
for(unsigned int i = sse_iters * 8; i < num_points * 2; i++)
for(i = sse_iters * 8; i < num_points * 2; i++)
{
aux = *inputVectorPtr++;
if(aux > max_val)
@ -122,6 +122,7 @@ static inline void volk_gnsssdr_32fc_convert_16ic_u_sse(lv_16sc_t* outputVector,
float* inputVectorPtr = (float*)inputVector;
int16_t* outputVectorPtr = (int16_t*)outputVector;
float aux;
unsigned int i;
const float min_val = (float)SHRT_MIN;
const float max_val = (float)SHRT_MAX;
@ -132,7 +133,7 @@ static inline void volk_gnsssdr_32fc_convert_16ic_u_sse(lv_16sc_t* outputVector,
const __m128 vmin_val = _mm_set_ps1(min_val);
const __m128 vmax_val = _mm_set_ps1(max_val);
for(unsigned int i = 0;i < sse_iters; i++)
for(i = 0;i < sse_iters; i++)
{
inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
@ -151,7 +152,7 @@ static inline void volk_gnsssdr_32fc_convert_16ic_u_sse(lv_16sc_t* outputVector,
outputVectorPtr += 8;
}
for(unsigned int i = sse_iters * 8; i < num_points*2; i++)
for(i = sse_iters * 8; i < num_points*2; i++)
{
aux = *inputVectorPtr++;
if(aux > max_val)
@ -174,6 +175,7 @@ static inline void volk_gnsssdr_32fc_convert_16ic_a_sse2(lv_16sc_t* outputVector
float* inputVectorPtr = (float*)inputVector;
int16_t* outputVectorPtr = (int16_t*)outputVector;
float aux;
unsigned int i;
const float min_val = (float)SHRT_MIN;
const float max_val = (float)SHRT_MAX;
@ -184,7 +186,7 @@ static inline void volk_gnsssdr_32fc_convert_16ic_a_sse2(lv_16sc_t* outputVector
const __m128 vmin_val = _mm_set_ps1(min_val);
const __m128 vmax_val = _mm_set_ps1(max_val);
for(unsigned int i = 0; i < sse_iters; i++)
for(i = 0; i < sse_iters; i++)
{
inputVal1 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
inputVal2 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
@ -203,7 +205,7 @@ static inline void volk_gnsssdr_32fc_convert_16ic_a_sse2(lv_16sc_t* outputVector
outputVectorPtr += 8;
}
for(unsigned int i = sse_iters * 8; i < num_points * 2; i++)
for(i = sse_iters * 8; i < num_points * 2; i++)
{
aux = *inputVectorPtr++;
if(aux > max_val)
@ -225,7 +227,7 @@ static inline void volk_gnsssdr_32fc_convert_16ic_a_sse(lv_16sc_t* outputVector,
const float min_val = (float)SHRT_MIN;
const float max_val = (float)SHRT_MAX;
float aux;
unsigned int i;
float* inputVectorPtr = (float*)inputVector;
int16_t* outputVectorPtr = (int16_t*)outputVector;
@ -235,7 +237,7 @@ static inline void volk_gnsssdr_32fc_convert_16ic_a_sse(lv_16sc_t* outputVector,
const __m128 vmin_val = _mm_set_ps1(min_val);
const __m128 vmax_val = _mm_set_ps1(max_val);
for(unsigned int i = 0;i < sse_iters; i++)
for(i = 0; i < sse_iters; i++)
{
inputVal1 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
inputVal2 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
@ -254,7 +256,7 @@ static inline void volk_gnsssdr_32fc_convert_16ic_a_sse(lv_16sc_t* outputVector,
outputVectorPtr += 8;
}
for(unsigned int i = sse_iters * 8; i < num_points * 2; i++)
for(i = sse_iters * 8; i < num_points * 2; i++)
{
aux = *inputVectorPtr++;
if(aux > max_val)
@ -280,7 +282,7 @@ static inline void volk_gnsssdr_32fc_convert_16ic_neon(lv_16sc_t* outputVector,
const float min_val_f = (float)SHRT_MIN;
const float max_val_f = (float)SHRT_MAX;
float32_t aux;
unsigned int i;
const float32x4_t min_val = vmovq_n_f32(min_val_f);
const float32x4_t max_val = vmovq_n_f32(max_val_f);
float32x4_t half = vdupq_n_f32(0.5f);
@ -290,7 +292,7 @@ static inline void volk_gnsssdr_32fc_convert_16ic_neon(lv_16sc_t* outputVector,
int16x4_t intInputVal1, intInputVal2;
int16x8_t res;
for(unsigned int i = 0; i < neon_iters; i++)
for(i = 0; i < neon_iters; i++)
{
a = vld1q_f32((const float32_t*)(inputVectorPtr)); inputVectorPtr += 4;
b = vld1q_f32((const float32_t*)(inputVectorPtr)); inputVectorPtr += 4;
@ -318,7 +320,7 @@ static inline void volk_gnsssdr_32fc_convert_16ic_neon(lv_16sc_t* outputVector,
outputVectorPtr += 8;
}
for(unsigned int i = neon_iters * 8; i < num_points * 2; i++)
for(i = neon_iters * 8; i < num_points * 2; i++)
{
aux = *inputVectorPtr++;
if(aux > max_val_f)
@ -341,8 +343,8 @@ static inline void volk_gnsssdr_32fc_convert_16ic_generic(lv_16sc_t* outputVecto
const float min_val = (float)SHRT_MIN;
const float max_val = (float)SHRT_MAX;
float aux;
for(unsigned int i = 0; i < num_points * 2; i++)
unsigned int i;
for(i = 0; i < num_points * 2; i++)
{
aux = *inputVectorPtr++;
if(aux > max_val)

View File

@ -71,8 +71,8 @@ static inline void volk_gnsssdr_32fc_convert_8ic_generic(lv_8sc_t* outputVector,
const float min_val = (float)SCHAR_MIN;
const float max_val = (float)SCHAR_MAX;
float aux;
for(unsigned int i = 0; i < num_points * 2; i++)
unsigned int i;
for(i = 0; i < num_points * 2; i++)
{
aux = *inputVectorPtr++ * max_val;
if(aux > max_val)
@ -98,6 +98,7 @@ static inline void volk_gnsssdr_32fc_convert_8ic_u_sse2(lv_8sc_t* outputVector,
const float min_val = (float)SCHAR_MIN;
const float max_val = (float)SCHAR_MAX;
float aux;
unsigned int i;
__m128 inputVal1, inputVal2, inputVal3, inputVal4;
__m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
@ -106,7 +107,7 @@ static inline void volk_gnsssdr_32fc_convert_8ic_u_sse2(lv_8sc_t* outputVector,
const __m128 vmin_val = _mm_set_ps1(min_val);
const __m128 vmax_val = _mm_set_ps1(max_val);
for(unsigned int i = 0; i < sse_iters; i++)
for(i = 0; i < sse_iters; i++)
{
inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
@ -137,7 +138,7 @@ static inline void volk_gnsssdr_32fc_convert_8ic_u_sse2(lv_8sc_t* outputVector,
outputVectorPtr += 16;
}
for(unsigned int i = sse_iters * 16; i < num_points * 2; i++)
for(i = sse_iters * 16; i < num_points * 2; i++)
{
aux = *inputVectorPtr++ * max_val;
if(aux > max_val)
@ -163,6 +164,7 @@ static inline void volk_gnsssdr_32fc_convert_8ic_a_sse2(lv_8sc_t* outputVector,
const float min_val = (float)SCHAR_MIN;
const float max_val = (float)SCHAR_MAX;
float aux;
unsigned int i;
__m128 inputVal1, inputVal2, inputVal3, inputVal4;
__m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
@ -171,7 +173,7 @@ static inline void volk_gnsssdr_32fc_convert_8ic_a_sse2(lv_8sc_t* outputVector,
const __m128 vmin_val = _mm_set_ps1(min_val);
const __m128 vmax_val = _mm_set_ps1(max_val);
for(unsigned int i = 0; i < sse_iters; i++)
for(i = 0; i < sse_iters; i++)
{
inputVal1 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
inputVal2 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
@ -202,7 +204,7 @@ static inline void volk_gnsssdr_32fc_convert_8ic_a_sse2(lv_8sc_t* outputVector,
outputVectorPtr += 16;
}
for(unsigned int i = sse_iters * 16; i < num_points * 2; i++)
for(i = sse_iters * 16; i < num_points * 2; i++)
{
aux = *inputVectorPtr++ * max_val;
if(aux > max_val)
@ -227,6 +229,7 @@ static inline void volk_gnsssdr_32fc_convert_8ic_neon(lv_8sc_t* outputVector, co
const float32_t max_val_f = (float32_t)SCHAR_MAX;
const float32_t min_val_f = (float32_t)SCHAR_MIN;
float32_t aux;
unsigned int i;
const float32x4_t min_val = vmovq_n_f32(min_val_f);
const float32x4_t max_val = vmovq_n_f32(max_val_f);
@ -240,7 +243,7 @@ static inline void volk_gnsssdr_32fc_convert_8ic_neon(lv_8sc_t* outputVector, co
int8x8_t res8_1, res8_2;
int8x16_t outputVal;
for(unsigned int i = 0; i < neon_iters; i++)
for(i = 0; i < neon_iters; i++)
{
a = vld1q_f32((const float32_t*)inputVectorPtr); inputVectorPtr += 4;
a = vmulq_f32(a, max_val);
@ -290,7 +293,7 @@ static inline void volk_gnsssdr_32fc_convert_8ic_neon(lv_8sc_t* outputVector, co
outputVectorPtr += 16;
}
for(unsigned int i = neon_iters * 16; i < num_points * 2; i++)
for(i = neon_iters * 16; i < num_points * 2; i++)
{
aux = *inputVectorPtr++ * max_val_f;
if(aux > max_val_f)

View File

@ -50,11 +50,11 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_generic(lv_32fc_t* r
int code_length_chips = 1023;
int num_out_vectors = 3;
float rem_code_phase_chips = -0.234;
unsigned int n;
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
for(unsigned int n = 0; n < num_out_vectors; n++)
for(n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
}
@ -63,7 +63,7 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_generic(lv_32fc_t* r
memcpy((lv_32fc_t*)result, (lv_32fc_t*)result_aux[0], sizeof(lv_32fc_t) * num_points);
for(unsigned int n = 0; n < num_out_vectors; n++)
for(n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
@ -80,11 +80,11 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_a_sse3(lv_32fc_t* re
int code_length_chips = 1023;
int num_out_vectors = 3;
float rem_code_phase_chips = -0.234;
unsigned int n;
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
for(unsigned int n = 0; n < num_out_vectors; n++)
for(n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
}
@ -93,7 +93,7 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_a_sse3(lv_32fc_t* re
memcpy((lv_32fc_t*)result, (lv_32fc_t*)result_aux[0], sizeof(lv_32fc_t) * num_points);
for(unsigned int n = 0; n < num_out_vectors; n++)
for(n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
@ -109,11 +109,11 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_u_sse3(lv_32fc_t* re
int code_length_chips = 1023;
int num_out_vectors = 3;
float rem_code_phase_chips = -0.234;
unsigned int n;
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
for(unsigned int n = 0; n < num_out_vectors; n++)
for(n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
}
@ -122,7 +122,7 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_u_sse3(lv_32fc_t* re
memcpy((lv_32fc_t*)result, (lv_32fc_t*)result_aux[0], sizeof(lv_32fc_t) * num_points);
for(unsigned int n = 0; n < num_out_vectors; n++)
for(n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
@ -139,11 +139,11 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_u_sse4_1(lv_32fc_t*
int code_length_chips = 1023;
int num_out_vectors = 3;
float rem_code_phase_chips = -0.234;
unsigned int n;
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
for(unsigned int n = 0; n < num_out_vectors; n++)
for(n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
}
@ -152,7 +152,7 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_u_sse4_1(lv_32fc_t*
memcpy((lv_32fc_t*)result, (lv_32fc_t*)result_aux[0], sizeof(lv_32fc_t) * num_points);
for(unsigned int n = 0; n < num_out_vectors; n++)
for(n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
@ -168,11 +168,11 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_a_sse4_1(lv_32fc_t*
int code_length_chips = 1023;
int num_out_vectors = 3;
float rem_code_phase_chips = -0.234;
unsigned int n;
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
for(unsigned int n = 0; n < num_out_vectors; n++)
for(n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
}
@ -181,7 +181,7 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_a_sse4_1(lv_32fc_t*
memcpy((lv_32fc_t*)result, (lv_32fc_t*)result_aux[0], sizeof(lv_32fc_t) * num_points);
for(unsigned int n = 0; n < num_out_vectors; n++)
for(n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
@ -197,11 +197,11 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_a_avx(lv_32fc_t* res
int code_length_chips = 1023;
int num_out_vectors = 3;
float rem_code_phase_chips = -0.234;
unsigned int n;
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
for(unsigned int n = 0; n < num_out_vectors; n++)
for(n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
}
@ -210,7 +210,7 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_a_avx(lv_32fc_t* res
memcpy((lv_32fc_t*)result, (lv_32fc_t*)result_aux[0], sizeof(lv_32fc_t) * num_points);
for(unsigned int n = 0; n < num_out_vectors; n++)
for(n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
@ -226,11 +226,11 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_u_avx(lv_32fc_t* res
int code_length_chips = 1023;
int num_out_vectors = 3;
float rem_code_phase_chips = -0.234;
unsigned int n;
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
for(unsigned int n = 0; n < num_out_vectors; n++)
for(n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
}
@ -239,7 +239,7 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_u_avx(lv_32fc_t* res
memcpy((lv_32fc_t*)result, (lv_32fc_t*)result_aux[0], sizeof(lv_32fc_t) * num_points);
for(unsigned int n = 0; n < num_out_vectors; n++)
for(n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
@ -255,11 +255,11 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_neon(lv_32fc_t* resu
int code_length_chips = 1023;
int num_out_vectors = 3;
float rem_code_phase_chips = -0.234;
unsigned int n;
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
for(unsigned int n = 0; n < num_out_vectors; n++)
for(n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
}
@ -268,7 +268,7 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_neon(lv_32fc_t* resu
memcpy((lv_32fc_t*)result, (lv_32fc_t*)result_aux[0], sizeof(lv_32fc_t) * num_points);
for(unsigned int n = 0; n < num_out_vectors; n++)
for(n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}

View File

@ -81,11 +81,13 @@
static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_generic(lv_32fc_t* result, const lv_32fc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_32fc_t** in_a, int num_a_vectors, unsigned int num_points)
{
lv_32fc_t tmp32_1, tmp32_2;
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
int n_vec;
unsigned int n;
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
result[n_vec] = lv_cmake(0,0);
}
for (unsigned int n = 0; n < num_points; n++)
for (n = 0; n < num_points; n++)
{
tmp32_1 = *in_common++ * (*phase);//if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase));
@ -102,7 +104,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_generic(lv_32fc
}
(*phase) *= phase_inc;
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
tmp32_2 = tmp32_1 * in_a[n_vec][n];
result[n_vec] += tmp32_2;
@ -119,18 +121,21 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_generic_reload(
{
lv_32fc_t tmp32_1, tmp32_2;
const unsigned int ROTATOR_RELOAD = 256;
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
int n_vec;
unsigned int n;
unsigned int j;
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
result[n_vec] = lv_cmake(0,0);
}
for (unsigned int n = 0; n < num_points / ROTATOR_RELOAD; n++)
for (n = 0; n < num_points / ROTATOR_RELOAD; n++)
{
for (unsigned int j = 0; j < ROTATOR_RELOAD; j++)
for (j = 0; j < ROTATOR_RELOAD; j++)
{
tmp32_1 = *in_common++ * (*phase);
(*phase) *= phase_inc;
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
tmp32_2 = tmp32_1 * in_a[n_vec][n * ROTATOR_RELOAD + j];
result[n_vec] += tmp32_2;
@ -145,11 +150,11 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_generic_reload(
#endif
}
for (unsigned int j = 0; j < num_points % ROTATOR_RELOAD; j++)
for (j = 0; j < num_points % ROTATOR_RELOAD; j++)
{
tmp32_1 = *in_common++ * (*phase);
(*phase) *= phase_inc;
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
tmp32_2 = tmp32_1 * in_a[n_vec][(num_points / ROTATOR_RELOAD) * ROTATOR_RELOAD + j];
result[n_vec] += tmp32_2;
@ -167,7 +172,10 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_sse3(lv_32fc_
lv_32fc_t dotProduct = lv_cmake(0,0);
lv_32fc_t tmp32_1, tmp32_2;
const unsigned int sse_iters = num_points / 2;
int n_vec;
int i;
unsigned int number;
unsigned int n;
const lv_32fc_t** _in_a = in_a;
const lv_32fc_t* _in_common = in_common;
@ -175,7 +183,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_sse3(lv_32fc_
__m128* acc = (__m128*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128), volk_gnsssdr_get_alignment());
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
acc[n_vec] = _mm_setzero_ps();
}
@ -195,7 +203,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_sse3(lv_32fc_
const __m128 ylp = _mm_moveldup_ps(two_phase_inc_reg);
const __m128 yhp = _mm_movehdup_ps(two_phase_inc_reg);
for(unsigned int number = 0; number < sse_iters; number++)
for(number = 0; number < sse_iters; number++)
{
// Phase rotation on operand in_common starts here:
a = _mm_loadu_ps((float*)_in_common);
@ -217,7 +225,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_sse3(lv_32fc_
//next two samples
_in_common += 2;
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
a = _mm_loadu_ps((float*)&(_in_a[n_vec][number*2]));
tmp1 = _mm_mul_ps(a, yl);
@ -237,11 +245,11 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_sse3(lv_32fc_
}
}
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
_mm_store_ps((float*)dotProductVector, acc[n_vec]); // Store the results back into the dot product vector
dotProduct = lv_cmake(0,0);
for (int i = 0; i < 2; ++i)
for (i = 0; i < 2; ++i)
{
dotProduct = dotProduct + dotProductVector[i];
}
@ -252,11 +260,11 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_sse3(lv_32fc_
_mm_store_ps((float*)two_phase_acc, two_phase_acc_reg);
(*phase) = two_phase_acc[0];
for(unsigned int n = sse_iters * 2; n < num_points; n++)
for(n = sse_iters * 2; n < num_points; n++)
{
tmp32_1 = in_common[n] * (*phase);
(*phase) *= phase_inc;
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
tmp32_2 = tmp32_1 * in_a[n_vec][n];
result[n_vec] += tmp32_2;
@ -273,7 +281,10 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_sse3(lv_32fc_
lv_32fc_t dotProduct = lv_cmake(0,0);
lv_32fc_t tmp32_1, tmp32_2;
const unsigned int sse_iters = num_points / 2;
int n_vec;
int i;
unsigned int n;
unsigned int number;
const lv_32fc_t** _in_a = in_a;
const lv_32fc_t* _in_common = in_common;
@ -281,7 +292,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_sse3(lv_32fc_
__m128* acc = (__m128*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128), volk_gnsssdr_get_alignment());
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
acc[n_vec] = _mm_setzero_ps();
}
@ -301,7 +312,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_sse3(lv_32fc_
const __m128 ylp = _mm_moveldup_ps(two_phase_inc_reg);
const __m128 yhp = _mm_movehdup_ps(two_phase_inc_reg);
for(unsigned int number = 0; number < sse_iters; number++)
for(number = 0; number < sse_iters; number++)
{
// Phase rotation on operand in_common starts here:
a = _mm_load_ps((float*)_in_common);
@ -323,7 +334,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_sse3(lv_32fc_
//next two samples
_in_common += 2;
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
a = _mm_load_ps((float*)&(_in_a[n_vec][number*2]));
tmp1 = _mm_mul_ps(a, yl);
@ -343,11 +354,11 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_sse3(lv_32fc_
}
}
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
_mm_store_ps((float*)dotProductVector, acc[n_vec]); // Store the results back into the dot product vector
dotProduct = lv_cmake(0,0);
for (int i = 0; i < 2; ++i)
for (i = 0; i < 2; ++i)
{
dotProduct = dotProduct + dotProductVector[i];
}
@ -358,11 +369,11 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_sse3(lv_32fc_
_mm_store_ps((float*)two_phase_acc, two_phase_acc_reg);
(*phase) = two_phase_acc[0];
for(unsigned int n = sse_iters * 2; n < num_points; n++)
for(n = sse_iters * 2; n < num_points; n++)
{
tmp32_1 = in_common[n] * (*phase);
(*phase) *= phase_inc;
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
tmp32_2 = tmp32_1 * in_a[n_vec][n];
result[n_vec] += tmp32_2;
@ -379,7 +390,10 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_t
lv_32fc_t dotProduct = lv_cmake(0,0);
lv_32fc_t tmp32_1, tmp32_2;
const unsigned int avx_iters = num_points / 4;
int n_vec;
int i;
unsigned int number;
unsigned int n;
const lv_32fc_t** _in_a = in_a;
const lv_32fc_t* _in_common = in_common;
lv_32fc_t _phase = (*phase);
@ -388,7 +402,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_t
__m256* acc = (__m256*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256), volk_gnsssdr_get_alignment());
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
acc[n_vec] = _mm256_setzero_ps();
result[n_vec] = lv_cmake(0, 0);
@ -417,7 +431,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_t
const __m256 ylp = _mm256_moveldup_ps(four_phase_inc_reg);
const __m256 yhp = _mm256_movehdup_ps(four_phase_inc_reg);
for(unsigned int number = 0; number < avx_iters; number++)
for(number = 0; number < avx_iters; number++)
{
// Phase rotation on operand in_common starts here:
a = _mm256_loadu_ps((float*)_in_common);
@ -439,7 +453,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_t
//next four samples
_in_common += 4;
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
a = _mm256_loadu_ps((float*)&(_in_a[n_vec][number * 4]));
tmp1 = _mm256_mul_ps(a, yl);
@ -459,11 +473,11 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_t
}
}
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
_mm256_store_ps((float*)dotProductVector, acc[n_vec]); // Store the results back into the dot product vector
dotProduct = lv_cmake(0,0);
for (int i = 0; i < 4; ++i)
for (i = 0; i < 4; ++i)
{
dotProduct = dotProduct + dotProductVector[i];
}
@ -481,11 +495,11 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_t
_phase = four_phase_acc[0];
_mm256_zeroupper();
for(unsigned int n = avx_iters * 4; n < num_points; n++)
for(n = avx_iters * 4; n < num_points; n++)
{
tmp32_1 = *_in_common++ * _phase;
_phase *= phase_inc;
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
tmp32_2 = tmp32_1 * _in_a[n_vec][n];
result[n_vec] += tmp32_2;
@ -503,7 +517,10 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_t
lv_32fc_t dotProduct = lv_cmake(0,0);
lv_32fc_t tmp32_1, tmp32_2;
const unsigned int avx_iters = num_points / 4;
int n_vec;
int i;
unsigned int number;
unsigned int n;
const lv_32fc_t** _in_a = in_a;
const lv_32fc_t* _in_common = in_common;
lv_32fc_t _phase = (*phase);
@ -512,7 +529,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_t
__m256* acc = (__m256*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256), volk_gnsssdr_get_alignment());
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
acc[n_vec] = _mm256_setzero_ps();
result[n_vec] = lv_cmake(0, 0);
@ -521,7 +538,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_t
// phase rotation registers
__m256 a, four_phase_acc_reg, yl, yh, tmp1, tmp1p, tmp2, tmp2p, z;
__attribute__((aligned(32))) lv_32fc_t four_phase_inc[4];
__VOLK_ATTR_ALIGNED(32) lv_32fc_t four_phase_inc[4];
const lv_32fc_t phase_inc2 = phase_inc * phase_inc;
const lv_32fc_t phase_inc3 = phase_inc2 * phase_inc;
const lv_32fc_t phase_inc4 = phase_inc3 * phase_inc;
@ -531,7 +548,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_t
four_phase_inc[3] = phase_inc4;
const __m256 four_phase_inc_reg = _mm256_load_ps((float*)four_phase_inc);
__attribute__((aligned(32))) lv_32fc_t four_phase_acc[4];
__VOLK_ATTR_ALIGNED(32) lv_32fc_t four_phase_acc[4];
four_phase_acc[0] = _phase;
four_phase_acc[1] = _phase * phase_inc;
four_phase_acc[2] = _phase * phase_inc2;
@ -541,7 +558,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_t
const __m256 ylp = _mm256_moveldup_ps(four_phase_inc_reg);
const __m256 yhp = _mm256_movehdup_ps(four_phase_inc_reg);
for(unsigned int number = 0; number < avx_iters; number++)
for(number = 0; number < avx_iters; number++)
{
// Phase rotation on operand in_common starts here:
a = _mm256_load_ps((float*)_in_common);
@ -563,7 +580,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_t
//next four samples
_in_common += 4;
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
a = _mm256_load_ps((float*)&(_in_a[n_vec][number * 4]));
tmp1 = _mm256_mul_ps(a, yl);
@ -583,11 +600,11 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_t
}
}
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
_mm256_store_ps((float*)dotProductVector, acc[n_vec]); // Store the results back into the dot product vector
dotProduct = lv_cmake(0,0);
for (int i = 0; i < 4; ++i)
for (i = 0; i < 4; ++i)
{
dotProduct = dotProduct + dotProductVector[i];
}
@ -605,11 +622,11 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_t
_phase = four_phase_acc[0];
_mm256_zeroupper();
for(unsigned int n = avx_iters * 4; n < num_points; n++)
for(n = avx_iters * 4; n < num_points; n++)
{
tmp32_1 = *_in_common++ * _phase;
_phase *= phase_inc;
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
tmp32_2 = tmp32_1 * _in_a[n_vec][n];
result[n_vec] += tmp32_2;
@ -626,7 +643,10 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_t
static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_neon(lv_32fc_t* result, const lv_32fc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_32fc_t** in_a, int num_a_vectors, unsigned int num_points)
{
const unsigned int neon_iters = num_points / 4;
int n_vec;
int i;
unsigned int number;
unsigned int n ;
const lv_32fc_t** _in_a = in_a;
const lv_32fc_t* _in_common = in_common;
lv_32fc_t* _out = result;
@ -665,7 +685,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_neon(lv_32fc_t*
float32x4x2_t* accumulator1 = (float32x4x2_t*)volk_gnsssdr_malloc(num_a_vectors * sizeof(float32x4x2_t), volk_gnsssdr_get_alignment());
float32x4x2_t* accumulator2 = (float32x4x2_t*)volk_gnsssdr_malloc(num_a_vectors * sizeof(float32x4x2_t), volk_gnsssdr_get_alignment());
for(int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for(n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
accumulator1[n_vec].val[0] = vdupq_n_f32(0.0f);
accumulator1[n_vec].val[1] = vdupq_n_f32(0.0f);
@ -673,7 +693,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_neon(lv_32fc_t*
accumulator2[n_vec].val[1] = vdupq_n_f32(0.0f);
}
for(unsigned int number = 0; number < neon_iters; number++)
for(number = 0; number < neon_iters; number++)
{
/* load 4 complex numbers (float 32 bits each component) */
b_val = vld2q_f32((float32_t*)_in_common);
@ -715,7 +735,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_neon(lv_32fc_t*
_phase_imag = vld1q_f32(____phase_imag);
}
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
a_val = vld2q_f32((float32_t*)&(_in_a[n_vec][number * 4]));
@ -726,16 +746,16 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_neon(lv_32fc_t*
accumulator2[n_vec].val[1] = vmlaq_f32(accumulator2[n_vec].val[1], a_val.val[1], b_val.val[0]);
}
}
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
accumulator1[n_vec].val[0] = vaddq_f32(accumulator1[n_vec].val[0], accumulator2[n_vec].val[0]);
accumulator1[n_vec].val[1] = vaddq_f32(accumulator1[n_vec].val[1], accumulator2[n_vec].val[1]);
}
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
vst2q_f32((float32_t*)dotProductVector, accumulator1[n_vec]); // Store the results back into the dot product vector
dotProduct = lv_cmake(0,0);
for (int i = 0; i < 4; ++i)
for (i = 0; i < 4; ++i)
{
dotProduct = dotProduct + dotProductVector[i];
}
@ -750,11 +770,11 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_neon(lv_32fc_t*
_phase = lv_cmake((float32_t)__phase_real[0], (float32_t)__phase_imag[0]);
}
for(unsigned int n = neon_iters * 4; n < num_points; n++)
for(n = neon_iters * 4; n < num_points; n++)
{
tmp32_1 = in_common[n] * _phase;
_phase *= phase_inc;
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
tmp32_2 = tmp32_1 * in_a[n_vec][n];
_out[n_vec] += tmp32_2;

View File

@ -50,17 +50,17 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_generic(lv_
phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad));
lv_32fc_t phase_inc[1];
phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad));
unsigned int n;
int num_a_vectors = 3;
lv_32fc_t** in_a = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
for(unsigned int n = 0; n < num_a_vectors; n++)
for(n = 0; n < num_a_vectors; n++)
{
in_a[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
memcpy((lv_32fc_t*)in_a[n], (lv_32fc_t*)in, sizeof(lv_32fc_t) * num_points);
}
volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_generic_reload(result, local_code, phase_inc[0], phase, (const lv_32fc_t**) in_a, num_a_vectors, num_points);
for(unsigned int n = 0; n < num_a_vectors; n++)
for(n = 0; n < num_a_vectors; n++)
{
volk_gnsssdr_free(in_a[n]);
}
@ -80,17 +80,17 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_generic_rel
phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad));
lv_32fc_t phase_inc[1];
phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad));
unsigned int n;
int num_a_vectors = 3;
lv_32fc_t** in_a = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
for(unsigned int n = 0; n < num_a_vectors; n++)
for(n = 0; n < num_a_vectors; n++)
{
in_a[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
memcpy((lv_32fc_t*)in_a[n], (lv_32fc_t*)in, sizeof(lv_32fc_t) * num_points);
}
volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_generic_reload(result, local_code, phase_inc[0], phase, (const lv_32fc_t**) in_a, num_a_vectors, num_points);
for(unsigned int n = 0; n < num_a_vectors; n++)
for(n = 0; n < num_a_vectors; n++)
{
volk_gnsssdr_free(in_a[n]);
}
@ -110,17 +110,17 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_u_sse3(lv_3
phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad));
lv_32fc_t phase_inc[1];
phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad));
unsigned int n;
int num_a_vectors = 3;
lv_32fc_t** in_a = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
for(unsigned int n = 0; n < num_a_vectors; n++)
for(n = 0; n < num_a_vectors; n++)
{
in_a[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
memcpy((lv_32fc_t*)in_a[n], (lv_32fc_t*)in, sizeof(lv_32fc_t) * num_points);
}
volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_sse3(result, local_code, phase_inc[0], phase, (const lv_32fc_t**) in_a, num_a_vectors, num_points);
for(unsigned int n = 0; n < num_a_vectors; n++)
for(n = 0; n < num_a_vectors; n++)
{
volk_gnsssdr_free(in_a[n]);
}
@ -140,17 +140,17 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_a_sse3(lv_3
phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad));
lv_32fc_t phase_inc[1];
phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad));
unsigned int n;
int num_a_vectors = 3;
lv_32fc_t** in_a = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
for(unsigned int n = 0; n < num_a_vectors; n++)
for(n = 0; n < num_a_vectors; n++)
{
in_a[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
memcpy((lv_32fc_t*)in_a[n], (lv_32fc_t*)in, sizeof(lv_32fc_t) * num_points);
}
volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_sse3(result, local_code, phase_inc[0], phase, (const lv_32fc_t**) in_a, num_a_vectors, num_points);
for(unsigned int n = 0; n < num_a_vectors; n++)
for(n = 0; n < num_a_vectors; n++)
{
volk_gnsssdr_free(in_a[n]);
}
@ -170,17 +170,17 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_u_avx(lv_32
phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad));
lv_32fc_t phase_inc[1];
phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad));
unsigned int n;
int num_a_vectors = 3;
lv_32fc_t** in_a = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
for(unsigned int n = 0; n < num_a_vectors; n++)
for(n = 0; n < num_a_vectors; n++)
{
in_a[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
memcpy((lv_32fc_t*)in_a[n], (lv_32fc_t*)in, sizeof(lv_32fc_t) * num_points);
}
volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(result, local_code, phase_inc[0], phase, (const lv_32fc_t**) in_a, num_a_vectors, num_points);
for(unsigned int n = 0; n < num_a_vectors; n++)
for(n = 0; n < num_a_vectors; n++)
{
volk_gnsssdr_free(in_a[n]);
}
@ -200,17 +200,17 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_a_avx(lv_32
phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad));
lv_32fc_t phase_inc[1];
phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad));
unsigned int n;
int num_a_vectors = 3;
lv_32fc_t** in_a = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
for(unsigned int n = 0; n < num_a_vectors; n++)
for(n = 0; n < num_a_vectors; n++)
{
in_a[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
memcpy((lv_32fc_t*)in_a[n], (lv_32fc_t*)in, sizeof(lv_32fc_t) * num_points);
}
volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(result, local_code, phase_inc[0], phase, (const lv_32fc_t**) in_a, num_a_vectors, num_points);
for(unsigned int n = 0; n < num_a_vectors; n++)
for(n = 0; n < num_a_vectors; n++)
{
volk_gnsssdr_free(in_a[n]);
}
@ -230,17 +230,17 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_neon(lv_32f
phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad));
lv_32fc_t phase_inc[1];
phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad));
unsigned int n;
int num_a_vectors = 3;
lv_32fc_t** in_a = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
for(unsigned int n = 0; n < num_a_vectors; n++)
for(n = 0; n < num_a_vectors; n++)
{
in_a[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
memcpy((lv_32fc_t*)in_a[n], (lv_32fc_t*)in, sizeof(lv_32fc_t) * num_points);
}
volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_neon(result, local_code, phase_inc[0], phase, (const lv_32fc_t**) in_a, num_a_vectors, num_points);
for(unsigned int n = 0; n < num_a_vectors; n++)
for(n = 0; n < num_a_vectors; n++)
{
volk_gnsssdr_free(in_a[n]);
}

View File

@ -74,9 +74,11 @@
static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_generic(lv_32fc_t** result, const lv_32fc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, float* shifts_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_points)
{
int local_code_chip_index;
for (int current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
int current_correlator_tap;
int n;
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
{
for (int n = 0; n < num_points; n++)
for (n = 0; n < num_points; n++)
{
// resample code for current tap
local_code_chip_index = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
@ -97,7 +99,9 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_sse3(lv_32fc_t** res
{
lv_32fc_t** _result = result;
const unsigned int quarterPoints = num_points / 4;
int current_correlator_tap;
unsigned int n;
unsigned int k;
const __m128 ones = _mm_set1_ps(1.0f);
const __m128 fours = _mm_set1_ps(4.0f);
const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
@ -112,12 +116,12 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_sse3(lv_32fc_t** res
__m128i local_code_chip_index_reg, aux_i, negatives, i;
__m128 aux, aux2, shifts_chips_reg, fi, igx, j, c, cTrunc, base;
for (int current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
{
shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]);
aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
__m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
for(unsigned int n = 0; n < quarterPoints; n++)
for(n = 0; n < quarterPoints; n++)
{
aux = _mm_mul_ps(code_phase_step_chips_reg, indexn);
aux = _mm_add_ps(aux, aux2);
@ -138,13 +142,13 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_sse3(lv_32fc_t** res
aux_i = _mm_and_si128(code_length_chips_reg_i, negatives);
local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i);
_mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg);
for(unsigned int k = 0; k < 4; ++k)
for(k = 0; k < 4; ++k)
{
_result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
}
indexn = _mm_add_ps(indexn, fours);
}
for(unsigned int n = quarterPoints * 4; n < num_points; n++)
for(n = quarterPoints * 4; n < num_points; n++)
{
// resample code for current tap
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
@ -165,7 +169,9 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_sse3(lv_32fc_t** res
{
lv_32fc_t** _result = result;
const unsigned int quarterPoints = num_points / 4;
int current_correlator_tap;
unsigned int n;
unsigned int k;
const __m128 ones = _mm_set1_ps(1.0f);
const __m128 fours = _mm_set1_ps(4.0f);
const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
@ -180,12 +186,12 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_sse3(lv_32fc_t** res
__m128i local_code_chip_index_reg, aux_i, negatives, i;
__m128 aux, aux2, shifts_chips_reg, fi, igx, j, c, cTrunc, base;
for (int current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
{
shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]);
aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
__m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
for(unsigned int n = 0; n < quarterPoints; n++)
for(n = 0; n < quarterPoints; n++)
{
aux = _mm_mul_ps(code_phase_step_chips_reg, indexn);
aux = _mm_add_ps(aux, aux2);
@ -206,13 +212,13 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_sse3(lv_32fc_t** res
aux_i = _mm_and_si128(code_length_chips_reg_i, negatives);
local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i);
_mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg);
for(unsigned int k = 0; k < 4; ++k)
for(k = 0; k < 4; ++k)
{
_result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
}
indexn = _mm_add_ps(indexn, fours);
}
for(unsigned int n = quarterPoints * 4; n < num_points; n++)
for(n = quarterPoints * 4; n < num_points; n++)
{
// resample code for current tap
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
@ -232,7 +238,9 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_sse4_1(lv_32fc_t** r
{
lv_32fc_t** _result = result;
const unsigned int quarterPoints = num_points / 4;
int current_correlator_tap;
unsigned int n;
unsigned int k;
const __m128 fours = _mm_set1_ps(4.0f);
const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);
@ -246,12 +254,12 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_sse4_1(lv_32fc_t** r
__m128i local_code_chip_index_reg, aux_i, negatives, i;
__m128 aux, aux2, shifts_chips_reg, c, cTrunc, base;
for (int current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
{
shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]);
aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
__m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
for(unsigned int n = 0; n < quarterPoints; n++)
for(n = 0; n < quarterPoints; n++)
{
aux = _mm_mul_ps(code_phase_step_chips_reg, indexn);
aux = _mm_add_ps(aux, aux2);
@ -269,13 +277,13 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_sse4_1(lv_32fc_t** r
aux_i = _mm_and_si128(code_length_chips_reg_i, negatives);
local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i);
_mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg);
for(unsigned int k = 0; k < 4; ++k)
for(k = 0; k < 4; ++k)
{
_result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
}
indexn = _mm_add_ps(indexn, fours);
}
for(unsigned int n = quarterPoints * 4; n < num_points; n++)
for(n = quarterPoints * 4; n < num_points; n++)
{
// resample code for current tap
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
@ -296,7 +304,9 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_sse4_1(lv_32fc_t** r
{
lv_32fc_t** _result = result;
const unsigned int quarterPoints = num_points / 4;
int current_correlator_tap;
unsigned int n;
unsigned int k;
const __m128 fours = _mm_set1_ps(4.0f);
const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);
@ -310,12 +320,12 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_sse4_1(lv_32fc_t** r
__m128i local_code_chip_index_reg, aux_i, negatives, i;
__m128 aux, aux2, shifts_chips_reg, c, cTrunc, base;
for (int current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
{
shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]);
aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
__m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
for(unsigned int n = 0; n < quarterPoints; n++)
for(n = 0; n < quarterPoints; n++)
{
aux = _mm_mul_ps(code_phase_step_chips_reg, indexn);
aux = _mm_add_ps(aux, aux2);
@ -333,13 +343,13 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_sse4_1(lv_32fc_t** r
aux_i = _mm_and_si128(code_length_chips_reg_i, negatives);
local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i);
_mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg);
for(unsigned int k = 0; k < 4; ++k)
for(k = 0; k < 4; ++k)
{
_result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
}
indexn = _mm_add_ps(indexn, fours);
}
for(unsigned int n = quarterPoints * 4; n < num_points; n++)
for(n = quarterPoints * 4; n < num_points; n++)
{
// resample code for current tap
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
@ -360,7 +370,9 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx(lv_32fc_t** resu
{
lv_32fc_t** _result = result;
const unsigned int avx_iters = num_points / 8;
int current_correlator_tap;
unsigned int n;
unsigned int k;
const __m256 eights = _mm256_set1_ps(8.0f);
const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips);
const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips);
@ -375,12 +387,12 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx(lv_32fc_t** resu
__m256i local_code_chip_index_reg, i;
__m256 aux, aux2, aux3, shifts_chips_reg, c, cTrunc, base, negatives, indexn;
for (int current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
{
shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]);
aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
indexn = n0;
for(unsigned int n = 0; n < avx_iters; n++)
for(n = 0; n < avx_iters; n++)
{
__builtin_prefetch(&_result[current_correlator_tap][8 * n + 7], 1, 0);
__builtin_prefetch(&local_code_chip_index[8], 1, 3);
@ -404,7 +416,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx(lv_32fc_t** resu
local_code_chip_index_reg = _mm256_cvttps_epi32(aux);
_mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg);
for(unsigned int k = 0; k < 8; ++k)
for(k = 0; k < 8; ++k)
{
_result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]];
}
@ -412,9 +424,9 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx(lv_32fc_t** resu
}
}
_mm256_zeroupper();
for (int current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
{
for(unsigned int n = avx_iters * 8; n < num_points; n++)
for(n = avx_iters * 8; n < num_points; n++)
{
// resample code for current tap
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
@ -435,7 +447,9 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx(lv_32fc_t** resu
{
lv_32fc_t** _result = result;
const unsigned int avx_iters = num_points / 8;
int current_correlator_tap;
unsigned int n;
unsigned int k;
const __m256 eights = _mm256_set1_ps(8.0f);
const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips);
const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips);
@ -450,12 +464,12 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx(lv_32fc_t** resu
__m256i local_code_chip_index_reg, i;
__m256 aux, aux2, aux3, shifts_chips_reg, c, cTrunc, base, negatives, indexn;
for (int current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
{
shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]);
aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
indexn = n0;
for(unsigned int n = 0; n < avx_iters; n++)
for(n = 0; n < avx_iters; n++)
{
__builtin_prefetch(&_result[current_correlator_tap][8 * n + 7], 1, 0);
__builtin_prefetch(&local_code_chip_index[8], 1, 3);
@ -479,7 +493,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx(lv_32fc_t** resu
local_code_chip_index_reg = _mm256_cvttps_epi32(aux);
_mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg);
for(unsigned int k = 0; k < 8; ++k)
for(k = 0; k < 8; ++k)
{
_result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]];
}
@ -487,9 +501,9 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx(lv_32fc_t** resu
}
}
_mm256_zeroupper();
for (int current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
{
for(unsigned int n = avx_iters * 8; n < num_points; n++)
for(n = avx_iters * 8; n < num_points; n++)
{
// resample code for current tap
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
@ -511,6 +525,9 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_neon(lv_32fc_t** resul
{
lv_32fc_t** _result = result;
const unsigned int neon_iters = num_points / 4;
int current_correlator_tap;
unsigned int n;
unsigned int k;
const int32x4_t ones = vdupq_n_s32(1);
const float32x4_t fours = vdupq_n_f32(4.0f);
const float32x4_t rem_code_phase_chips_reg = vdupq_n_f32(rem_code_phase_chips);
@ -531,12 +548,12 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_neon(lv_32fc_t** resul
reciprocal = vmulq_f32(vrecpsq_f32(code_length_chips_reg_f, reciprocal), reciprocal); // this refinement is required!
float32x4_t n0 = vld1q_f32((float*)vec);
for (int current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
{
shifts_chips_reg = vdupq_n_f32((float)shifts_chips[current_correlator_tap]);
aux2 = vsubq_f32(shifts_chips_reg, rem_code_phase_chips_reg);
indexn = n0;
for(unsigned int n = 0; n < neon_iters; n++)
for(n = 0; n < neon_iters; n++)
{
__builtin_prefetch(&_result[current_correlator_tap][4 * n + 3], 1, 0);
__builtin_prefetch(&local_code_chip_index[4]);
@ -564,13 +581,13 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_neon(lv_32fc_t** resul
vst1q_s32((int32_t*)local_code_chip_index, local_code_chip_index_reg);
for(unsigned int k = 0; k < 4; ++k)
for(k = 0; k < 4; ++k)
{
_result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
}
indexn = vaddq_f32(indexn, fours);
}
for(unsigned int n = neon_iters * 4; n < num_points; n++)
for(n = neon_iters * 4; n < num_points; n++)
{
__builtin_prefetch(&_result[current_correlator_tap][n], 1, 0);
// resample code for current tap

View File

@ -65,14 +65,15 @@ static inline void volk_gnsssdr_64f_accumulator_64f_u_avx(double* result, const
{
double returnValue = 0;
const unsigned int sse_iters = num_points / 4;
unsigned int number;
unsigned int i;
const double* aPtr = inputBuffer;
__VOLK_ATTR_ALIGNED(32) double tempBuffer[4];
__m256d accumulator = _mm256_setzero_pd();
__m256d aVal = _mm256_setzero_pd();
for(unsigned int number = 0; number < sse_iters; number++)
for(number = 0; number < sse_iters; number++)
{
aVal = _mm256_loadu_pd(aPtr);
accumulator = _mm256_add_pd(accumulator, aVal);
@ -81,12 +82,12 @@ static inline void volk_gnsssdr_64f_accumulator_64f_u_avx(double* result, const
_mm256_storeu_pd((double*)tempBuffer, accumulator);
for(unsigned int i = 0; i < 4; ++i)
for(i = 0; i < 4; ++i)
{
returnValue += tempBuffer[i];
}
for(unsigned int i = 0; i < (num_points % 4); ++i)
for(i = 0; i < (num_points % 4); ++i)
{
returnValue += (*aPtr++);
}
@ -103,14 +104,15 @@ static inline void volk_gnsssdr_64f_accumulator_64f_u_sse3(double* result,const
{
double returnValue = 0;
const unsigned int sse_iters = num_points / 2;
unsigned int number;
unsigned int i;
const double* aPtr = inputBuffer;
__VOLK_ATTR_ALIGNED(16) double tempBuffer[2];
__m128d accumulator = _mm_setzero_pd();
__m128d aVal = _mm_setzero_pd();
for(unsigned int number = 0; number < sse_iters; number++)
for(number = 0; number < sse_iters; number++)
{
aVal = _mm_loadu_pd(aPtr);
accumulator = _mm_add_pd(accumulator, aVal);
@ -119,12 +121,12 @@ static inline void volk_gnsssdr_64f_accumulator_64f_u_sse3(double* result,const
_mm_storeu_pd((double*)tempBuffer, accumulator);
for(unsigned int i = 0; i < 2; ++i)
for(i = 0; i < 2; ++i)
{
returnValue += tempBuffer[i];
}
for(unsigned int i = 0; i < (num_points % 2); ++i)
for(i = 0; i < (num_points % 2); ++i)
{
returnValue += (*aPtr++);
}
@ -140,8 +142,9 @@ static inline void volk_gnsssdr_64f_accumulator_64f_generic(double* result,const
{
const double* aPtr = inputBuffer;
double returnValue = 0;
unsigned int number;
for(unsigned int number = 0;number < num_points; number++)
for(number = 0; number < num_points; number++)
{
returnValue += (*aPtr++);
}
@ -157,14 +160,15 @@ static inline void volk_gnsssdr_64f_accumulator_64f_a_avx(double* result,const d
{
double returnValue = 0;
const unsigned int sse_iters = num_points / 4;
unsigned int number;
unsigned int i;
const double* aPtr = inputBuffer;
__VOLK_ATTR_ALIGNED(32) double tempBuffer[4];
__m256d accumulator = _mm256_setzero_pd();
__m256d aVal = _mm256_setzero_pd();
for(unsigned int number = 0; number < sse_iters; number++)
for(number = 0; number < sse_iters; number++)
{
aVal = _mm256_load_pd(aPtr);
accumulator = _mm256_add_pd(accumulator, aVal);
@ -173,12 +177,12 @@ static inline void volk_gnsssdr_64f_accumulator_64f_a_avx(double* result,const d
_mm256_store_pd((double*)tempBuffer, accumulator);
for(unsigned int i = 0; i < 4; ++i)
for(i = 0; i < 4; ++i)
{
returnValue += tempBuffer[i];
}
for(unsigned int i = 0; i < (num_points % 4); ++i)
for(i = 0; i < (num_points % 4); ++i)
{
returnValue += (*aPtr++);
}
@ -195,14 +199,15 @@ static inline void volk_gnsssdr_64f_accumulator_64f_a_sse3(double* result,const
{
double returnValue = 0;
const unsigned int sse_iters = num_points / 2;
unsigned int number;
unsigned int i;
const double* aPtr = inputBuffer;
__VOLK_ATTR_ALIGNED(16) double tempBuffer[2];
__m128d accumulator = _mm_setzero_pd();
__m128d aVal = _mm_setzero_pd();
for(unsigned int number = 0; number < sse_iters; number++)
for(number = 0; number < sse_iters; number++)
{
aVal = _mm_load_pd(aPtr);
accumulator = _mm_add_pd(accumulator, aVal);
@ -211,12 +216,12 @@ static inline void volk_gnsssdr_64f_accumulator_64f_a_sse3(double* result,const
_mm_store_pd((double*)tempBuffer, accumulator);
for(unsigned int i = 0; i < 2; ++i)
for(i = 0; i < 2; ++i)
{
returnValue += tempBuffer[i];
}
for(unsigned int i = 0; i < (num_points % 2); ++i)
for(i = 0; i < (num_points % 2); ++i)
{
returnValue += (*aPtr++);
}

View File

@ -66,14 +66,15 @@ static inline void volk_gnsssdr_8i_accumulator_s8i_u_sse3(char* result, const ch
{
char returnValue = 0;
const unsigned int sse_iters = num_points / 16;
unsigned int number;
unsigned int i;
const char* aPtr = inputBuffer;
__VOLK_ATTR_ALIGNED(16) char tempBuffer[16];
__m128i accumulator = _mm_setzero_si128();
__m128i aVal = _mm_setzero_si128();
for(unsigned int number = 0; number < sse_iters; number++)
for(number = 0; number < sse_iters; number++)
{
aVal = _mm_lddqu_si128((__m128i*)aPtr);
accumulator = _mm_add_epi8(accumulator, aVal);
@ -81,12 +82,12 @@ static inline void volk_gnsssdr_8i_accumulator_s8i_u_sse3(char* result, const ch
}
_mm_storeu_si128((__m128i*)tempBuffer, accumulator);
for(unsigned int i = 0; i < 16; ++i)
for(i = 0; i < 16; ++i)
{
returnValue += tempBuffer[i];
}
for(unsigned int i = 0; i < (num_points % 16); ++i)
for(i = 0; i < (num_points % 16); ++i)
{
returnValue += (*aPtr++);
}
@ -102,8 +103,8 @@ static inline void volk_gnsssdr_8i_accumulator_s8i_generic(char* result, const c
{
const char* aPtr = inputBuffer;
char returnValue = 0;
for(unsigned int number = 0;number < num_points; number++)
unsigned int number;
for(number = 0;number < num_points; number++)
{
returnValue += (*aPtr++);
}
@ -119,6 +120,8 @@ static inline void volk_gnsssdr_8i_accumulator_s8i_a_sse3(char* result, const ch
{
char returnValue = 0;
const unsigned int sse_iters = num_points / 16;
unsigned int number;
unsigned int i;
const char* aPtr = inputBuffer;
@ -126,7 +129,7 @@ static inline void volk_gnsssdr_8i_accumulator_s8i_a_sse3(char* result, const ch
__m128i accumulator = _mm_setzero_si128();
__m128i aVal = _mm_setzero_si128();
for(unsigned int number = 0; number < sse_iters; number++)
for(number = 0; number < sse_iters; number++)
{
aVal = _mm_load_si128((__m128i*)aPtr);
accumulator = _mm_add_epi8(accumulator, aVal);
@ -134,12 +137,12 @@ static inline void volk_gnsssdr_8i_accumulator_s8i_a_sse3(char* result, const ch
}
_mm_store_si128((__m128i*)tempBuffer,accumulator);
for(unsigned int i = 0; i < 16; ++i)
for(i = 0; i < 16; ++i)
{
returnValue += tempBuffer[i];
}
for(unsigned int i = 0; i < (num_points % 16); ++i)
for(i = 0; i < (num_points % 16); ++i)
{
returnValue += (*aPtr++);
}

View File

@ -66,7 +66,8 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_avx(unsigned int* target, con
if(num_points > 0)
{
const unsigned int sse_iters = num_points / 32;
unsigned int number;
unsigned int i;
char* basePtr = (char*)src0;
char* inputPtr = (char*)src0;
char max = src0[0];
@ -78,7 +79,7 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_avx(unsigned int* target, con
ones = _mm256_set1_epi8(0xFF);
maxValues = _mm_set1_epi8(max);
for(unsigned int number = 0; number < sse_iters; number++)
for(number = 0; number < sse_iters; number++)
{
currentValues = _mm256_lddqu_si256((__m256i*)inputPtr);
@ -95,7 +96,7 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_avx(unsigned int* target, con
{
_mm256_storeu_si256((__m256i*)&currentValuesBuffer, currentValues);
for(unsigned int i = 0; i < 32; i++)
for(i = 0; i < 32; i++)
{
if(currentValuesBuffer[i] > max)
{
@ -109,7 +110,7 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_avx(unsigned int* target, con
inputPtr += 32;
}
for(unsigned int i = 0; i<(num_points % 32); ++i)
for(i = 0; i<(num_points % 32); ++i)
{
if(src0[i] > max)
{
@ -132,7 +133,8 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_sse4_1(unsigned int* target,
if(num_points > 0)
{
const unsigned int sse_iters = num_points / 16;
unsigned int number;
unsigned int i = 0;
char* basePtr = (char*)src0;
char* inputPtr = (char*)src0;
char max = src0[0];
@ -142,7 +144,7 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_sse4_1(unsigned int* target,
maxValues = _mm_set1_epi8(max);
for(unsigned int number = 0; number < sse_iters; number++)
for(number = 0; number < sse_iters; number++)
{
currentValues = _mm_lddqu_si128((__m128i*)inputPtr);
@ -152,7 +154,7 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_sse4_1(unsigned int* target,
{
_mm_storeu_si128((__m128i*)&currentValuesBuffer, currentValues);
for(unsigned int i = 0; i < 16; i++)
for(i = 0; i < 16; i++)
{
if(currentValuesBuffer[i] > max)
{
@ -166,7 +168,7 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_sse4_1(unsigned int* target,
inputPtr += 16;
}
for(unsigned int i = 0; i<(num_points % 16); ++i)
for(i = 0; i<(num_points % 16); ++i)
{
if(src0[i] > max)
{
@ -189,7 +191,8 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_sse2(unsigned int* target, co
if(num_points > 0)
{
const unsigned int sse_iters = num_points / 16;
unsigned int number;
unsigned int i;
char* basePtr = (char*)src0;
char* inputPtr = (char*)src0;
char max = src0[0];
@ -200,7 +203,7 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_sse2(unsigned int* target, co
maxValues = _mm_set1_epi8(max);
for(unsigned int number = 0; number < sse_iters; number++)
for(number = 0; number < sse_iters; number++)
{
currentValues = _mm_loadu_si128((__m128i*)inputPtr);
compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
@ -210,7 +213,7 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_sse2(unsigned int* target, co
{
_mm_storeu_si128((__m128i*)&currentValuesBuffer, currentValues);
mask = ~mask;
unsigned int i = 0;
i = 0;
while (mask > 0)
{
if ((mask & 1) == 1)
@ -229,7 +232,7 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_sse2(unsigned int* target, co
inputPtr += 16;
}
for(unsigned int i = 0; i<(num_points % 16); ++i)
for(i = 0; i<(num_points % 16); ++i)
{
if(src0[i] > max)
{
@ -252,8 +255,8 @@ static inline void volk_gnsssdr_8i_index_max_16u_generic(unsigned int* target, c
{
char max = src0[0];
unsigned int index = 0;
for(unsigned int i = 1; i < num_points; ++i)
unsigned int i;
for(i = 1; i < num_points; ++i)
{
if(src0[i] > max)
{
@ -276,7 +279,8 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_avx(unsigned int* target, con
if(num_points > 0)
{
const unsigned int sse_iters = num_points / 32;
unsigned int number;
unsigned int i;
char* basePtr = (char*)src0;
char* inputPtr = (char*)src0;
char max = src0[0];
@ -288,7 +292,7 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_avx(unsigned int* target, con
ones = _mm256_set1_epi8(0xFF);
maxValues = _mm_set1_epi8(max);
for(unsigned int number = 0; number < sse_iters; number++)
for(number = 0; number < sse_iters; number++)
{
currentValues = _mm256_load_si256((__m256i*)inputPtr);
@ -305,7 +309,7 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_avx(unsigned int* target, con
{
_mm256_store_si256((__m256i*)&currentValuesBuffer, currentValues);
for(unsigned int i = 0; i < 32; i++)
for(i = 0; i < 32; i++)
{
if(currentValuesBuffer[i] > max)
{
@ -319,7 +323,7 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_avx(unsigned int* target, con
inputPtr += 32;
}
for(unsigned int i = 0; i<(num_points % 32); ++i)
for(i = 0; i<(num_points % 32); ++i)
{
if(src0[i] > max)
{
@ -342,7 +346,8 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_sse4_1(unsigned int* target,
if(num_points > 0)
{
const unsigned int sse_iters = num_points / 16;
unsigned int number;
unsigned int i;
char* basePtr = (char*)src0;
char* inputPtr = (char*)src0;
char max = src0[0];
@ -352,7 +357,7 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_sse4_1(unsigned int* target,
maxValues = _mm_set1_epi8(max);
for(unsigned int number = 0; number < sse_iters; number++)
for(number = 0; number < sse_iters; number++)
{
currentValues = _mm_load_si128((__m128i*)inputPtr);
@ -362,7 +367,7 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_sse4_1(unsigned int* target,
{
_mm_store_si128((__m128i*)&currentValuesBuffer, currentValues);
for(unsigned int i = 0; i < 16; i++)
for(i = 0; i < 16; i++)
{
if(currentValuesBuffer[i] > max)
{
@ -376,7 +381,7 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_sse4_1(unsigned int* target,
inputPtr += 16;
}
for(unsigned int i = 0; i<(num_points % 16); ++i)
for(i = 0; i<(num_points % 16); ++i)
{
if(src0[i] > max)
{
@ -399,7 +404,8 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_sse2(unsigned int* target, co
if(num_points > 0)
{
const unsigned int sse_iters = num_points / 16;
unsigned int number;
unsigned int i;
char* basePtr = (char*)src0;
char* inputPtr = (char*)src0;
char max = src0[0];
@ -410,7 +416,7 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_sse2(unsigned int* target, co
maxValues = _mm_set1_epi8(max);
for(unsigned int number = 0; number < sse_iters; number++)
for(number = 0; number < sse_iters; number++)
{
currentValues = _mm_load_si128((__m128i*)inputPtr);
compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
@ -420,7 +426,7 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_sse2(unsigned int* target, co
{
_mm_store_si128((__m128i*)&currentValuesBuffer, currentValues);
mask = ~mask;
unsigned int i = 0;
i = 0;
while (mask > 0)
{
if ((mask & 1) == 1)
@ -439,7 +445,7 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_sse2(unsigned int* target, co
inputPtr += 16;
}
for(unsigned int i = 0; i<(num_points % 16); ++i)
for(i = 0; i<(num_points % 16); ++i)
{
if(src0[i] > max)
{

View File

@ -66,7 +66,8 @@ static inline void volk_gnsssdr_8i_max_s8i_u_sse4_1(char* target, const char* sr
if(num_points > 0)
{
const unsigned int sse_iters = num_points / 16;
unsigned int number;
unsigned int i;
char* inputPtr = (char*)src0;
char max = src0[0];
__VOLK_ATTR_ALIGNED(16) char maxValuesBuffer[16];
@ -74,7 +75,7 @@ static inline void volk_gnsssdr_8i_max_s8i_u_sse4_1(char* target, const char* sr
maxValues = _mm_set1_epi8(max);
for(unsigned int number = 0; number < sse_iters; number++)
for(number = 0; number < sse_iters; number++)
{
currentValues = _mm_loadu_si128((__m128i*)inputPtr);
compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
@ -84,7 +85,7 @@ static inline void volk_gnsssdr_8i_max_s8i_u_sse4_1(char* target, const char* sr
_mm_storeu_si128((__m128i*)maxValuesBuffer, maxValues);
for(unsigned int i = 0; i<16; ++i)
for(i = 0; i < 16; ++i)
{
if(maxValuesBuffer[i] > max)
{
@ -92,7 +93,7 @@ static inline void volk_gnsssdr_8i_max_s8i_u_sse4_1(char* target, const char* sr
}
}
for(unsigned int i = sse_iters * 16; i< num_points; ++i)
for(i = sse_iters * 16; i < num_points; ++i)
{
if(src0[i] > max)
{
@ -114,7 +115,8 @@ static inline void volk_gnsssdr_8i_max_s8i_u_sse2(char* target, const char* src0
if(num_points > 0)
{
const unsigned int sse_iters = num_points / 16;
unsigned int number;
unsigned int i;
char* inputPtr = (char*)src0;
char max = src0[0];
unsigned short mask;
@ -123,7 +125,7 @@ static inline void volk_gnsssdr_8i_max_s8i_u_sse2(char* target, const char* src0
maxValues = _mm_set1_epi8(max);
for(unsigned int number = 0; number < sse_iters; number++)
for(number = 0; number < sse_iters; number++)
{
currentValues = _mm_loadu_si128((__m128i*)inputPtr);
compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
@ -133,7 +135,7 @@ static inline void volk_gnsssdr_8i_max_s8i_u_sse2(char* target, const char* src0
{
_mm_storeu_si128((__m128i*)&currentValuesBuffer, currentValues);
mask = ~mask;
int i = 0;
i = 0;
while (mask > 0)
{
if ((mask & 1) == 1)
@ -151,7 +153,7 @@ static inline void volk_gnsssdr_8i_max_s8i_u_sse2(char* target, const char* src0
inputPtr += 16;
}
for(unsigned int i = sse_iters * 16; i < num_points; ++i)
for(i = sse_iters * 16; i < num_points; ++i)
{
if(src0[i] > max)
{
@ -172,8 +174,8 @@ static inline void volk_gnsssdr_8i_max_s8i_generic(char* target, const char* src
if(num_points > 0)
{
char max = src0[0];
for(unsigned int i = 1; i < num_points; ++i)
unsigned int i;
for(i = 1; i < num_points; ++i)
{
if(src0[i] > max)
{
@ -195,7 +197,8 @@ static inline void volk_gnsssdr_8i_max_s8i_a_sse4_1(char* target, const char* sr
if(num_points > 0)
{
const unsigned int sse_iters = num_points / 16;
unsigned int number;
unsigned int i;
char* inputPtr = (char*)src0;
char max = src0[0];
__VOLK_ATTR_ALIGNED(16) char maxValuesBuffer[16];
@ -203,7 +206,7 @@ static inline void volk_gnsssdr_8i_max_s8i_a_sse4_1(char* target, const char* sr
maxValues = _mm_set1_epi8(max);
for(unsigned int number = 0; number < sse_iters; number++)
for(number = 0; number < sse_iters; number++)
{
currentValues = _mm_load_si128((__m128i*)inputPtr);
compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
@ -213,7 +216,7 @@ static inline void volk_gnsssdr_8i_max_s8i_a_sse4_1(char* target, const char* sr
_mm_store_si128((__m128i*)maxValuesBuffer, maxValues);
for(unsigned int i = 0; i<16; ++i)
for(i = 0; i < 16; ++i)
{
if(maxValuesBuffer[i] > max)
{
@ -221,7 +224,7 @@ static inline void volk_gnsssdr_8i_max_s8i_a_sse4_1(char* target, const char* sr
}
}
for(unsigned int i = sse_iters * 16; i < num_points; ++i)
for(i = sse_iters * 16; i < num_points; ++i)
{
if(src0[i] > max)
{
@ -243,7 +246,8 @@ static inline void volk_gnsssdr_8i_max_s8i_a_sse2(char* target, const char* src0
if(num_points > 0)
{
const unsigned int sse_iters = num_points / 16;
unsigned int number;
unsigned int i;
char* inputPtr = (char*)src0;
char max = src0[0];
unsigned short mask;
@ -252,7 +256,7 @@ static inline void volk_gnsssdr_8i_max_s8i_a_sse2(char* target, const char* src0
maxValues = _mm_set1_epi8(max);
for(unsigned int number = 0; number < sse_iters; number++)
for(number = 0; number < sse_iters; number++)
{
currentValues = _mm_load_si128((__m128i*)inputPtr);
compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
@ -280,7 +284,7 @@ static inline void volk_gnsssdr_8i_max_s8i_a_sse2(char* target, const char* src0
inputPtr += 16;
}
for(unsigned int i = sse_iters * 16; i < num_points; ++i)
for(i = sse_iters * 16; i < num_points; ++i)
{
if(src0[i] > max)
{

View File

@ -64,14 +64,15 @@
static inline void volk_gnsssdr_8i_x2_add_8i_u_sse2(char* cVector, const char* aVector, const char* bVector, unsigned int num_points)
{
const unsigned int sse_iters = num_points / 16;
unsigned int number;
unsigned int i;
char* cPtr = cVector;
const char* aPtr = aVector;
const char* bPtr = bVector;
__m128i aVal, bVal, cVal;
for(unsigned int number = 0; number < sse_iters; number++)
for(number = 0; number < sse_iters; number++)
{
aVal = _mm_loadu_si128((__m128i*)aPtr);
bVal = _mm_loadu_si128((__m128i*)bPtr);
@ -85,7 +86,7 @@ static inline void volk_gnsssdr_8i_x2_add_8i_u_sse2(char* cVector, const char* a
cPtr += 16;
}
for(unsigned int i = sse_iters * 16; i < num_points; ++i)
for(i = sse_iters * 16; i < num_points; ++i)
{
*cPtr++ = (*aPtr++) + (*bPtr++);
}
@ -100,7 +101,7 @@ static inline void volk_gnsssdr_8i_x2_add_8i_generic(char* cVector, const char*
char* cPtr = cVector;
const char* aPtr = aVector;
const char* bPtr = bVector;
unsigned int number = 0;
unsigned int number;
for(number = 0; number < num_points; number++)
{
@ -116,14 +117,15 @@ static inline void volk_gnsssdr_8i_x2_add_8i_generic(char* cVector, const char*
static inline void volk_gnsssdr_8i_x2_add_8i_a_sse2(char* cVector, const char* aVector, const char* bVector, unsigned int num_points)
{
const unsigned int sse_iters = num_points / 16;
unsigned int number;
unsigned int i;
char* cPtr = cVector;
const char* aPtr = aVector;
const char* bPtr = bVector;
__m128i aVal, bVal, cVal;
for(unsigned int number = 0; number < sse_iters; number++)
for(number = 0; number < sse_iters; number++)
{
aVal = _mm_load_si128((__m128i*)aPtr);
bVal = _mm_load_si128((__m128i*)bPtr);
@ -137,7 +139,7 @@ static inline void volk_gnsssdr_8i_x2_add_8i_a_sse2(char* cVector, const char* a
cPtr += 16;
}
for(unsigned int i = sse_iters * 16; i < num_points; ++i)
for(i = sse_iters * 16; i < num_points; ++i)
{
*cPtr++ = (*aPtr++) + (*bPtr++);
}

View File

@ -65,7 +65,7 @@
static inline void volk_gnsssdr_8ic_conjugate_8ic_u_avx(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points)
{
const unsigned int sse_iters = num_points / 16;
unsigned int i;
lv_8sc_t* c = cVector;
const lv_8sc_t* a = aVector;
@ -74,7 +74,7 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_u_avx(lv_8sc_t* cVector, const
__m256 conjugator1 = _mm256_castsi256_ps(_mm256_setr_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255));
__m128i conjugator2 = _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1);
for (unsigned int i = 0; i < sse_iters; ++i)
for (i = 0; i < sse_iters; ++i)
{
tmp = _mm256_loadu_ps((float*)a);
tmp = _mm256_xor_ps(tmp, conjugator1);
@ -90,7 +90,7 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_u_avx(lv_8sc_t* cVector, const
c += 16;
}
for (unsigned int i = sse_iters * 16; i < num_points; ++i)
for (i = sse_iters * 16; i < num_points; ++i)
{
*c++ = lv_conj(*a++);
}
@ -104,14 +104,14 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_u_avx(lv_8sc_t* cVector, const
static inline void volk_gnsssdr_8ic_conjugate_8ic_u_ssse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points)
{
const unsigned int sse_iters = num_points / 8;
unsigned int i;
lv_8sc_t* c = cVector;
const lv_8sc_t* a = aVector;
__m128i tmp;
__m128i conjugator = _mm_setr_epi8(1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1);
for (unsigned int i = 0; i < sse_iters; ++i)
for (i = 0; i < sse_iters; ++i)
{
tmp = _mm_lddqu_si128((__m128i*)a);
tmp = _mm_sign_epi8(tmp, conjugator);
@ -120,7 +120,7 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_u_ssse3(lv_8sc_t* cVector, con
c += 8;
}
for (unsigned int i = sse_iters * 8; i < num_points; ++i)
for (i = sse_iters * 8; i < num_points; ++i)
{
*c++ = lv_conj(*a++);
}
@ -135,7 +135,7 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_u_ssse3(lv_8sc_t* cVector, con
static inline void volk_gnsssdr_8ic_conjugate_8ic_u_sse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points)
{
const unsigned int sse_iters = num_points / 8;
unsigned int i;
lv_8sc_t* c = cVector;
const lv_8sc_t* a = aVector;
__m128i tmp;
@ -143,7 +143,7 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_u_sse3(lv_8sc_t* cVector, cons
__m128i conjugator1 = _mm_setr_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
__m128i conjugator2 = _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1);
for (unsigned int i = 0; i < sse_iters; ++i)
for (i = 0; i < sse_iters; ++i)
{
tmp = _mm_lddqu_si128((__m128i*)a);
tmp = _mm_xor_si128(tmp, conjugator1);
@ -153,7 +153,7 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_u_sse3(lv_8sc_t* cVector, cons
c += 8;
}
for (unsigned int i = sse_iters * 8; i < num_points; ++i)
for (i = sse_iters * 8; i < num_points; ++i)
{
*c++ = lv_conj(*a++);
}
@ -168,7 +168,7 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_generic(lv_8sc_t* cVector, con
{
lv_8sc_t* cPtr = cVector;
const lv_8sc_t* aPtr = aVector;
unsigned int number = 0;
unsigned int number;
for(number = 0; number < num_points; number++)
{
@ -184,7 +184,7 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_generic(lv_8sc_t* cVector, con
static inline void volk_gnsssdr_8ic_conjugate_8ic_a_avx(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points)
{
const unsigned int sse_iters = num_points / 16;
unsigned int i;
lv_8sc_t* c = cVector;
const lv_8sc_t* a = aVector;
@ -193,7 +193,7 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_a_avx(lv_8sc_t* cVector, const
__m256 conjugator1 = _mm256_castsi256_ps(_mm256_setr_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255));
__m128i conjugator2 = _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1);
for (unsigned int i = 0; i < sse_iters; ++i)
for (i = 0; i < sse_iters; ++i)
{
tmp = _mm256_load_ps((float*)a);
tmp = _mm256_xor_ps(tmp, conjugator1);
@ -209,7 +209,7 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_a_avx(lv_8sc_t* cVector, const
c += 16;
}
for (unsigned int i = sse_iters * 16; i < num_points; ++i)
for (i = sse_iters * 16; i < num_points; ++i)
{
*c++ = lv_conj(*a++);
}
@ -223,14 +223,13 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_a_avx(lv_8sc_t* cVector, const
static inline void volk_gnsssdr_8ic_conjugate_8ic_a_ssse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points)
{
const unsigned int sse_iters = num_points / 8;
unsigned int i;
lv_8sc_t* c = cVector;
const lv_8sc_t* a = aVector;
__m128i tmp;
__m128i conjugator = _mm_setr_epi8(1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1);
for (unsigned int i = 0; i < sse_iters; ++i)
for (i = 0; i < sse_iters; ++i)
{
tmp = _mm_load_si128((__m128i*)a);
tmp = _mm_sign_epi8(tmp, conjugator);
@ -239,11 +238,10 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_a_ssse3(lv_8sc_t* cVector, con
c += 8;
}
for (unsigned int i = sse_iters * 8; i < num_points; ++i)
for (i = sse_iters * 8; i < num_points; ++i)
{
*c++ = lv_conj(*a++);
}
}
#endif /* LV_HAVE_SSSE3 */
@ -254,7 +252,7 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_a_ssse3(lv_8sc_t* cVector, con
static inline void volk_gnsssdr_8ic_conjugate_8ic_a_sse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points)
{
const unsigned int sse_iters = num_points / 8;
unsigned int i;
lv_8sc_t* c = cVector;
const lv_8sc_t* a = aVector;
__m128i tmp;
@ -262,7 +260,7 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_a_sse3(lv_8sc_t* cVector, cons
__m128i conjugator1 = _mm_setr_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
__m128i conjugator2 = _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1);
for (unsigned int i = 0; i < sse_iters; ++i)
for (i = 0; i < sse_iters; ++i)
{
tmp = _mm_load_si128((__m128i*)a);
tmp = _mm_xor_si128(tmp, conjugator1);
@ -272,7 +270,7 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_a_sse3(lv_8sc_t* cVector, cons
c += 8;
}
for (unsigned int i = sse_iters * 8; i < num_points; ++i)
for (i = sse_iters * 8; i < num_points; ++i)
{
*c++ = lv_conj(*a++);
}
@ -297,12 +295,12 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_u_orc(lv_8sc_t* cVector, const
static inline void volk_gnsssdr_8ic_conjugate_8ic_neon(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points)
{
const unsigned int sse_iters = num_points / 8;
unsigned int i;
lv_8sc_t* c = cVector;
const lv_8sc_t* a = aVector;
int8x8x2_t a_val;
for (unsigned int i = 0; i < sse_iters; ++i)
for (i = 0; i < sse_iters; ++i)
{
a_val = vld2_s8((const int8_t*)a);
__builtin_prefetch(a + 16);
@ -312,7 +310,7 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_neon(lv_8sc_t* cVector, const
c += 8;
}
for (unsigned int i = sse_iters * 8; i < num_points; ++i)
for (i = sse_iters * 8; i < num_points; ++i)
{
*c++ = lv_conj(*a++);
}

View File

@ -65,7 +65,8 @@
static inline void volk_gnsssdr_8ic_magnitude_squared_8i_u_sse3(char* magnitudeVector, const lv_8sc_t* complexVector, unsigned int num_points)
{
const unsigned int sse_iters = num_points / 16;
unsigned int number;
unsigned int i;
const char* complexVectorPtr = (char*)complexVector;
char* magnitudeVectorPtr = magnitudeVector;
@ -77,7 +78,7 @@ static inline void volk_gnsssdr_8ic_magnitude_squared_8i_u_sse3(char* magnitudeV
maska = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
maskb = _mm_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
for(unsigned int number = 0;number < sse_iters; number++)
for(number = 0; number < sse_iters; number++)
{
avector = _mm_lddqu_si128((__m128i*)complexVectorPtr);
avectorlo = _mm_unpacklo_epi8 (avector, zero);
@ -104,7 +105,7 @@ static inline void volk_gnsssdr_8ic_magnitude_squared_8i_u_sse3(char* magnitudeV
magnitudeVectorPtr += 16;
}
for (unsigned int i = sse_iters * 16; i < num_points; ++i)
for (i = sse_iters * 16; i < num_points; ++i)
{
const char valReal = *complexVectorPtr++;
const char valImag = *complexVectorPtr++;
@ -160,8 +161,8 @@ static inline void volk_gnsssdr_8ic_magnitude_squared_8i_generic(char* magnitude
{
const char* complexVectorPtr = (char*)complexVector;
char* magnitudeVectorPtr = magnitudeVector;
for(unsigned int number = 0; number < num_points; number++)
unsigned int number;
for(number = 0; number < num_points; number++)
{
const char real = *complexVectorPtr++;
const char imag = *complexVectorPtr++;
@ -180,6 +181,8 @@ static inline void volk_gnsssdr_8ic_magnitude_squared_8i_a_sse3(char* magnitudeV
const char* complexVectorPtr = (char*)complexVector;
char* magnitudeVectorPtr = magnitudeVector;
unsigned int number;
unsigned int i;
__m128i zero, result8;
__m128i avector, avectorhi, avectorlo, avectorlomult, avectorhimult, aadded, maska;
@ -189,7 +192,7 @@ static inline void volk_gnsssdr_8ic_magnitude_squared_8i_a_sse3(char* magnitudeV
maska = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
maskb = _mm_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
for(unsigned int number = 0;number < sse_iters; number++)
for(number = 0; number < sse_iters; number++)
{
avector = _mm_load_si128((__m128i*)complexVectorPtr);
avectorlo = _mm_unpacklo_epi8 (avector, zero);
@ -216,7 +219,7 @@ static inline void volk_gnsssdr_8ic_magnitude_squared_8i_a_sse3(char* magnitudeV
magnitudeVectorPtr += 16;
}
for (unsigned int i = sse_iters * 16; i < num_points; ++i)
for (i = sse_iters * 16; i < num_points; ++i)
{
const char valReal = *complexVectorPtr++;
const char valImag = *complexVectorPtr++;

View File

@ -116,13 +116,14 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_sse2(lv_8sc_t* result, con
{
lv_8sc_t dotProduct;
memset(&dotProduct, 0x0, 2*sizeof(char));
unsigned int number;
unsigned int i;
const lv_8sc_t* a = in_a;
const lv_8sc_t* b = in_b;
const unsigned int sse_iters = num_points/8;
if (sse_iters>0)
if (sse_iters > 0)
{
__m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc, realcacc, imagcacc;
@ -130,7 +131,7 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_sse2(lv_8sc_t* result, con
realcacc = _mm_setzero_si128();
imagcacc = _mm_setzero_si128();
for(unsigned int number = 0; number < sse_iters; number++)
for(number = 0; number < sse_iters; number++)
{
x = _mm_loadu_si128((__m128i*)a);
y = _mm_loadu_si128((__m128i*)b);
@ -168,13 +169,13 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_sse2(lv_8sc_t* result, con
_mm_storeu_si128((__m128i*)dotProductVector, totalc); // Store the results back into the dot product vector
for (int i = 0; i < 8; ++i)
for (i = 0; i < 8; ++i)
{
dotProduct += dotProductVector[i];
}
}
for (unsigned int i = sse_iters * 8; i < num_points; ++i)
for (i = sse_iters * 8; i < num_points; ++i)
{
dotProduct += (*a++) * (*b++);
}
@ -192,13 +193,14 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_sse4_1(lv_8sc_t* result, c
{
lv_8sc_t dotProduct;
memset(&dotProduct, 0x0, 2*sizeof(char));
unsigned int number;
unsigned int i;
const lv_8sc_t* a = in_a;
const lv_8sc_t* b = in_b;
const unsigned int sse_iters = num_points/8;
if (sse_iters>0)
if (sse_iters > 0)
{
__m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc, realcacc, imagcacc;
@ -206,7 +208,7 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_sse4_1(lv_8sc_t* result, c
realcacc = _mm_setzero_si128();
imagcacc = _mm_setzero_si128();
for(unsigned int number = 0; number < sse_iters; number++)
for(number = 0; number < sse_iters; number++)
{
x = _mm_lddqu_si128((__m128i*)a);
y = _mm_lddqu_si128((__m128i*)b);
@ -242,13 +244,13 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_sse4_1(lv_8sc_t* result, c
_mm_storeu_si128((__m128i*)dotProductVector, totalc); // Store the results back into the dot product vector
for (unsigned int i = 0; i < 8; ++i)
for (i = 0; i < 8; ++i)
{
dotProduct += dotProductVector[i];
}
}
for (unsigned int i = sse_iters * 8; i < num_points; ++i)
for (i = sse_iters * 8; i < num_points; ++i)
{
dotProduct += (*a++) * (*b++);
}
@ -266,13 +268,14 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_sse2(lv_8sc_t* result, con
{
lv_8sc_t dotProduct;
memset(&dotProduct, 0x0, 2*sizeof(char));
unsigned int number;
unsigned int i;
const lv_8sc_t* a = in_a;
const lv_8sc_t* b = in_b;
const unsigned int sse_iters = num_points/8;
if (sse_iters>0)
if (sse_iters > 0)
{
__m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc, realcacc, imagcacc;
@ -280,7 +283,7 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_sse2(lv_8sc_t* result, con
realcacc = _mm_setzero_si128();
imagcacc = _mm_setzero_si128();
for(unsigned int number = 0; number < sse_iters; number++)
for(number = 0; number < sse_iters; number++)
{
x = _mm_load_si128((__m128i*)a);
y = _mm_load_si128((__m128i*)b);
@ -318,13 +321,13 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_sse2(lv_8sc_t* result, con
_mm_store_si128((__m128i*)dotProductVector, totalc); // Store the results back into the dot product vector
for (unsigned int i = 0; i < 8; ++i)
for (i = 0; i < 8; ++i)
{
dotProduct += dotProductVector[i];
}
}
for (unsigned int i = sse_iters * 8; i < num_points; ++i)
for (i = sse_iters * 8; i < num_points; ++i)
{
dotProduct += (*a++) * (*b++);
}
@ -341,7 +344,8 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_sse4_1(lv_8sc_t* result, c
{
lv_8sc_t dotProduct;
memset(&dotProduct, 0x0, 2*sizeof(char));
unsigned int number;
unsigned int i;
const lv_8sc_t* a = in_a;
const lv_8sc_t* b = in_b;
@ -355,7 +359,7 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_sse4_1(lv_8sc_t* result, c
realcacc = _mm_setzero_si128();
imagcacc = _mm_setzero_si128();
for(unsigned int number = 0; number < sse_iters; number++)
for(number = 0; number < sse_iters; number++)
{
x = _mm_load_si128((__m128i*)a);
y = _mm_load_si128((__m128i*)b);
@ -391,13 +395,13 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_sse4_1(lv_8sc_t* result, c
_mm_store_si128((__m128i*)dotProductVector, totalc); // Store the results back into the dot product vector
for (unsigned int i = 0; i < 8; ++i)
for (i = 0; i < 8; ++i)
{
dotProduct += dotProductVector[i];
}
}
for (unsigned int i = sse_iters * 8; i < num_points; ++i)
for (i = sse_iters * 8; i < num_points; ++i)
{
dotProduct += (*a++) * (*b++);
}

View File

@ -66,7 +66,8 @@
static inline void volk_gnsssdr_8ic_x2_multiply_8ic_u_sse2(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points)
{
const unsigned int sse_iters = num_points / 8;
unsigned int number;
unsigned int i;
__m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc;
lv_8sc_t* c = cVector;
const lv_8sc_t* a = aVector;
@ -74,7 +75,7 @@ static inline void volk_gnsssdr_8ic_x2_multiply_8ic_u_sse2(lv_8sc_t* cVector, co
mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
for(unsigned int number = 0; number < sse_iters; number++)
for(number = 0; number < sse_iters; number++)
{
x = _mm_loadu_si128((__m128i*)a);
y = _mm_loadu_si128((__m128i*)b);
@ -107,7 +108,7 @@ static inline void volk_gnsssdr_8ic_x2_multiply_8ic_u_sse2(lv_8sc_t* cVector, co
c += 8;
}
for (unsigned int i = sse_iters * 8; i < num_points; ++i)
for (i = sse_iters * 8; i < num_points; ++i)
{
*c++ = (*a++) * (*b++);
}
@ -121,7 +122,8 @@ static inline void volk_gnsssdr_8ic_x2_multiply_8ic_u_sse2(lv_8sc_t* cVector, co
static inline void volk_gnsssdr_8ic_x2_multiply_8ic_u_sse4_1(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points)
{
const unsigned int sse_iters = num_points / 8;
unsigned int number;
unsigned int i;
__m128i x, y;
__m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc;
lv_8sc_t* c = cVector;
@ -131,7 +133,7 @@ static inline void volk_gnsssdr_8ic_x2_multiply_8ic_u_sse4_1(lv_8sc_t* cVector,
_mm_setzero_si128();
mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
for(unsigned int number = 0; number < sse_iters; number++)
for(number = 0; number < sse_iters; number++)
{
x = _mm_lddqu_si128((__m128i*)a);
y = _mm_lddqu_si128((__m128i*)b);
@ -162,7 +164,7 @@ static inline void volk_gnsssdr_8ic_x2_multiply_8ic_u_sse4_1(lv_8sc_t* cVector,
c += 8;
}
for (unsigned int i = sse_iters * 8; i < num_points; ++i)
for (i = sse_iters * 8; i < num_points; ++i)
{
*c++ = (*a++) * (*b++);
}
@ -177,8 +179,9 @@ static inline void volk_gnsssdr_8ic_x2_multiply_8ic_generic(lv_8sc_t* cVector, c
lv_8sc_t* cPtr = cVector;
const lv_8sc_t* aPtr = aVector;
const lv_8sc_t* bPtr = bVector;
unsigned int number;
for(unsigned int number = 0; number < num_points; number++)
for(number = 0; number < num_points; number++)
{
*cPtr++ = (*aPtr++) * (*bPtr++);
}
@ -192,7 +195,8 @@ static inline void volk_gnsssdr_8ic_x2_multiply_8ic_generic(lv_8sc_t* cVector, c
static inline void volk_gnsssdr_8ic_x2_multiply_8ic_a_sse2(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points)
{
const unsigned int sse_iters = num_points / 8;
unsigned int number;
unsigned int i;
__m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc;
lv_8sc_t* c = cVector;
const lv_8sc_t* a = aVector;
@ -200,7 +204,7 @@ static inline void volk_gnsssdr_8ic_x2_multiply_8ic_a_sse2(lv_8sc_t* cVector, co
mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
for(unsigned int number = 0; number < sse_iters; number++)
for(number = 0; number < sse_iters; number++)
{
x = _mm_load_si128((__m128i*)a);
y = _mm_load_si128((__m128i*)b);
@ -233,7 +237,7 @@ static inline void volk_gnsssdr_8ic_x2_multiply_8ic_a_sse2(lv_8sc_t* cVector, co
c += 8;
}
for (unsigned int i = sse_iters * 8; i < num_points; ++i)
for (i = sse_iters * 8; i < num_points; ++i)
{
*c++ = (*a++) * (*b++);
}
@ -247,7 +251,8 @@ static inline void volk_gnsssdr_8ic_x2_multiply_8ic_a_sse2(lv_8sc_t* cVector, co
static inline void volk_gnsssdr_8ic_x2_multiply_8ic_a_sse4_1(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points)
{
const unsigned int sse_iters = num_points / 8;
unsigned int number;
unsigned int i;
__m128i x, y;
__m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc;
lv_8sc_t* c = cVector;
@ -257,7 +262,7 @@ static inline void volk_gnsssdr_8ic_x2_multiply_8ic_a_sse4_1(lv_8sc_t* cVector,
_mm_setzero_si128();
mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
for(unsigned int number = 0; number < sse_iters; number++)
for(number = 0; number < sse_iters; number++)
{
x = _mm_load_si128((__m128i*)a);
y = _mm_load_si128((__m128i*)b);
@ -288,7 +293,7 @@ static inline void volk_gnsssdr_8ic_x2_multiply_8ic_a_sse4_1(lv_8sc_t* cVector,
c += 8;
}
for (unsigned int i = sse_iters * 8; i < num_points; ++i)
for (i = sse_iters * 8; i < num_points; ++i)
{
*c++ = (*a++) * (*b++);
}

View File

@ -65,13 +65,15 @@
static inline void volk_gnsssdr_8u_x2_multiply_8u_u_sse3(unsigned char* cChar, const unsigned char* aChar, const unsigned char* bChar, unsigned int num_points)
{
const unsigned int sse_iters = num_points / 16;
unsigned int number;
unsigned int i;
__m128i x, y, x1, x2, y1, y2, mult1, x1_mult_y1, x2_mult_y2, tmp, tmp1, tmp2, totalc;
unsigned char* c = cChar;
const unsigned char* a = aChar;
const unsigned char* b = bChar;
for(unsigned int number = 0; number < sse_iters; number++)
for(number = 0; number < sse_iters; number++)
{
x = _mm_lddqu_si128((__m128i*)a);
y = _mm_lddqu_si128((__m128i*)b);
@ -100,7 +102,7 @@ static inline void volk_gnsssdr_8u_x2_multiply_8u_u_sse3(unsigned char* cChar, c
c += 16;
}
for (unsigned int i = sse_iters * 16; i < num_points ; ++i)
for (i = sse_iters * 16; i < num_points ; ++i)
{
*c++ = (*a++) * (*b++);
}
@ -114,8 +116,9 @@ static inline void volk_gnsssdr_8u_x2_multiply_8u_generic(unsigned char* cChar,
unsigned char* cPtr = cChar;
const unsigned char* aPtr = aChar;
const unsigned char* bPtr = bChar;
unsigned int number;
for(unsigned int number = 0; number < num_points; number++)
for(number = 0; number < num_points; number++)
{
*cPtr++ = (*aPtr++) * (*bPtr++);
}
@ -129,13 +132,14 @@ static inline void volk_gnsssdr_8u_x2_multiply_8u_generic(unsigned char* cChar,
static inline void volk_gnsssdr_8u_x2_multiply_8u_a_sse3(unsigned char* cChar, const unsigned char* aChar, const unsigned char* bChar, unsigned int num_points)
{
const unsigned int sse_iters = num_points / 16;
unsigned int number;
unsigned int i;
__m128i x, y, x1, x2, y1, y2, mult1, x1_mult_y1, x2_mult_y2, tmp, tmp1, tmp2, totalc;
unsigned char* c = cChar;
const unsigned char* a = aChar;
const unsigned char* b = bChar;
for(unsigned int number = 0; number < sse_iters; number++)
for(number = 0; number < sse_iters; number++)
{
x = _mm_load_si128((__m128i*)a);
y = _mm_load_si128((__m128i*)b);
@ -164,7 +168,7 @@ static inline void volk_gnsssdr_8u_x2_multiply_8u_a_sse3(unsigned char* cChar, c
c += 16;
}
for (unsigned int i = sse_iters * 16; i < num_points; ++i)
for (i = sse_iters * 16; i < num_points; ++i)
{
*c++ = (*a++) * (*b++);
}

View File

@ -385,7 +385,8 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_u_sse2(lv_32fc_t* out, const fl
static inline void volk_gnsssdr_s32f_sincos_32fc_generic(lv_32fc_t* out, const float phase_inc, float* phase, unsigned int num_points)
{
float _phase = (*phase);
for(unsigned int i = 0; i < num_points; i++)
unsigned int i;
for(i = 0; i < num_points; i++)
{
*out++ = lv_cmake((float)cos(_phase), (float)sin(_phase) );
_phase += phase_inc;
@ -402,6 +403,7 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_generic(lv_32fc_t* out, const f
static inline void volk_gnsssdr_s32f_sincos_32fc_generic_fxpt(lv_32fc_t* out, const float phase_inc, float* phase, unsigned int num_points)
{
float _in, s, c;
unsigned int i;
int32_t x, sin_index, cos_index, d;
const float PI = 3.14159265358979323846;
const float TWO_TO_THE_31_DIV_PI = 2147483648.0 / PI;
@ -411,7 +413,7 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_generic_fxpt(lv_32fc_t* out, co
const int32_t diffbits = bitlength - Nbits;
uint32_t ux;
float _phase = (*phase);
for(unsigned int i = 0; i < num_points; i++)
for(i = 0; i < num_points; i++)
{
_in = _phase;
d = (int32_t)floor(_in / TWO_PI + 0.5);