From e92f409897bdfb7c242360b32e8996584c6ab884 Mon Sep 17 00:00:00 2001 From: Javier Arribas Date: Wed, 20 Jan 2016 15:53:09 +0100 Subject: [PATCH] Added SSE2 unaligned versions of volk_gnss-sdr dot product and resampler kernels. --- .../volk_gnsssdr_16ic_resampler_16ic.h | 109 ++++++++++++++---- .../volk_gnsssdr_16ic_resamplerpuppet_16ic.h | 55 +++------ ...volk_gnsssdr_16ic_resamplerxnpuppet_16ic.h | 38 +++++- .../volk_gnsssdr_16ic_x2_dot_prod_16ic.h | 83 ++++++++++++- .../volk_gnsssdr_16ic_x2_dot_prod_16ic_xn.h | 101 +++++++++++++++- ...olk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic.h | 30 ++++- .../volk_gnsssdr_16ic_x2_multiply_16ic.h | 6 +- .../volk_gnsssdr_16ic_xn_resampler_16ic_xn.h | 105 ++++++++++++++++- 8 files changed, 444 insertions(+), 83 deletions(-) diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_resampler_16ic.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_resampler_16ic.h index 57f45b2f1..0f3a8160a 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_resampler_16ic.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_resampler_16ic.h @@ -33,8 +33,8 @@ * ------------------------------------------------------------------------- */ -#ifndef INCLUDED_volk_gnsssdr_16ic_resampler_16ic_a_H -#define INCLUDED_volk_gnsssdr_16ic_resampler_16ic_a_H +#ifndef INCLUDED_volk_gnsssdr_16ic_resampler_16ic_H +#define INCLUDED_volk_gnsssdr_16ic_resampler_16ic_H #include #include @@ -64,10 +64,8 @@ static inline void volk_gnsssdr_16ic_resampler_16ic_generic(lv_16sc_t* result, c local_code_chip_index = round(code_phase_step_chips * (float)n + rem_code_phase_chips - 0.5f); if (local_code_chip_index < 0.0) local_code_chip_index += code_length_chips; if (local_code_chip_index > (code_length_chips-1)) local_code_chip_index -= code_length_chips; - //std::cout<<"g["<(n) + rem_code_phase_chips-0.5f<<","< -static inline void volk_gnsssdr_16ic_resampler_16ic_sse2(lv_16sc_t* result, const lv_16sc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, int code_length_chips, unsigned int num_output_samples)//, int* scratch_buffer, float* scratch_buffer_float) +static inline void volk_gnsssdr_16ic_resampler_16ic_a_sse2(lv_16sc_t* result, const lv_16sc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, int code_length_chips, unsigned int num_output_samples)//, int* scratch_buffer, float* scratch_buffer_float) +{ + _MM_SET_ROUNDING_MODE (_MM_ROUND_NEAREST);//_MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO + unsigned int number; + const unsigned int quarterPoints = num_output_samples / 4; + + lv_16sc_t* _result = result; + + __attribute__((aligned(16))) int local_code_chip_index[4]; + __m128 _rem_code_phase, _code_phase_step_chips; + __m128i _code_length_chips, _code_length_chips_minus1; + __m128 _code_phase_out, _code_phase_out_with_offset; + rem_code_phase_chips = rem_code_phase_chips - 0.5f; + + _rem_code_phase = _mm_load1_ps(&rem_code_phase_chips); //load float to all four float values in m128 register + _code_phase_step_chips = _mm_load1_ps(&code_phase_step_chips); //load float to all four float values in m128 register + __attribute__((aligned(16))) int four_times_code_length_chips_minus1[4]; + four_times_code_length_chips_minus1[0] = code_length_chips-1; + four_times_code_length_chips_minus1[1] = code_length_chips-1; + four_times_code_length_chips_minus1[2] = code_length_chips-1; + four_times_code_length_chips_minus1[3] = code_length_chips-1; + + __attribute__((aligned(16))) int four_times_code_length_chips[4]; + four_times_code_length_chips[0] = code_length_chips; + four_times_code_length_chips[1] = code_length_chips; + four_times_code_length_chips[2] = code_length_chips; + four_times_code_length_chips[3] = code_length_chips; + + _code_length_chips = _mm_load_si128((__m128i*)&four_times_code_length_chips); //load float to all four float values in m128 register + _code_length_chips_minus1 = _mm_load_si128((__m128i*)&four_times_code_length_chips_minus1); //load float to all four float values in m128 register + + __m128i negative_indexes, overflow_indexes, _code_phase_out_int, _code_phase_out_int_neg, _code_phase_out_int_over; + + __m128i zero = _mm_setzero_si128(); + + __attribute__((aligned(16))) float init_idx_float[4] = { 0.0f, 1.0f, 2.0f, 3.0f }; + __m128 _4output_index = _mm_load_ps(init_idx_float); + __attribute__((aligned(16))) float init_4constant_float[4] = { 4.0f, 4.0f, 4.0f, 4.0f }; + __m128 _4constant_float = _mm_load_ps(init_4constant_float); + + + for(number = 0; number < quarterPoints; number++) + { + _code_phase_out = _mm_mul_ps(_code_phase_step_chips, _4output_index); //compute the code phase point with the phase step + _code_phase_out_with_offset = _mm_add_ps(_code_phase_out, _rem_code_phase); //add the phase offset + _code_phase_out_int = _mm_cvtps_epi32(_code_phase_out_with_offset); //convert to integer + + negative_indexes = _mm_cmplt_epi32(_code_phase_out_int, zero); //test for negative values + _code_phase_out_int_neg = _mm_add_epi32(_code_phase_out_int, _code_length_chips); //the negative values branch + _code_phase_out_int_neg = _mm_xor_si128(_code_phase_out_int, _mm_and_si128( negative_indexes, _mm_xor_si128( _code_phase_out_int_neg, _code_phase_out_int ))); + + overflow_indexes = _mm_cmpgt_epi32(_code_phase_out_int_neg, _code_length_chips_minus1); //test for overflow values + _code_phase_out_int_over = _mm_sub_epi32(_code_phase_out_int_neg, _code_length_chips); //the negative values branch + _code_phase_out_int_over = _mm_xor_si128(_code_phase_out_int_neg, _mm_and_si128( overflow_indexes, _mm_xor_si128( _code_phase_out_int_over, _code_phase_out_int_neg ))); + + _mm_store_si128((__m128i*)local_code_chip_index, _code_phase_out_int_over); // Store the results back + + //todo: optimize the local code lookup table with intrinsics, if possible + *_result++ = local_code[local_code_chip_index[0]]; + *_result++ = local_code[local_code_chip_index[1]]; + *_result++ = local_code[local_code_chip_index[2]]; + *_result++ = local_code[local_code_chip_index[3]]; + + _4output_index = _mm_add_ps(_4output_index, _4constant_float); + + } + + for(number = quarterPoints * 4; number < num_output_samples; number++) + { + local_code_chip_index[0] = (int)(code_phase_step_chips * (float)number + rem_code_phase_chips + 0.5f); + if (local_code_chip_index[0] < 0.0) local_code_chip_index[0] += code_length_chips - 1; + if (local_code_chip_index[0] > (code_length_chips - 1)) local_code_chip_index[0] -= code_length_chips; + *_result++ = local_code[local_code_chip_index[0]]; + } + +} + +#endif /* LV_HAVE_SSE2 */ + +#ifdef LV_HAVE_SSE2 +#include + +static inline void volk_gnsssdr_16ic_resampler_16ic_u_sse2(lv_16sc_t* result, const lv_16sc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, int code_length_chips, unsigned int num_output_samples)//, int* scratch_buffer, float* scratch_buffer_float) { _MM_SET_ROUNDING_MODE (_MM_ROUND_NEAREST);//_MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO unsigned int number; @@ -111,11 +191,9 @@ static inline void volk_gnsssdr_16ic_resampler_16ic_sse2(lv_16sc_t* result, cons __m128i zero = _mm_setzero_si128(); __attribute__((aligned(16))) float init_idx_float[4] = { 0.0f, 1.0f, 2.0f, 3.0f }; - __m128 _4output_index = _mm_load_ps(init_idx_float); + __m128 _4output_index = _mm_loadu_ps(init_idx_float); __attribute__((aligned(16))) float init_4constant_float[4] = { 4.0f, 4.0f, 4.0f, 4.0f }; - __m128 _4constant_float = _mm_load_ps(init_4constant_float); - - //__attribute__((aligned(16))) int output_indexes[4]; + __m128 _4constant_float = _mm_loadu_ps(init_4constant_float); for(number = 0; number < quarterPoints; number++) { @@ -125,7 +203,6 @@ static inline void volk_gnsssdr_16ic_resampler_16ic_sse2(lv_16sc_t* result, cons negative_indexes = _mm_cmplt_epi32(_code_phase_out_int, zero); //test for negative values _code_phase_out_int_neg = _mm_add_epi32(_code_phase_out_int, _code_length_chips); //the negative values branch - //_code_phase_out_int_over=_mm_or_si128(_mm_and_si128(_code_phase_out_int_neg,_code_phase_out_int),_mm_andnot_si128(negative_indexes,_code_phase_out_int)); _code_phase_out_int_neg = _mm_xor_si128(_code_phase_out_int, _mm_and_si128( negative_indexes, _mm_xor_si128( _code_phase_out_int_neg, _code_phase_out_int ))); overflow_indexes = _mm_cmpgt_epi32(_code_phase_out_int_neg, _code_length_chips_minus1); //test for overflow values @@ -134,8 +211,6 @@ static inline void volk_gnsssdr_16ic_resampler_16ic_sse2(lv_16sc_t* result, cons _mm_storeu_si128((__m128i*)local_code_chip_index, _code_phase_out_int_over); // Store the results back - //_mm_store_ps((float*)_scratch_buffer_float,_code_phase_out_with_offset); - //todo: optimize the local code lookup table with intrinsics, if possible *_result++ = local_code[local_code_chip_index[0]]; *_result++ = local_code[local_code_chip_index[1]]; @@ -143,7 +218,6 @@ static inline void volk_gnsssdr_16ic_resampler_16ic_sse2(lv_16sc_t* result, cons *_result++ = local_code[local_code_chip_index[3]]; _4output_index = _mm_add_ps(_4output_index, _4constant_float); - //_scratch_buffer_float+=4; } @@ -153,17 +227,10 @@ static inline void volk_gnsssdr_16ic_resampler_16ic_sse2(lv_16sc_t* result, cons if (local_code_chip_index[0] < 0.0) local_code_chip_index[0] += code_length_chips - 1; if (local_code_chip_index[0] > (code_length_chips - 1)) local_code_chip_index[0] -= code_length_chips; *_result++ = local_code[local_code_chip_index[0]]; - //*_scratch_buffer_float++=code_phase_step_chips*static_cast(number)+rem_code_phase_chips; } - // for(unsigned int n=0;n @@ -42,7 +42,7 @@ #include #ifdef LV_HAVE_GENERIC -static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_a_generic(lv_16sc_t* result, const lv_16sc_t* local_code, unsigned int num_points) +static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_generic(lv_16sc_t* result, const lv_16sc_t* local_code, unsigned int num_points) { float code_phase_step_chips = 0.1; int code_length_chips = 1023; @@ -81,7 +81,7 @@ static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_a_sse2(lv_16sc_t* re rem_code_phase_chips[n] = -0.234; result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t)*num_points, volk_gnsssdr_get_alignment()); } - volk_gnsssdr_16ic_xn_resampler_16ic_xn_sse2(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, code_length_chips, num_out_vectors, num_points); + volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_sse2(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, code_length_chips, num_out_vectors, num_points); memcpy(result, result_aux[0], sizeof(lv_16sc_t)*num_points); volk_gnsssdr_free(rem_code_phase_chips); @@ -94,4 +94,32 @@ static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_a_sse2(lv_16sc_t* re #endif -#endif // INCLUDED_volk_gnsssdr_16ic_resamplerpuppet_16ic_a_H +#ifdef LV_HAVE_SSE2 + +static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_u_sse2(lv_16sc_t* result, const lv_16sc_t* local_code, unsigned int num_points) +{ + float code_phase_step_chips = 0.1; + int code_length_chips = 1023; + int num_out_vectors = 3; + float * rem_code_phase_chips = (float*)volk_gnsssdr_malloc(sizeof(float)* num_out_vectors, volk_gnsssdr_get_alignment()); + lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t)*num_out_vectors, volk_gnsssdr_get_alignment()); + for(unsigned int n = 0; n < num_out_vectors; n++) + { + rem_code_phase_chips[n] = -0.234; + result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t)*num_points, volk_gnsssdr_get_alignment()); + } + volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_sse2(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, code_length_chips, num_out_vectors, num_points); + + memcpy(result, result_aux[0], sizeof(lv_16sc_t)*num_points); + volk_gnsssdr_free(rem_code_phase_chips); + for(unsigned int n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } + volk_gnsssdr_free(result_aux); +} + + +#endif + +#endif // INCLUDED_volk_gnsssdr_16ic_resamplerpuppet_16ic_H diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_dot_prod_16ic.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_dot_prod_16ic.h index 1dac7d877..3cc180658 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_dot_prod_16ic.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_dot_prod_16ic.h @@ -33,8 +33,8 @@ * ------------------------------------------------------------------------- */ -#ifndef INCLUDED_volk_gnsssdr_16ic_x2_dot_prod_16ic_u_H -#define INCLUDED_volk_gnsssdr_16ic_x2_dot_prod_16ic_u_H +#ifndef INCLUDED_volk_gnsssdr_16ic_x2_dot_prod_16ic_H +#define INCLUDED_volk_gnsssdr_16ic_x2_dot_prod_16ic_H #include #include @@ -75,6 +75,83 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_a_sse2(lv_16sc_t* out, con const lv_16sc_t* _in_b = in_b; lv_16sc_t* _out = out; + if (sse_iters > 0) + { + __m128i a,b,c, c_sr, mask_imag, mask_real, real, imag, imag1,imag2, b_sl, a_sl, realcacc, imagcacc, result; + + realcacc = _mm_setzero_si128(); + imagcacc = _mm_setzero_si128(); + + mask_imag = _mm_set_epi8(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0); + mask_real = _mm_set_epi8(0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255); + + for(unsigned int number = 0; number < sse_iters; number++) + { + //std::complex memory structure: real part -> reinterpret_cast(a)[2*i] + //imaginery part -> reinterpret_cast(a)[2*i + 1] + // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r] + a = _mm_load_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg + b = _mm_load_si128((__m128i*)_in_b); + c = _mm_mullo_epi16 (a, b); // a3.i*b3.i, a3.r*b3.r, .... + + c_sr = _mm_srli_si128 (c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. + real = _mm_subs_epi16 (c,c_sr); + + b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i .... + a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i .... + + imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... + imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... + + imag = _mm_adds_epi16(imag1, imag2); //with saturation aritmetic! + + realcacc = _mm_adds_epi16 (realcacc, real); + imagcacc = _mm_adds_epi16 (imagcacc, imag); + + _in_a += 4; + _in_b += 4; + } + + realcacc = _mm_and_si128 (realcacc, mask_real); + imagcacc = _mm_and_si128 (imagcacc, mask_imag); + + result = _mm_or_si128 (realcacc, imagcacc); + + __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; + + _mm_store_si128((__m128i*)dotProductVector,result); // Store the results back into the dot product vector + + for (int i = 0; i < 4; ++i) + { + dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[i])), sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i]))); + } + } + + for (unsigned int i = 0; i < (num_points % 4); ++i) + { + lv_16sc_t tmp = (*_in_a++) * (*_in_b++); + dotProduct = lv_cmake( sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)), sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp))); + } + + *_out = dotProduct; +} + +#endif /* LV_HAVE_SSE2 */ + + +#ifdef LV_HAVE_SSE2 +#include + +static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_u_sse2(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points) +{ + lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0); + + const unsigned int sse_iters = num_points / 4; + + const lv_16sc_t* _in_a = in_a; + const lv_16sc_t* _in_b = in_b; + lv_16sc_t* _out = out; + if (sse_iters > 0) { __m128i a,b,c, c_sr, mask_imag, mask_real, real, imag, imag1,imag2, b_sl, a_sl, realcacc, imagcacc, result; @@ -137,4 +214,4 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_a_sse2(lv_16sc_t* out, con } #endif /* LV_HAVE_SSE2 */ -#endif /*INCLUDED_volk_gnsssdr_16ic_x2_dot_prod_16ic_u_H*/ +#endif /*INCLUDED_volk_gnsssdr_16ic_x2_dot_prod_16ic_H*/ diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_dot_prod_16ic_xn.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_dot_prod_16ic_xn.h index de8ed2504..0e688abf3 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_dot_prod_16ic_xn.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_dot_prod_16ic_xn.h @@ -33,8 +33,8 @@ * ------------------------------------------------------------------------- */ -#ifndef INCLUDED_volk_gnsssdr_16ic_xn_dot_prod_16ic_xn_u_H -#define INCLUDED_volk_gnsssdr_16ic_xn_dot_prod_16ic_xn_u_H +#ifndef INCLUDED_volk_gnsssdr_16ic_xn_dot_prod_16ic_xn_H +#define INCLUDED_volk_gnsssdr_16ic_xn_dot_prod_16ic_xn_H #include @@ -78,6 +78,101 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_sse2(lv_16sc_t* out, const lv_16sc_t* _in_common = in_common; lv_16sc_t* _out = out; + if (sse_iters > 0) + { + __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; + + //todo dyn mem reg + + __m128i* realcacc; + __m128i* imagcacc; + + realcacc=(__m128i*)calloc(num_a_vectors,sizeof(__m128i)); //calloc also sets memory to 0 + imagcacc=(__m128i*)calloc(num_a_vectors,sizeof(__m128i)); //calloc also sets memory to 0 + + __m128i a,b,c, c_sr, mask_imag, mask_real, real, imag, imag1,imag2, b_sl, a_sl, result; + + mask_imag = _mm_set_epi8(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0); + mask_real = _mm_set_epi8(0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255); + + for(unsigned int number = 0; number < sse_iters; number++) + { + //std::complex memory structure: real part -> reinterpret_cast(a)[2*i] + //imaginery part -> reinterpret_cast(a)[2*i + 1] + // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r] + + b = _mm_load_si128((__m128i*)_in_common); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg + for (int n_vec = 0; n_vec < num_a_vectors; n_vec++) + { + a = _mm_load_si128((__m128i*)&(_in_a[n_vec][number*4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg + + c = _mm_mullo_epi16 (a, b); // a3.i*b3.i, a3.r*b3.r, .... + + c_sr = _mm_srli_si128 (c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. + real = _mm_subs_epi16 (c, c_sr); + + b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i .... + a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i .... + + imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... + imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... + + imag = _mm_adds_epi16(imag1, imag2); + + realcacc[n_vec] = _mm_adds_epi16 (realcacc[n_vec], real); + imagcacc[n_vec] = _mm_adds_epi16 (imagcacc[n_vec], imag); + + } + _in_common += 4; + } + + for (int n_vec=0;n_vec + +static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_sse2(lv_16sc_t* out, const lv_16sc_t* in_common, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points) +{ + lv_16sc_t dotProduct = lv_cmake(0,0); + + const unsigned int sse_iters = num_points / 4; + + const lv_16sc_t** _in_a = in_a; + const lv_16sc_t* _in_common = in_common; + lv_16sc_t* _out = out; + if (sse_iters > 0) { __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; @@ -160,4 +255,4 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_sse2(lv_16sc_t* out, } #endif /* LV_HAVE_SSE2 */ -#endif /*INCLUDED_volk_gnsssdr_16ic_xn_dot_prod_16ic_xn_u_H*/ +#endif /*INCLUDED_volk_gnsssdr_16ic_xn_dot_prod_16ic_xn_H*/ diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic.h index 0a37d1376..10d372065 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic.h @@ -32,8 +32,8 @@ * ------------------------------------------------------------------------- */ -#ifndef INCLUDED_volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_a_H -#define INCLUDED_volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_a_H +#ifndef INCLUDED_volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_H +#define INCLUDED_volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_H #include "volk_gnsssdr/volk_gnsssdr_16ic_x2_dot_prod_16ic_xn.h" #include @@ -42,7 +42,7 @@ #include #ifdef LV_HAVE_GENERIC -static inline void volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_a_generic(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points) +static inline void volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_generic(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points) { int num_a_vectors = 3; lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t)*num_a_vectors, volk_gnsssdr_get_alignment()); @@ -83,6 +83,28 @@ static inline void volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_a_sse2(lv_16sc_t* r #endif // SSE2 -#endif // INCLUDED_volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_a_H +#ifdef LV_HAVE_SSE2 + +static inline void volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_u_sse2(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points) +{ + int num_a_vectors = 3; + lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t)*num_a_vectors, volk_gnsssdr_get_alignment()); + for(unsigned int n = 0; n < num_a_vectors; n++) + { + in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t)*num_points, volk_gnsssdr_get_alignment()); + memcpy(in_a[n], in, sizeof(lv_16sc_t)*num_points); + } + volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_sse2(result, local_code, (const lv_16sc_t**) in_a, num_a_vectors, num_points); + + for(unsigned int n = 0; n < num_a_vectors; n++) + { + volk_gnsssdr_free(in_a[n]); + } + volk_gnsssdr_free(in_a); +} + +#endif // SSE2 + +#endif // INCLUDED_volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_H diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_multiply_16ic.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_multiply_16ic.h index d01b07c24..1da3ccbfa 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_multiply_16ic.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_multiply_16ic.h @@ -33,8 +33,8 @@ * ------------------------------------------------------------------------- */ -#ifndef INCLUDED_volk_gnsssdr_16ic_x2_multiply_16ic_a_H -#define INCLUDED_volk_gnsssdr_16ic_x2_multiply_16ic_a_H +#ifndef INCLUDED_volk_gnsssdr_16ic_x2_multiply_16ic_H +#define INCLUDED_volk_gnsssdr_16ic_x2_multiply_16ic_H #include #include @@ -110,4 +110,4 @@ static inline void volk_gnsssdr_16ic_x2_multiply_16ic_a_sse2(lv_16sc_t* out, con } #endif /* LV_HAVE_SSE2 */ -#endif /*INCLUDED_volk_gnsssdr_16ic_x2_multiply_16ic_a_H*/ +#endif /*INCLUDED_volk_gnsssdr_16ic_x2_multiply_16ic_H*/ diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_xn_resampler_16ic_xn.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_xn_resampler_16ic_xn.h index 842972782..e1795d866 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_xn_resampler_16ic_xn.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_xn_resampler_16ic_xn.h @@ -34,8 +34,8 @@ * ------------------------------------------------------------------------- */ -#ifndef INCLUDED_volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_H -#define INCLUDED_volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_H +#ifndef INCLUDED_volk_gnsssdr_16ic_xn_resampler_16ic_xn_H +#define INCLUDED_volk_gnsssdr_16ic_xn_resampler_16ic_xn_H #include #include @@ -80,7 +80,99 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_generic(lv_16sc_t** re #ifdef LV_HAVE_SSE2 #include -static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_sse2(lv_16sc_t** result, const lv_16sc_t* local_code, float* rem_code_phase_chips ,float code_phase_step_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_output_samples) +static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_sse2(lv_16sc_t** result, const lv_16sc_t* local_code, float* rem_code_phase_chips ,float code_phase_step_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_output_samples) +{ + _MM_SET_ROUNDING_MODE (_MM_ROUND_NEAREST);//_MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO + unsigned int number; + const unsigned int quarterPoints = num_output_samples / 4; + + lv_16sc_t** _result = result; + __attribute__((aligned(16))) int local_code_chip_index[4]; + float tmp_rem_code_phase_chips; + __m128 _rem_code_phase,_code_phase_step_chips; + __m128i _code_length_chips,_code_length_chips_minus1; + __m128 _code_phase_out,_code_phase_out_with_offset; + + _code_phase_step_chips = _mm_load1_ps(&code_phase_step_chips); //load float to all four float values in m128 register + __attribute__((aligned(16))) int four_times_code_length_chips_minus1[4]; + four_times_code_length_chips_minus1[0] = code_length_chips - 1; + four_times_code_length_chips_minus1[1] = code_length_chips - 1; + four_times_code_length_chips_minus1[2] = code_length_chips - 1; + four_times_code_length_chips_minus1[3] = code_length_chips - 1; + + __attribute__((aligned(16))) int four_times_code_length_chips[4]; + four_times_code_length_chips[0] = code_length_chips; + four_times_code_length_chips[1] = code_length_chips; + four_times_code_length_chips[2] = code_length_chips; + four_times_code_length_chips[3] = code_length_chips; + + _code_length_chips = _mm_load_si128((__m128i*)&four_times_code_length_chips); //load float to all four float values in m128 register + _code_length_chips_minus1 = _mm_load_si128((__m128i*)&four_times_code_length_chips_minus1); //load float to all four float values in m128 register + + __m128i negative_indexes, overflow_indexes,_code_phase_out_int, _code_phase_out_int_neg,_code_phase_out_int_over; + + __m128i zero=_mm_setzero_si128(); + + __attribute__((aligned(16))) float init_idx_float[4] = { 0.0f, 1.0f, 2.0f, 3.0f }; + __m128 _4output_index = _mm_load_ps(init_idx_float); + __attribute__((aligned(16))) float init_4constant_float[4] = { 4.0f, 4.0f, 4.0f, 4.0f }; + __m128 _4constant_float = _mm_load_ps(init_4constant_float); + + int current_vector = 0; + int sample_idx = 0; + for(number = 0; number < quarterPoints; number++) + { + //common to all outputs + _code_phase_out = _mm_mul_ps(_code_phase_step_chips, _4output_index); //compute the code phase point with the phase step + + //output vector dependant (different code phase offset) + for(current_vector = 0; current_vector < num_out_vectors; current_vector++) + { + tmp_rem_code_phase_chips = rem_code_phase_chips[current_vector] - 0.5f; // adjust offset to perform correct rounding (chip transition at 0) + _rem_code_phase = _mm_load1_ps(&tmp_rem_code_phase_chips); //load float to all four float values in m128 register + + _code_phase_out_with_offset = _mm_add_ps(_code_phase_out, _rem_code_phase); //add the phase offset + _code_phase_out_int = _mm_cvtps_epi32(_code_phase_out_with_offset); //convert to integer + + negative_indexes = _mm_cmplt_epi32 (_code_phase_out_int, zero); //test for negative values + _code_phase_out_int_neg = _mm_add_epi32(_code_phase_out_int, _code_length_chips); //the negative values branch + _code_phase_out_int_neg = _mm_xor_si128(_code_phase_out_int, _mm_and_si128( negative_indexes,_mm_xor_si128( _code_phase_out_int_neg, _code_phase_out_int ))); + + overflow_indexes = _mm_cmpgt_epi32 (_code_phase_out_int_neg, _code_length_chips_minus1); //test for overflow values + _code_phase_out_int_over = _mm_sub_epi32(_code_phase_out_int_neg, _code_length_chips); //the negative values branch + _code_phase_out_int_over = _mm_xor_si128(_code_phase_out_int_neg, _mm_and_si128( overflow_indexes, _mm_xor_si128( _code_phase_out_int_over, _code_phase_out_int_neg ))); + + _mm_store_si128((__m128i*)local_code_chip_index, _code_phase_out_int_over); // Store the results back + + //todo: optimize the local code lookup table with intrinsics, if possible + _result[current_vector][sample_idx] = local_code[local_code_chip_index[0]]; + _result[current_vector][sample_idx + 1] = local_code[local_code_chip_index[1]]; + _result[current_vector][sample_idx + 2] = local_code[local_code_chip_index[2]]; + _result[current_vector][sample_idx + 3] = local_code[local_code_chip_index[3]]; + } + _4output_index = _mm_add_ps(_4output_index, _4constant_float); + sample_idx += 4; + } + + for(number = quarterPoints * 4; number < num_output_samples; number++) + { + for(current_vector = 0; current_vector < num_out_vectors; current_vector++) + { + local_code_chip_index[0] = (int)(code_phase_step_chips * (float)(number) + rem_code_phase_chips[current_vector]); + if (local_code_chip_index[0] < 0.0) local_code_chip_index[0] += code_length_chips - 1; + if (local_code_chip_index[0] > (code_length_chips - 1)) local_code_chip_index[0] -= code_length_chips; + _result[current_vector][number] = local_code[local_code_chip_index[0]]; + } + + } + +} +#endif /* LV_HAVE_SSE2 */ + +#ifdef LV_HAVE_SSE2 +#include + +static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_sse2(lv_16sc_t** result, const lv_16sc_t* local_code, float* rem_code_phase_chips ,float code_phase_step_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_output_samples) { _MM_SET_ROUNDING_MODE (_MM_ROUND_NEAREST);//_MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO unsigned int number; @@ -114,9 +206,9 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_sse2(lv_16sc_t** resul __m128i zero=_mm_setzero_si128(); __attribute__((aligned(16))) float init_idx_float[4] = { 0.0f, 1.0f, 2.0f, 3.0f }; - __m128 _4output_index = _mm_load_ps(init_idx_float); + __m128 _4output_index = _mm_loadu_ps(init_idx_float); __attribute__((aligned(16))) float init_4constant_float[4] = { 4.0f, 4.0f, 4.0f, 4.0f }; - __m128 _4constant_float = _mm_load_ps(init_4constant_float); + __m128 _4constant_float = _mm_loadu_ps(init_4constant_float); int current_vector = 0; int sample_idx = 0; @@ -167,6 +259,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_sse2(lv_16sc_t** resul } } + #endif /* LV_HAVE_SSE2 */ -#endif /*INCLUDED_volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_H*/ +#endif /*INCLUDED_volk_gnsssdr_16ic_xn_resampler_16ic_xn_H*/