/*
 * Reviewer notes for the unified diff below (the diff text itself is left
 * untouched; its line breaks have been collapsed, '-' marks removed lines
 * and '+' marks added lines).
 *
 * Target file:
 *   src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/
 *   volk_gnsssdr_16ic_x2_dot_prod_16ic.h   (index af102dba0..1dac7d877)
 *
 * Hunk "-40,7 +40,6": removes one commented-out #include line.
 *
 * Hunk "-56,11 +55,9" (generic kernel): the per-sample update changes.
 *   old: result[0] += lv_cmake(sat_adds16i(re(tmp), re(tmp)),
 *                              sat_adds16i(im(tmp), im(tmp)));
 *        -> tmp is added to itself through sat_adds16i, and the running sum
 *           is then updated with a plain "+=".
 *   new: result[0]  = lv_cmake(sat_adds16i(re(result[0]), re(tmp)),
 *                              sat_adds16i(im(result[0]), im(tmp)));
 *        -> the running sum and the new product are combined through
 *           sat_adds16i.
 *   The hunk also deletes a formula comment and a commented-out printf.
 *
 * Hunk "-70,74 +67,73" (a_sse2 kernel):
 *   - Most '-'/'+' pairs carry the same visible text, so they are presumably
 *     whitespace-only re-indentation (indentation is not recoverable here).
 *   - The reduction over the 4 entries of dotProductVector and the scalar
 *     tail loop receive the same accumulate change as the generic kernel:
 *     "dotProduct += lv_cmake(sat_adds16i(x, x), ...)" becomes
 *     "dotProduct = lv_cmake(sat_adds16i(dotProduct, x), ...)".
 *   - The tail loop header changes from "i = sse_iters * 4; i < num_points"
 *     to "i = 0; i < (num_points % 4)". With sse_iters = num_points / 4 both
 *     forms run the same number of iterations; i is not used in the body.
 *
 * NOTE(review): sat_adds16i, lv_cmake, lv_creal, lv_cimag and lv_16sc_t are
 *   not defined in this diff; sat_adds16i is presumably a saturating 16-bit
 *   add -- confirm against its definition.
 * NOTE(review): the products come from _mm_mullo_epi16 while the sums use
 *   _mm_adds_epi16 / _mm_subs_epi16; presumably only the additions saturate
 *   and the 16-bit products wrap -- confirm this is intended.
 * NOTE(review): _mm_set_epi8 is called with the literal 255 for the mask
 *   bytes; if its parameters are (signed) char this relies on an
 *   implementation-defined conversion -- consider (char)0xFF or -1.
 * NOTE(review): the kernel is named "a_sse2" yet uses _mm_loadu_si128 and
 *   _mm_storeu_si128; verify whether the aligned variants were intended.
 */
diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_dot_prod_16ic.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_dot_prod_16ic.h index af102dba0..1dac7d877 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_dot_prod_16ic.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_dot_prod_16ic.h @@ -40,7 +40,6 @@ #include #include #include -//#include #ifdef LV_HAVE_GENERIC @@ -56,11 +55,9 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_generic(lv_16sc_t* result, result[0] = lv_cmake((int16_t)0, (int16_t)0); for (unsigned int n = 0; n < num_points; n++) { - //r*a.r - i*a.i, i*a.r + r*a.i lv_16sc_t tmp = in_a[n] * in_b[n]; - result[0] += lv_cmake(sat_adds16i(lv_creal(tmp), lv_creal(tmp)), sat_adds16i(lv_cimag(tmp), lv_cimag(tmp) )); + result[0] = lv_cmake(sat_adds16i(lv_creal(result[0]), lv_creal(tmp)), sat_adds16i(lv_cimag(result[0]), lv_cimag(tmp) )); } - //printf("generic result = %i,%i", lv_creal(result[0]),lv_cimag(result[0])); } #endif /*LV_HAVE_GENERIC*/ @@ -70,74 +67,73 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_generic(lv_16sc_t* result, #include static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_a_sse2(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points) { + lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0); - lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0); + const unsigned int sse_iters = num_points / 4; - const unsigned int sse_iters = num_points / 4; + const lv_16sc_t* _in_a = in_a; + const lv_16sc_t* _in_b = in_b; + lv_16sc_t* _out = out; - const lv_16sc_t* _in_a = in_a; - const lv_16sc_t* _in_b = in_b; - lv_16sc_t* _out = out; + if (sse_iters > 0) + { + __m128i a,b,c, c_sr, mask_imag, mask_real, real, imag, imag1,imag2, b_sl, a_sl, realcacc, imagcacc, result; - if
(sse_iters > 0) - { - __m128i a,b,c, c_sr, mask_imag, mask_real, real, imag, imag1,imag2, b_sl, a_sl, realcacc, imagcacc, result; + realcacc = _mm_setzero_si128(); + imagcacc = _mm_setzero_si128(); - realcacc = _mm_setzero_si128(); - imagcacc = _mm_setzero_si128(); + mask_imag = _mm_set_epi8(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0); + mask_real = _mm_set_epi8(0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255); - mask_imag = _mm_set_epi8(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0); - mask_real = _mm_set_epi8(0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255); + for(unsigned int number = 0; number < sse_iters; number++) + { + //std::complex memory structure: real part -> reinterpret_cast(a)[2*i] + //imaginery part -> reinterpret_cast(a)[2*i + 1] + // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r] + a = _mm_loadu_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg + b = _mm_loadu_si128((__m128i*)_in_b); + c = _mm_mullo_epi16 (a, b); // a3.i*b3.i, a3.r*b3.r, .... - for(unsigned int number = 0; number < sse_iters; number++) - { - //std::complex memory structure: real part -> reinterpret_cast(a)[2*i] - //imaginery part -> reinterpret_cast(a)[2*i + 1] - // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r] - a = _mm_loadu_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg - b = _mm_loadu_si128((__m128i*)_in_b); - c = _mm_mullo_epi16 (a, b); // a3.i*b3.i, a3.r*b3.r, .... + c_sr = _mm_srli_si128 (c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. + real = _mm_subs_epi16 (c,c_sr); - c_sr = _mm_srli_si128 (c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. - real = _mm_subs_epi16 (c,c_sr); + b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i .... + a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i .... - b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
- a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i .... + imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... + imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... - imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... - imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... + imag = _mm_adds_epi16(imag1, imag2); //with saturation aritmetic! - imag = _mm_adds_epi16(imag1, imag2); //with saturation aritmetic! + realcacc = _mm_adds_epi16 (realcacc, real); + imagcacc = _mm_adds_epi16 (imagcacc, imag); - realcacc = _mm_adds_epi16 (realcacc, real); - imagcacc = _mm_adds_epi16 (imagcacc, imag); + _in_a += 4; + _in_b += 4; + } - _in_a += 4; - _in_b += 4; - } + realcacc = _mm_and_si128 (realcacc, mask_real); + imagcacc = _mm_and_si128 (imagcacc, mask_imag); - realcacc = _mm_and_si128 (realcacc, mask_real); - imagcacc = _mm_and_si128 (imagcacc, mask_imag); + result = _mm_or_si128 (realcacc, imagcacc); - result = _mm_or_si128 (realcacc, imagcacc); + __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; - __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; + _mm_storeu_si128((__m128i*)dotProductVector,result); // Store the results back into the dot product vector - _mm_storeu_si128((__m128i*)dotProductVector,result); // Store the results back into the dot product vector + for (int i = 0; i < 4; ++i) + { + dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[i])), sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i]))); + } + } - for (int i = 0; i < 4; ++i) - { - dotProduct += lv_cmake(sat_adds16i(lv_creal(dotProductVector[i]), lv_creal(dotProductVector[i])), sat_adds16i(lv_cimag(dotProductVector[i]), lv_cimag(dotProductVector[i]))); - } - } + for (unsigned int i = 0; i < (num_points % 4); ++i) + { + lv_16sc_t tmp = (*_in_a++) * (*_in_b++); + dotProduct = lv_cmake( sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)), sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp))); + } - for (unsigned int i = sse_iters * 4; i < num_points; ++i) - { -
lv_16sc_t tmp = (*_in_a++) * (*_in_b++); - dotProduct += lv_cmake( sat_adds16i(lv_creal(tmp), lv_creal(tmp)), sat_adds16i(lv_cimag(tmp), lv_cimag(tmp))); - } - - *_out = dotProduct; + *_out = dotProduct; } #endif /* LV_HAVE_SSE2 */