From 497c856437fdbd9402e6410f9c2265276e64922a Mon Sep 17 00:00:00 2001 From: Carles Fernandez Date: Wed, 20 Jan 2016 18:38:33 +0100 Subject: [PATCH] add unaligned version --- .../volk_gnsssdr_16ic_x2_multiply_16ic.h | 51 +++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_multiply_16ic.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_multiply_16ic.h index 1da3ccbfa..0c6374aea 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_multiply_16ic.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_multiply_16ic.h @@ -69,6 +69,57 @@ static inline void volk_gnsssdr_16ic_x2_multiply_16ic_a_sse2(lv_16sc_t* out, con mask_imag = _mm_set_epi8(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0); mask_real = _mm_set_epi8(0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255); + const lv_16sc_t* _in_a = in_a; + const lv_16sc_t* _in_b = in_b; + lv_16sc_t* _out = out; + for(unsigned int number = 0; number < sse_iters; number++) + { + //std::complex<T> memory structure: real part -> reinterpret_cast<cv T*>(a)[2*i] + //imaginary part -> reinterpret_cast<cv T*>(a)[2*i + 1] + // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r] + a = _mm_load_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg + b = _mm_load_si128((__m128i*)_in_b); + c = _mm_mullo_epi16 (a, b); // a3.i*b3.i, a3.r*b3.r, .... + + c_sr = _mm_srli_si128 (c, 2); // Shift c right by 2 bytes while shifting in zeros, and store the result in c_sr. + real = _mm_subs_epi16 (c, c_sr); + real = _mm_and_si128 (real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i + + b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i .... + a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i .... + + imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... 
+ imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... + + imag = _mm_adds_epi16(imag1, imag2); + imag = _mm_and_si128 (imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ... + + result = _mm_or_si128 (real, imag); + + _mm_store_si128((__m128i*)_out, result); + + _in_a += 4; + _in_b += 4; + _out += 4; + } + + for (unsigned int i = sse_iters * 4; i < num_points; ++i) + { + *_out++ = (*_in_a++) * (*_in_b++); + } +} +#endif /* LV_HAVE_SSE2 */ + +#ifdef LV_HAVE_SSE2 +#include +static inline void volk_gnsssdr_16ic_x2_multiply_16ic_u_sse2(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points) +{ + const unsigned int sse_iters = num_points / 4; + __m128i a,b,c, c_sr, mask_imag, mask_real, real, imag, imag1,imag2, b_sl, a_sl, result; + + mask_imag = _mm_set_epi8(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0); + mask_real = _mm_set_epi8(0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255); + const lv_16sc_t* _in_a = in_a; const lv_16sc_t* _in_b = in_b; lv_16sc_t* _out = out;