mirror of
https://github.com/gnss-sdr/gnss-sdr
synced 2024-12-14 12:10:34 +00:00
Still some bugs to fix in 16sc dot product. All fixed now.
This commit is contained in:
parent
fd2af02aec
commit
4ba75a3fbe
@ -40,7 +40,6 @@
|
|||||||
#include <volk_gnsssdr/volk_gnsssdr_common.h>
|
#include <volk_gnsssdr/volk_gnsssdr_common.h>
|
||||||
#include <volk_gnsssdr/volk_gnsssdr_complex.h>
|
#include <volk_gnsssdr/volk_gnsssdr_complex.h>
|
||||||
#include <volk_gnsssdr/saturation_arithmetic.h>
|
#include <volk_gnsssdr/saturation_arithmetic.h>
|
||||||
//#include <stdio.h>
|
|
||||||
|
|
||||||
|
|
||||||
#ifdef LV_HAVE_GENERIC
|
#ifdef LV_HAVE_GENERIC
|
||||||
@ -56,11 +55,9 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_generic(lv_16sc_t* result,
|
|||||||
result[0] = lv_cmake((int16_t)0, (int16_t)0);
|
result[0] = lv_cmake((int16_t)0, (int16_t)0);
|
||||||
for (unsigned int n = 0; n < num_points; n++)
|
for (unsigned int n = 0; n < num_points; n++)
|
||||||
{
|
{
|
||||||
//r*a.r - i*a.i, i*a.r + r*a.i
|
|
||||||
lv_16sc_t tmp = in_a[n] * in_b[n];
|
lv_16sc_t tmp = in_a[n] * in_b[n];
|
||||||
result[0] += lv_cmake(sat_adds16i(lv_creal(tmp), lv_creal(tmp)), sat_adds16i(lv_cimag(tmp), lv_cimag(tmp) ));
|
result[0] = lv_cmake(sat_adds16i(lv_creal(result[0]), lv_creal(tmp)), sat_adds16i(lv_cimag(result[0]), lv_cimag(tmp) ));
|
||||||
}
|
}
|
||||||
//printf("generic result = %i,%i", lv_creal(result[0]),lv_cimag(result[0]));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif /*LV_HAVE_GENERIC*/
|
#endif /*LV_HAVE_GENERIC*/
|
||||||
@ -70,74 +67,73 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_generic(lv_16sc_t* result,
|
|||||||
#include <emmintrin.h>
|
#include <emmintrin.h>
|
||||||
static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_a_sse2(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points)
|
static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_a_sse2(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points)
|
||||||
{
|
{
|
||||||
|
lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0);
|
||||||
|
|
||||||
lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0);
|
const unsigned int sse_iters = num_points / 4;
|
||||||
|
|
||||||
const unsigned int sse_iters = num_points / 4;
|
const lv_16sc_t* _in_a = in_a;
|
||||||
|
const lv_16sc_t* _in_b = in_b;
|
||||||
|
lv_16sc_t* _out = out;
|
||||||
|
|
||||||
const lv_16sc_t* _in_a = in_a;
|
if (sse_iters > 0)
|
||||||
const lv_16sc_t* _in_b = in_b;
|
{
|
||||||
lv_16sc_t* _out = out;
|
__m128i a,b,c, c_sr, mask_imag, mask_real, real, imag, imag1,imag2, b_sl, a_sl, realcacc, imagcacc, result;
|
||||||
|
|
||||||
if (sse_iters > 0)
|
realcacc = _mm_setzero_si128();
|
||||||
{
|
imagcacc = _mm_setzero_si128();
|
||||||
__m128i a,b,c, c_sr, mask_imag, mask_real, real, imag, imag1,imag2, b_sl, a_sl, realcacc, imagcacc, result;
|
|
||||||
|
|
||||||
realcacc = _mm_setzero_si128();
|
mask_imag = _mm_set_epi8(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0);
|
||||||
imagcacc = _mm_setzero_si128();
|
mask_real = _mm_set_epi8(0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255);
|
||||||
|
|
||||||
mask_imag = _mm_set_epi8(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0);
|
for(unsigned int number = 0; number < sse_iters; number++)
|
||||||
mask_real = _mm_set_epi8(0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255);
|
{
|
||||||
|
//std::complex<T> memory structure: real part -> reinterpret_cast<cv T*>(a)[2*i]
|
||||||
|
//imaginary part -> reinterpret_cast<cv T*>(a)[2*i + 1]
|
||||||
|
// a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
|
||||||
|
a = _mm_loadu_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
|
||||||
|
b = _mm_loadu_si128((__m128i*)_in_b);
|
||||||
|
c = _mm_mullo_epi16 (a, b); // a3.i*b3.i, a3.r*b3.r, ....
|
||||||
|
|
||||||
for(unsigned int number = 0; number < sse_iters; number++)
|
c_sr = _mm_srli_si128 (c, 2); // Shift c right by 2 bytes (one 16-bit lane) while shifting in zeros, so each imag*imag product lines up with its real*real product.
|
||||||
{
|
real = _mm_subs_epi16 (c,c_sr);
|
||||||
//std::complex<T> memory structure: real part -> reinterpret_cast<cv T*>(a)[2*i]
|
|
||||||
//imaginary part -> reinterpret_cast<cv T*>(a)[2*i + 1]
|
|
||||||
// a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
|
|
||||||
a = _mm_loadu_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
|
|
||||||
b = _mm_loadu_si128((__m128i*)_in_b);
|
|
||||||
c = _mm_mullo_epi16 (a, b); // a3.i*b3.i, a3.r*b3.r, ....
|
|
||||||
|
|
||||||
c_sr = _mm_srli_si128 (c, 2); // Shift c right by 2 bytes (one 16-bit lane) while shifting in zeros, so each imag*imag product lines up with its real*real product.
|
b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
|
||||||
real = _mm_subs_epi16 (c,c_sr);
|
a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....
|
||||||
|
|
||||||
b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
|
imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
|
||||||
a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....
|
imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
|
||||||
|
|
||||||
imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
|
imag = _mm_adds_epi16(imag1, imag2); //with saturation arithmetic!
|
||||||
imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
|
|
||||||
|
|
||||||
imag = _mm_adds_epi16(imag1, imag2); //with saturation arithmetic!
|
realcacc = _mm_adds_epi16 (realcacc, real);
|
||||||
|
imagcacc = _mm_adds_epi16 (imagcacc, imag);
|
||||||
|
|
||||||
realcacc = _mm_adds_epi16 (realcacc, real);
|
_in_a += 4;
|
||||||
imagcacc = _mm_adds_epi16 (imagcacc, imag);
|
_in_b += 4;
|
||||||
|
}
|
||||||
|
|
||||||
_in_a += 4;
|
realcacc = _mm_and_si128 (realcacc, mask_real);
|
||||||
_in_b += 4;
|
imagcacc = _mm_and_si128 (imagcacc, mask_imag);
|
||||||
}
|
|
||||||
|
|
||||||
realcacc = _mm_and_si128 (realcacc, mask_real);
|
result = _mm_or_si128 (realcacc, imagcacc);
|
||||||
imagcacc = _mm_and_si128 (imagcacc, mask_imag);
|
|
||||||
|
|
||||||
result = _mm_or_si128 (realcacc, imagcacc);
|
__VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4];
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4];
|
_mm_storeu_si128((__m128i*)dotProductVector,result); // Store the results back into the dot product vector
|
||||||
|
|
||||||
_mm_storeu_si128((__m128i*)dotProductVector,result); // Store the results back into the dot product vector
|
for (int i = 0; i < 4; ++i)
|
||||||
|
{
|
||||||
|
dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[i])), sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i])));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
for (int i = 0; i < 4; ++i)
|
for (unsigned int i = 0; i < (num_points % 4); ++i)
|
||||||
{
|
{
|
||||||
dotProduct += lv_cmake(sat_adds16i(lv_creal(dotProductVector[i]), lv_creal(dotProductVector[i])), sat_adds16i(lv_cimag(dotProductVector[i]), lv_cimag(dotProductVector[i])));
|
lv_16sc_t tmp = (*_in_a++) * (*_in_b++);
|
||||||
}
|
dotProduct = lv_cmake( sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)), sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp)));
|
||||||
}
|
}
|
||||||
|
|
||||||
for (unsigned int i = sse_iters * 4; i < num_points; ++i)
|
*_out = dotProduct;
|
||||||
{
|
|
||||||
lv_16sc_t tmp = (*_in_a++) * (*_in_b++);
|
|
||||||
dotProduct += lv_cmake( sat_adds16i(lv_creal(tmp), lv_creal(tmp)), sat_adds16i(lv_cimag(tmp), lv_cimag(tmp)));
|
|
||||||
}
|
|
||||||
|
|
||||||
*_out = dotProduct;
|
|
||||||
}
|
}
|
||||||
#endif /* LV_HAVE_SSE2 */
|
#endif /* LV_HAVE_SSE2 */
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user