mirror of
https://github.com/gnss-sdr/gnss-sdr
synced 2024-11-16 14:54:59 +00:00
Merge branch 'next' of https://github.com/gnss-sdr/gnss-sdr into next
This commit is contained in:
commit
fcae1e0ec5
@ -103,6 +103,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3(lv_16sc_t* out
|
|||||||
for(unsigned int number = 0; number < sse_iters; number++)
|
for(unsigned int number = 0; number < sse_iters; number++)
|
||||||
{
|
{
|
||||||
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
|
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
|
||||||
|
__builtin_prefetch(_in + 8);
|
||||||
//complex 32fc multiplication b=a*two_phase_acc_reg
|
//complex 32fc multiplication b=a*two_phase_acc_reg
|
||||||
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
||||||
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
||||||
@ -149,7 +150,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3(lv_16sc_t* out
|
|||||||
_out += 4;
|
_out += 4;
|
||||||
}
|
}
|
||||||
|
|
||||||
_mm_storeu_ps((float*)two_phase_acc, two_phase_acc_reg);
|
_mm_store_ps((float*)two_phase_acc, two_phase_acc_reg);
|
||||||
(*phase) = two_phase_acc[0];
|
(*phase) = two_phase_acc[0];
|
||||||
|
|
||||||
for (unsigned int i = sse_iters * 4; i < num_points; ++i)
|
for (unsigned int i = sse_iters * 4; i < num_points; ++i)
|
||||||
@ -199,6 +200,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3(lv_16sc_t* out
|
|||||||
for(unsigned int number = 0; number < sse_iters; number++)
|
for(unsigned int number = 0; number < sse_iters; number++)
|
||||||
{
|
{
|
||||||
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
|
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
|
||||||
|
__builtin_prefetch(_in + 8);
|
||||||
//complex 32fc multiplication b=a*two_phase_acc_reg
|
//complex 32fc multiplication b=a*two_phase_acc_reg
|
||||||
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
||||||
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
||||||
@ -219,7 +221,6 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3(lv_16sc_t* out
|
|||||||
//next two samples
|
//next two samples
|
||||||
_in += 2;
|
_in += 2;
|
||||||
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
|
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
|
||||||
__builtin_prefetch(_in + 8);
|
|
||||||
//complex 32fc multiplication b=a*two_phase_acc_reg
|
//complex 32fc multiplication b=a*two_phase_acc_reg
|
||||||
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
||||||
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
||||||
|
@ -92,26 +92,22 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_sse2(lv_16sc_t* resul
|
|||||||
{
|
{
|
||||||
__VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4];
|
__VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4];
|
||||||
|
|
||||||
//todo dyn mem reg
|
|
||||||
|
|
||||||
__m128i* realcacc;
|
__m128i* realcacc;
|
||||||
__m128i* imagcacc;
|
__m128i* imagcacc;
|
||||||
|
|
||||||
realcacc = (__m128i*)calloc(num_a_vectors, sizeof(__m128i)); //calloc also sets memory to 0
|
realcacc = (__m128i*)calloc(num_a_vectors, sizeof(__m128i)); //calloc also sets memory to 0
|
||||||
imagcacc = (__m128i*)calloc(num_a_vectors, sizeof(__m128i)); //calloc also sets memory to 0
|
imagcacc = (__m128i*)calloc(num_a_vectors, sizeof(__m128i)); //calloc also sets memory to 0
|
||||||
|
|
||||||
__m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, results;
|
__m128i a, b, c, c_sr, mask_imag, mask_real, real, imag;
|
||||||
|
|
||||||
mask_imag = _mm_set_epi8(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0);
|
mask_imag = _mm_set_epi8(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0);
|
||||||
mask_real = _mm_set_epi8(0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255);
|
mask_real = _mm_set_epi8(0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255);
|
||||||
|
|
||||||
for(unsigned int number = 0; number < sse_iters; number++)
|
for(unsigned int number = 0; number < sse_iters; number++)
|
||||||
{
|
{
|
||||||
//std::complex<T> memory structure: real part -> reinterpret_cast<cv T*>(a)[2*i]
|
// b[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
|
||||||
//imaginery part -> reinterpret_cast<cv T*>(a)[2*i + 1]
|
|
||||||
// a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
|
|
||||||
|
|
||||||
b = _mm_load_si128((__m128i*)_in_common); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
|
b = _mm_load_si128((__m128i*)_in_common); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
|
||||||
|
__builtin_prefetch(_in_common + 8);
|
||||||
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
|
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
|
||||||
{
|
{
|
||||||
a = _mm_load_si128((__m128i*)&(_in_a[n_vec][number*4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
|
a = _mm_load_si128((__m128i*)&(_in_a[n_vec][number*4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
|
||||||
@ -121,13 +117,13 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_sse2(lv_16sc_t* resul
|
|||||||
c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
|
c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
|
||||||
real = _mm_subs_epi16(c, c_sr);
|
real = _mm_subs_epi16(c, c_sr);
|
||||||
|
|
||||||
b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
|
c_sr = _mm_slli_si128(b, 2); // b3.r, b2.i ....
|
||||||
a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....
|
c = _mm_mullo_epi16(a, c_sr); // a3.i*b3.r, ....
|
||||||
|
|
||||||
imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
|
c_sr = _mm_slli_si128(a, 2); // a3.r, a2.i ....
|
||||||
imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
|
imag = _mm_mullo_epi16(b, c_sr); // b3.i*a3.r, ....
|
||||||
|
|
||||||
imag = _mm_adds_epi16(imag1, imag2);
|
imag = _mm_adds_epi16(c, imag);
|
||||||
|
|
||||||
realcacc[n_vec] = _mm_adds_epi16(realcacc[n_vec], real);
|
realcacc[n_vec] = _mm_adds_epi16(realcacc[n_vec], real);
|
||||||
imagcacc[n_vec] = _mm_adds_epi16(imagcacc[n_vec], imag);
|
imagcacc[n_vec] = _mm_adds_epi16(imagcacc[n_vec], imag);
|
||||||
@ -141,9 +137,9 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_sse2(lv_16sc_t* resul
|
|||||||
realcacc[n_vec] = _mm_and_si128(realcacc[n_vec], mask_real);
|
realcacc[n_vec] = _mm_and_si128(realcacc[n_vec], mask_real);
|
||||||
imagcacc[n_vec] = _mm_and_si128(imagcacc[n_vec], mask_imag);
|
imagcacc[n_vec] = _mm_and_si128(imagcacc[n_vec], mask_imag);
|
||||||
|
|
||||||
results = _mm_or_si128(realcacc[n_vec], imagcacc[n_vec]);
|
a = _mm_or_si128(realcacc[n_vec], imagcacc[n_vec]);
|
||||||
|
|
||||||
_mm_store_si128((__m128i*)dotProductVector, results); // Store the results back into the dot product vector
|
_mm_store_si128((__m128i*)dotProductVector, a); // Store the results back into the dot product vector
|
||||||
dotProduct = lv_cmake(0,0);
|
dotProduct = lv_cmake(0,0);
|
||||||
for (int i = 0; i < 4; ++i)
|
for (int i = 0; i < 4; ++i)
|
||||||
{
|
{
|
||||||
@ -166,7 +162,6 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_sse2(lv_16sc_t* resul
|
|||||||
sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp)));
|
sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp)));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
#endif /* LV_HAVE_SSE2 */
|
#endif /* LV_HAVE_SSE2 */
|
||||||
|
|
||||||
@ -195,26 +190,22 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_sse2(lv_16sc_t* resul
|
|||||||
{
|
{
|
||||||
__VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4];
|
__VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4];
|
||||||
|
|
||||||
//todo dyn mem reg
|
|
||||||
|
|
||||||
__m128i* realcacc;
|
__m128i* realcacc;
|
||||||
__m128i* imagcacc;
|
__m128i* imagcacc;
|
||||||
|
|
||||||
realcacc = (__m128i*)calloc(num_a_vectors, sizeof(__m128i)); //calloc also sets memory to 0
|
realcacc = (__m128i*)calloc(num_a_vectors, sizeof(__m128i)); //calloc also sets memory to 0
|
||||||
imagcacc = (__m128i*)calloc(num_a_vectors, sizeof(__m128i)); //calloc also sets memory to 0
|
imagcacc = (__m128i*)calloc(num_a_vectors, sizeof(__m128i)); //calloc also sets memory to 0
|
||||||
|
|
||||||
__m128i a,b,c, c_sr, mask_imag, mask_real, real, imag, imag1,imag2, b_sl, a_sl, results;
|
__m128i a, b, c, c_sr, mask_imag, mask_real, real, imag;
|
||||||
|
|
||||||
mask_imag = _mm_set_epi8(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0);
|
mask_imag = _mm_set_epi8(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0);
|
||||||
mask_real = _mm_set_epi8(0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255);
|
mask_real = _mm_set_epi8(0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255);
|
||||||
|
|
||||||
for(unsigned int number = 0; number < sse_iters; number++)
|
for(unsigned int number = 0; number < sse_iters; number++)
|
||||||
{
|
{
|
||||||
//std::complex<T> memory structure: real part -> reinterpret_cast<cv T*>(a)[2*i]
|
// b[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
|
||||||
//imaginery part -> reinterpret_cast<cv T*>(a)[2*i + 1]
|
|
||||||
// a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
|
|
||||||
|
|
||||||
b = _mm_loadu_si128((__m128i*)_in_common); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
|
b = _mm_loadu_si128((__m128i*)_in_common); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
|
||||||
|
__builtin_prefetch(_in_common + 8);
|
||||||
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
|
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
|
||||||
{
|
{
|
||||||
a = _mm_loadu_si128((__m128i*)&(_in_a[n_vec][number*4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
|
a = _mm_loadu_si128((__m128i*)&(_in_a[n_vec][number*4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
|
||||||
@ -224,13 +215,13 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_sse2(lv_16sc_t* resul
|
|||||||
c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
|
c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
|
||||||
real = _mm_subs_epi16(c, c_sr);
|
real = _mm_subs_epi16(c, c_sr);
|
||||||
|
|
||||||
b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
|
c_sr = _mm_slli_si128(b, 2); // b3.r, b2.i ....
|
||||||
a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....
|
c = _mm_mullo_epi16(a, c_sr); // a3.i*b3.r, ....
|
||||||
|
|
||||||
imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
|
c_sr = _mm_slli_si128(a, 2); // a3.r, a2.i ....
|
||||||
imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
|
imag = _mm_mullo_epi16(b, c_sr); // b3.i*a3.r, ....
|
||||||
|
|
||||||
imag = _mm_adds_epi16(imag1, imag2);
|
imag = _mm_adds_epi16(c, imag);
|
||||||
|
|
||||||
realcacc[n_vec] = _mm_adds_epi16(realcacc[n_vec], real);
|
realcacc[n_vec] = _mm_adds_epi16(realcacc[n_vec], real);
|
||||||
imagcacc[n_vec] = _mm_adds_epi16(imagcacc[n_vec], imag);
|
imagcacc[n_vec] = _mm_adds_epi16(imagcacc[n_vec], imag);
|
||||||
@ -244,9 +235,9 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_sse2(lv_16sc_t* resul
|
|||||||
realcacc[n_vec] = _mm_and_si128(realcacc[n_vec], mask_real);
|
realcacc[n_vec] = _mm_and_si128(realcacc[n_vec], mask_real);
|
||||||
imagcacc[n_vec] = _mm_and_si128(imagcacc[n_vec], mask_imag);
|
imagcacc[n_vec] = _mm_and_si128(imagcacc[n_vec], mask_imag);
|
||||||
|
|
||||||
results = _mm_or_si128(realcacc[n_vec], imagcacc[n_vec]);
|
a = _mm_or_si128(realcacc[n_vec], imagcacc[n_vec]);
|
||||||
|
|
||||||
_mm_storeu_si128((__m128i*)dotProductVector, results); // Store the results back into the dot product vector
|
_mm_store_si128((__m128i*)dotProductVector, a); // Store the results back into the dot product vector
|
||||||
dotProduct = lv_cmake(0,0);
|
dotProduct = lv_cmake(0,0);
|
||||||
for (int i = 0; i < 4; ++i)
|
for (int i = 0; i < 4; ++i)
|
||||||
{
|
{
|
||||||
@ -269,7 +260,6 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_sse2(lv_16sc_t* resul
|
|||||||
sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp)));
|
sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp)));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
#endif /* LV_HAVE_SSE2 */
|
#endif /* LV_HAVE_SSE2 */
|
||||||
|
|
||||||
|
@ -85,6 +85,9 @@ static inline void volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_a_sse2(lv_16sc_t* r
|
|||||||
|
|
||||||
#endif // SSE2
|
#endif // SSE2
|
||||||
|
|
||||||
|
#define WORKAROUND 1
|
||||||
|
#ifdef WORKAROUND
|
||||||
|
|
||||||
#ifdef LV_HAVE_SSE2
|
#ifdef LV_HAVE_SSE2
|
||||||
|
|
||||||
static inline void volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_u_sse2(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points)
|
static inline void volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_u_sse2(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points)
|
||||||
@ -105,7 +108,7 @@ static inline void volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_u_sse2(lv_16sc_t* r
|
|||||||
}
|
}
|
||||||
volk_gnsssdr_free(in_a);
|
volk_gnsssdr_free(in_a);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
#endif // SSE2
|
#endif // SSE2
|
||||||
|
|
||||||
#ifdef LV_HAVE_NEON
|
#ifdef LV_HAVE_NEON
|
||||||
|
@ -115,7 +115,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc_
|
|||||||
realcacc = (__m128i*)calloc(num_a_vectors, sizeof(__m128i)); //calloc also sets memory to 0
|
realcacc = (__m128i*)calloc(num_a_vectors, sizeof(__m128i)); //calloc also sets memory to 0
|
||||||
imagcacc = (__m128i*)calloc(num_a_vectors, sizeof(__m128i)); //calloc also sets memory to 0
|
imagcacc = (__m128i*)calloc(num_a_vectors, sizeof(__m128i)); //calloc also sets memory to 0
|
||||||
|
|
||||||
__m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, results;
|
__m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl;
|
||||||
|
|
||||||
mask_imag = _mm_set_epi8(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0);
|
mask_imag = _mm_set_epi8(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0);
|
||||||
mask_real = _mm_set_epi8(0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255);
|
mask_real = _mm_set_epi8(0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255);
|
||||||
@ -140,6 +140,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc_
|
|||||||
// Phase rotation on operand in_common starts here:
|
// Phase rotation on operand in_common starts here:
|
||||||
//printf("generic phase %i: %f,%f\n", n*4,lv_creal(*phase),lv_cimag(*phase));
|
//printf("generic phase %i: %f,%f\n", n*4,lv_creal(*phase),lv_cimag(*phase));
|
||||||
pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
|
pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
|
||||||
|
__builtin_prefetch(_in_common + 8);
|
||||||
//complex 32fc multiplication b=a*two_phase_acc_reg
|
//complex 32fc multiplication b=a*two_phase_acc_reg
|
||||||
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
||||||
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
||||||
@ -200,8 +201,8 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc_
|
|||||||
|
|
||||||
imag = _mm_adds_epi16(imag1, imag2);
|
imag = _mm_adds_epi16(imag1, imag2);
|
||||||
|
|
||||||
realcacc[n_vec] = _mm_adds_epi16 (realcacc[n_vec], real);
|
realcacc[n_vec] = _mm_adds_epi16(realcacc[n_vec], real);
|
||||||
imagcacc[n_vec] = _mm_adds_epi16 (imagcacc[n_vec], imag);
|
imagcacc[n_vec] = _mm_adds_epi16(imagcacc[n_vec], imag);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -210,9 +211,9 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc_
|
|||||||
realcacc[n_vec] = _mm_and_si128(realcacc[n_vec], mask_real);
|
realcacc[n_vec] = _mm_and_si128(realcacc[n_vec], mask_real);
|
||||||
imagcacc[n_vec] = _mm_and_si128(imagcacc[n_vec], mask_imag);
|
imagcacc[n_vec] = _mm_and_si128(imagcacc[n_vec], mask_imag);
|
||||||
|
|
||||||
results = _mm_or_si128(realcacc[n_vec], imagcacc[n_vec]);
|
a = _mm_or_si128(realcacc[n_vec], imagcacc[n_vec]);
|
||||||
|
|
||||||
_mm_store_si128((__m128i*)dotProductVector, results); // Store the results back into the dot product vector
|
_mm_store_si128((__m128i*)dotProductVector, a); // Store the results back into the dot product vector
|
||||||
dotProduct = lv_cmake(0,0);
|
dotProduct = lv_cmake(0,0);
|
||||||
for (int i = 0; i < 4; ++i)
|
for (int i = 0; i < 4; ++i)
|
||||||
{
|
{
|
||||||
@ -280,7 +281,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc_
|
|||||||
realcacc = (__m128i*)calloc(num_a_vectors, sizeof(__m128i)); //calloc also sets memory to 0
|
realcacc = (__m128i*)calloc(num_a_vectors, sizeof(__m128i)); //calloc also sets memory to 0
|
||||||
imagcacc = (__m128i*)calloc(num_a_vectors, sizeof(__m128i)); //calloc also sets memory to 0
|
imagcacc = (__m128i*)calloc(num_a_vectors, sizeof(__m128i)); //calloc also sets memory to 0
|
||||||
|
|
||||||
__m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, results;
|
__m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl;
|
||||||
|
|
||||||
mask_imag = _mm_set_epi8(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0);
|
mask_imag = _mm_set_epi8(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0);
|
||||||
mask_real = _mm_set_epi8(0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255);
|
mask_real = _mm_set_epi8(0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255);
|
||||||
@ -303,8 +304,8 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc_
|
|||||||
for(unsigned int number = 0; number < sse_iters; number++)
|
for(unsigned int number = 0; number < sse_iters; number++)
|
||||||
{
|
{
|
||||||
// Phase rotation on operand in_common starts here:
|
// Phase rotation on operand in_common starts here:
|
||||||
|
|
||||||
pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
|
pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
|
||||||
|
__builtin_prefetch(_in_common + 8);
|
||||||
//complex 32fc multiplication b=a*two_phase_acc_reg
|
//complex 32fc multiplication b=a*two_phase_acc_reg
|
||||||
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
||||||
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
||||||
@ -372,12 +373,12 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc_
|
|||||||
|
|
||||||
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
|
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
|
||||||
{
|
{
|
||||||
realcacc[n_vec] = _mm_and_si128 (realcacc[n_vec], mask_real);
|
realcacc[n_vec] = _mm_and_si128(realcacc[n_vec], mask_real);
|
||||||
imagcacc[n_vec] = _mm_and_si128 (imagcacc[n_vec], mask_imag);
|
imagcacc[n_vec] = _mm_and_si128(imagcacc[n_vec], mask_imag);
|
||||||
|
|
||||||
results = _mm_or_si128(realcacc[n_vec], imagcacc[n_vec]);
|
a = _mm_or_si128(realcacc[n_vec], imagcacc[n_vec]);
|
||||||
|
|
||||||
_mm_storeu_si128((__m128i*)dotProductVector, results); // Store the results back into the dot product vector
|
_mm_store_si128((__m128i*)dotProductVector, a); // Store the results back into the dot product vector
|
||||||
dotProduct = lv_cmake(0,0);
|
dotProduct = lv_cmake(0,0);
|
||||||
for (int i = 0; i < 4; ++i)
|
for (int i = 0; i < 4; ++i)
|
||||||
{
|
{
|
||||||
@ -389,7 +390,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc_
|
|||||||
free(realcacc);
|
free(realcacc);
|
||||||
free(imagcacc);
|
free(imagcacc);
|
||||||
|
|
||||||
_mm_storeu_ps((float*)two_phase_acc, two_phase_acc_reg);
|
_mm_store_ps((float*)two_phase_acc, two_phase_acc_reg);
|
||||||
(*phase) = two_phase_acc[0];
|
(*phase) = two_phase_acc[0];
|
||||||
|
|
||||||
for(unsigned int n = sse_iters * 4; n < num_points; n++)
|
for(unsigned int n = sse_iters * 4; n < num_points; n++)
|
||||||
@ -527,7 +528,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon(lv_16sc_t*
|
|||||||
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
|
for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
|
||||||
{
|
{
|
||||||
a_val = vld2_s16((int16_t*)&(_in_a[n_vec][number*4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
|
a_val = vld2_s16((int16_t*)&(_in_a[n_vec][number*4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
|
||||||
__builtin_prefetch(&_in_a[n_vec][number*4] + 8);
|
//__builtin_prefetch(&_in_a[n_vec][number*4] + 8);
|
||||||
|
|
||||||
// multiply the real*real and imag*imag to get real result
|
// multiply the real*real and imag*imag to get real result
|
||||||
// a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
|
// a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
|
||||||
|
@ -116,7 +116,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_u_sse3(lv_1
|
|||||||
for(unsigned int n = 0; n < num_a_vectors; n++)
|
for(unsigned int n = 0; n < num_a_vectors; n++)
|
||||||
{
|
{
|
||||||
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
|
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
|
||||||
memcpy(in_a[n], in, sizeof(lv_16sc_t) * num_points);
|
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
|
||||||
}
|
}
|
||||||
|
|
||||||
volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_u_sse3(result, local_code, phase_inc[0], phase, (const lv_16sc_t**) in_a, num_a_vectors, num_points);
|
volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_u_sse3(result, local_code, phase_inc[0], phase, (const lv_16sc_t**) in_a, num_a_vectors, num_points);
|
||||||
|
Loading…
Reference in New Issue
Block a user