1
0
mirror of https://github.com/gnss-sdr/gnss-sdr synced 2025-01-19 05:33:02 +00:00

Fix SSE implementation

This commit is contained in:
Carles Fernandez 2016-01-16 22:48:29 +01:00
parent 46e3ce5ec2
commit 3d3a758ef2

View File

@ -59,7 +59,7 @@ static inline void volk_gnsssdr_8ic_x2_multiply_8ic_u_sse2(lv_8sc_t* cVector, co
mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
for(unsigned int number = 0;number < sse_iters; number++) for(unsigned int number = 0; number < sse_iters; number++)
{ {
x = _mm_loadu_si128((__m128i*)a); x = _mm_loadu_si128((__m128i*)a);
y = _mm_loadu_si128((__m128i*)b); y = _mm_loadu_si128((__m128i*)b);
@ -92,7 +92,7 @@ static inline void volk_gnsssdr_8ic_x2_multiply_8ic_u_sse2(lv_8sc_t* cVector, co
c += 8; c += 8;
} }
for (unsigned int i = 0; i<(num_points % 8); ++i) for (unsigned int i = sse_iters * 8; i < num_points; ++i)
{ {
*c++ = (*a++) * (*b++); *c++ = (*a++) * (*b++);
} }
@ -121,7 +121,7 @@ static inline void volk_gnsssdr_8ic_x2_multiply_8ic_u_sse4_1(lv_8sc_t* cVector,
_mm_setzero_si128(); _mm_setzero_si128();
mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
for(unsigned int number = 0;number < sse_iters; number++) for(unsigned int number = 0; number < sse_iters; number++)
{ {
x = _mm_lddqu_si128((__m128i*)a); x = _mm_lddqu_si128((__m128i*)a);
y = _mm_lddqu_si128((__m128i*)b); y = _mm_lddqu_si128((__m128i*)b);
@ -152,7 +152,7 @@ static inline void volk_gnsssdr_8ic_x2_multiply_8ic_u_sse4_1(lv_8sc_t* cVector,
c += 8; c += 8;
} }
for (unsigned int i = 0; i<(num_points % 8); ++i) for (unsigned int i = sse_iters * 8; i < num_points; ++i)
{ {
*c++ = (*a++) * (*b++); *c++ = (*a++) * (*b++);
} }
@ -210,7 +210,7 @@ static inline void volk_gnsssdr_8ic_x2_multiply_8ic_a_sse2(lv_8sc_t* cVector, co
mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
for(unsigned int number = 0;number < sse_iters; number++) for(unsigned int number = 0; number < sse_iters; number++)
{ {
x = _mm_load_si128((__m128i*)a); x = _mm_load_si128((__m128i*)a);
y = _mm_load_si128((__m128i*)b); y = _mm_load_si128((__m128i*)b);
@ -243,7 +243,7 @@ static inline void volk_gnsssdr_8ic_x2_multiply_8ic_a_sse2(lv_8sc_t* cVector, co
c += 8; c += 8;
} }
for (unsigned int i = 0; i<(num_points % 8); ++i) for (unsigned int i = sse_iters * 8; i < num_points; ++i)
{ {
*c++ = (*a++) * (*b++); *c++ = (*a++) * (*b++);
} }
@ -272,7 +272,7 @@ static inline void volk_gnsssdr_8ic_x2_multiply_8ic_a_sse4_1(lv_8sc_t* cVector,
_mm_setzero_si128(); _mm_setzero_si128();
mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
for(unsigned int number = 0;number < sse_iters; number++) for(unsigned int number = 0; number < sse_iters; number++)
{ {
x = _mm_load_si128((__m128i*)a); x = _mm_load_si128((__m128i*)a);
y = _mm_load_si128((__m128i*)b); y = _mm_load_si128((__m128i*)b);
@ -303,7 +303,7 @@ static inline void volk_gnsssdr_8ic_x2_multiply_8ic_a_sse4_1(lv_8sc_t* cVector,
c += 8; c += 8;
} }
for (unsigned int i = 0; i<(num_points % 8); ++i) for (unsigned int i = sse_iters * 8; i < num_points; ++i)
{ {
*c++ = (*a++) * (*b++); *c++ = (*a++) * (*b++);
} }