Mirror of https://github.com/gnss-sdr/gnss-sdr (synced 2025-10-30 23:03:05 +00:00)
			
		
		
		
	Fix SSE implementations
This commit is contained in:
		| @@ -70,7 +70,7 @@ static inline void volk_gnsssdr_8i_x2_add_8i_u_sse2(char* cVector, const char* a | |||||||
|             cPtr += 16; |             cPtr += 16; | ||||||
|         } |         } | ||||||
|  |  | ||||||
|     for(unsigned int i = 0; i<(num_points % 16); ++i) |     for(unsigned int i = sse_iters * 16; i < num_points; ++i) | ||||||
|         { |         { | ||||||
|             *cPtr++ = (*aPtr++) + (*bPtr++); |             *cPtr++ = (*aPtr++) + (*bPtr++); | ||||||
|         } |         } | ||||||
| @@ -134,14 +134,14 @@ static inline void volk_gnsssdr_8i_x2_add_8i_a_sse2(char* cVector, const char* a | |||||||
|  |  | ||||||
|             cVal = _mm_add_epi8(aVal, bVal); |             cVal = _mm_add_epi8(aVal, bVal); | ||||||
|  |  | ||||||
|             _mm_store_si128((__m128i*)cPtr,cVal); // Store the results back into the C container |             _mm_store_si128((__m128i*)cPtr, cVal); // Store the results back into the C container | ||||||
|  |  | ||||||
|             aPtr += 16; |             aPtr += 16; | ||||||
|             bPtr += 16; |             bPtr += 16; | ||||||
|             cPtr += 16; |             cPtr += 16; | ||||||
|         } |         } | ||||||
|  |  | ||||||
|     for(unsigned int i = 0; i<(num_points % 16); ++i) |     for(unsigned int i = sse_iters * 16; i < num_points; ++i) | ||||||
|         { |         { | ||||||
|             *cPtr++ = (*aPtr++) + (*bPtr++); |             *cPtr++ = (*aPtr++) + (*bPtr++); | ||||||
|         } |         } | ||||||
| @@ -163,7 +163,7 @@ static inline void volk_gnsssdr_8i_x2_add_8i_a_generic(char* cVector, const char | |||||||
|     const char* bPtr=  bVector; |     const char* bPtr=  bVector; | ||||||
|     unsigned int number = 0; |     unsigned int number = 0; | ||||||
|  |  | ||||||
|     for(number = 0; number < num_points; number++) |     for(; number < num_points; number++) | ||||||
|         { |         { | ||||||
|             *cPtr++ = (*aPtr++) + (*bPtr++); |             *cPtr++ = (*aPtr++) + (*bPtr++); | ||||||
|         } |         } | ||||||
|   | |||||||
| @@ -75,7 +75,7 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_u_avx(lv_8sc_t* cVector, const | |||||||
|             c += 16; |             c += 16; | ||||||
|         } |         } | ||||||
|  |  | ||||||
|     for (unsigned int i = 0; i<(num_points % 16); ++i) |     for (unsigned int i = sse_iters * 16; i < num_points; ++i) | ||||||
|         { |         { | ||||||
|             *c++ = lv_conj(*a++); |             *c++ = lv_conj(*a++); | ||||||
|         } |         } | ||||||
| @@ -109,7 +109,7 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_u_ssse3(lv_8sc_t* cVector, con | |||||||
|             c += 8; |             c += 8; | ||||||
|         } |         } | ||||||
|  |  | ||||||
|     for (unsigned int i = 0; i<(num_points % 8); ++i) |     for (unsigned int i = sse_iters * 8; i < num_points; ++i) | ||||||
|         { |         { | ||||||
|             *c++ = lv_conj(*a++); |             *c++ = lv_conj(*a++); | ||||||
|         } |         } | ||||||
| @@ -146,7 +146,7 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_u_sse3(lv_8sc_t* cVector, cons | |||||||
|             c += 8; |             c += 8; | ||||||
|         } |         } | ||||||
|  |  | ||||||
|     for (unsigned int i = 0; i<(num_points % 8); ++i) |     for (unsigned int i = sse_iters * 8; i < num_points; ++i) | ||||||
|         { |         { | ||||||
|             *c++ = lv_conj(*a++); |             *c++ = lv_conj(*a++); | ||||||
|         } |         } | ||||||
| @@ -220,7 +220,7 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_a_avx(lv_8sc_t* cVector, const | |||||||
|             c += 16; |             c += 16; | ||||||
|         } |         } | ||||||
|  |  | ||||||
|     for (unsigned int i = 0; i<(num_points % 16); ++i) |     for (unsigned int i = sse_iters * 16; i < num_points; ++i) | ||||||
|         { |         { | ||||||
|             *c++ = lv_conj(*a++); |             *c++ = lv_conj(*a++); | ||||||
|         } |         } | ||||||
| @@ -254,7 +254,7 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_a_ssse3(lv_8sc_t* cVector, con | |||||||
|             c += 8; |             c += 8; | ||||||
|         } |         } | ||||||
|  |  | ||||||
|     for (unsigned int i = 0; i<(num_points % 8); ++i) |     for (unsigned int i = sse_iters * 8; i < num_points; ++i) | ||||||
|         { |         { | ||||||
|             *c++ = lv_conj(*a++); |             *c++ = lv_conj(*a++); | ||||||
|         } |         } | ||||||
| @@ -291,7 +291,7 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_a_sse3(lv_8sc_t* cVector, cons | |||||||
|             c += 8; |             c += 8; | ||||||
|         } |         } | ||||||
|  |  | ||||||
|     for (unsigned int i = 0; i<(num_points % 8); ++i) |     for (unsigned int i = sse_iters * 8; i < num_points; ++i) | ||||||
|         { |         { | ||||||
|             *c++ = lv_conj(*a++); |             *c++ = lv_conj(*a++); | ||||||
|         } |         } | ||||||
|   | |||||||
| @@ -90,7 +90,7 @@ static inline void volk_gnsssdr_8ic_magnitude_squared_8i_u_sse3(char* magnitudeV | |||||||
|             magnitudeVectorPtr += 16; |             magnitudeVectorPtr += 16; | ||||||
|         } |         } | ||||||
|  |  | ||||||
|     for (unsigned int i = 0; i<(num_points % 16); ++i) |     for (unsigned int i = sse_iters * 16; i < num_points; ++i) | ||||||
|         { |         { | ||||||
|             const char valReal = *complexVectorPtr++; |             const char valReal = *complexVectorPtr++; | ||||||
|             const char valImag = *complexVectorPtr++; |             const char valImag = *complexVectorPtr++; | ||||||
| @@ -226,7 +226,7 @@ static inline void volk_gnsssdr_8ic_magnitude_squared_8i_a_sse3(char* magnitudeV | |||||||
|             magnitudeVectorPtr += 16; |             magnitudeVectorPtr += 16; | ||||||
|         } |         } | ||||||
|  |  | ||||||
|     for (unsigned int i = 0; i<(num_points % 16); ++i) |     for (unsigned int i = sse_iters * 16; i < num_points; ++i) | ||||||
|         { |         { | ||||||
|             const char valReal = *complexVectorPtr++; |             const char valReal = *complexVectorPtr++; | ||||||
|             const char valImag = *complexVectorPtr++; |             const char valImag = *complexVectorPtr++; | ||||||
|   | |||||||
| @@ -51,6 +51,7 @@ | |||||||
|  */ |  */ | ||||||
| static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_u_sse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t scalar, unsigned int num_points) | static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_u_sse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t scalar, unsigned int num_points) | ||||||
| { | { | ||||||
|  |     unsigned int number = 0; | ||||||
|     const unsigned int sse_iters = num_points / 8; |     const unsigned int sse_iters = num_points / 8; | ||||||
|  |  | ||||||
|     __m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc; |     __m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc; | ||||||
| @@ -65,7 +66,7 @@ static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_u_sse3(lv_8sc_t* cVector, | |||||||
|     imagy = _mm_and_si128 (imagy, mult1); |     imagy = _mm_and_si128 (imagy, mult1); | ||||||
|     realy = _mm_and_si128 (y, mult1); |     realy = _mm_and_si128 (y, mult1); | ||||||
|  |  | ||||||
|     for(unsigned int number = 0;number < sse_iters; number++) |     for(; number < sse_iters; number++) | ||||||
|         { |         { | ||||||
|             x = _mm_lddqu_si128((__m128i*)a); |             x = _mm_lddqu_si128((__m128i*)a); | ||||||
|  |  | ||||||
| @@ -92,7 +93,7 @@ static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_u_sse3(lv_8sc_t* cVector, | |||||||
|             c += 8; |             c += 8; | ||||||
|         } |         } | ||||||
|  |  | ||||||
|     for (unsigned int i = 0; i<(num_points % 8); ++i) |     for (number = sse_iters * 8; number < num_points; ++number) | ||||||
|         { |         { | ||||||
|             *c++ = (*a++) * scalar; |             *c++ = (*a++) * scalar; | ||||||
|         } |         } | ||||||
| @@ -164,6 +165,7 @@ static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_generic(lv_8sc_t* cVector, | |||||||
|  */ |  */ | ||||||
| static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_a_sse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t scalar, unsigned int num_points) | static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_a_sse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t scalar, unsigned int num_points) | ||||||
| { | { | ||||||
|  |     unsigned int number = 0; | ||||||
|     const unsigned int sse_iters = num_points / 8; |     const unsigned int sse_iters = num_points / 8; | ||||||
|  |  | ||||||
|     __m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc; |     __m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc; | ||||||
| @@ -178,7 +180,7 @@ static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_a_sse3(lv_8sc_t* cVector, | |||||||
|     imagy = _mm_and_si128 (imagy, mult1); |     imagy = _mm_and_si128 (imagy, mult1); | ||||||
|     realy = _mm_and_si128 (y, mult1); |     realy = _mm_and_si128 (y, mult1); | ||||||
|  |  | ||||||
|     for(unsigned int number = 0;number < sse_iters; number++) |     for(; number < sse_iters; number++) | ||||||
|         { |         { | ||||||
|             x = _mm_load_si128((__m128i*)a); |             x = _mm_load_si128((__m128i*)a); | ||||||
|  |  | ||||||
| @@ -205,7 +207,7 @@ static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_a_sse3(lv_8sc_t* cVector, | |||||||
|             c += 8; |             c += 8; | ||||||
|         } |         } | ||||||
|  |  | ||||||
|     for (unsigned int i = 0; i<(num_points % 8); ++i) |     for (number = sse_iters * 8; number < num_points; ++number) | ||||||
|         { |         { | ||||||
|             *c++ = (*a++) * scalar; |             *c++ = (*a++) * scalar; | ||||||
|         } |         } | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	 Carles Fernandez
					Carles Fernandez