From 46e3ce5ec24563e3679c0354421be943c04b325e Mon Sep 17 00:00:00 2001 From: Carles Fernandez Date: Sat, 16 Jan 2016 22:39:15 +0100 Subject: [PATCH] fix sse implementations --- .../volk_gnsssdr_32fc_convert_16ic.h | 48 +++++++++++-------- .../volk_gnsssdr_32fc_convert_8ic.h | 19 ++++---- 2 files changed, 37 insertions(+), 30 deletions(-) diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_16ic.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_16ic.h index a7c179818..97bb2ded4 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_16ic.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_16ic.h @@ -46,8 +46,9 @@ \param outputVector The 16 bit output data buffer \param num_points The number of data values to be converted */ -static inline void volk_gnsssdr_32fc_convert_16ic_u_sse2(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){ - const unsigned int sse_iters = num_points/4; +static inline void volk_gnsssdr_32fc_convert_16ic_u_sse2(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points) +{ + const unsigned int sse_iters = num_points / 4; float* inputVectorPtr = (float*)inputVector; int16_t* outputVectorPtr = (int16_t*)outputVector; @@ -61,7 +62,8 @@ static inline void volk_gnsssdr_32fc_convert_16ic_u_sse2(lv_16sc_t* outputVector __m128 vmin_val = _mm_set_ps1(min_val); __m128 vmax_val = _mm_set_ps1(max_val); - for(unsigned int i = 0;i < sse_iters; i++){ + for(unsigned int i = 0; i < sse_iters; i++) + { inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; @@ -76,29 +78,30 @@ static inline void volk_gnsssdr_32fc_convert_16ic_u_sse2(lv_16sc_t* outputVector _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); outputVectorPtr += 8; - } + } - for(unsigned int i = 0; i < (num_points%4)*2; i++) + for(unsigned int i = sse_iters * 8; i < num_points * 2; i++) { if(inputVectorPtr[i] > max_val) inputVectorPtr[i] = max_val; else if(inputVectorPtr[i] < min_val) inputVectorPtr[i] = min_val; - outputVectorPtr[i] = (int16_t)rintf(inputVectorPtr[i]); + *outputVectorPtr++ = (int16_t)rintf(*inputVectorPtr++); } } #endif /* LV_HAVE_SSE2 */ #ifdef LV_HAVE_SSE -#include +#include // __m64, __m128 ?? /*! \brief Converts a float vector of 64 bits (32 bits each part) into a 32 integer vector (16 bits each part) \param inputVector The floating point input data buffer \param outputVector The 16 bit output data buffer \param num_points The number of data values to be converted */ -static inline void volk_gnsssdr_32fc_convert_16ic_u_sse(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){ - const unsigned int sse_iters = num_points/4; +static inline void volk_gnsssdr_32fc_convert_16ic_u_sse(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points) +{ + const unsigned int sse_iters = num_points / 4; float* inputVectorPtr = (float*)inputVector; int16_t* outputVectorPtr = (int16_t*)outputVector; @@ -112,7 +115,8 @@ static inline void volk_gnsssdr_32fc_convert_16ic_u_sse(lv_16sc_t* outputVector, __m128 vmin_val = _mm_set_ps1(min_val); __m128 vmax_val = _mm_set_ps1(max_val); - for(unsigned int i = 0;i < sse_iters; i++){ + for(unsigned int i = 0;i < sse_iters; i++) + { inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; @@ -127,15 +131,15 @@ static inline void volk_gnsssdr_32fc_convert_16ic_u_sse(lv_16sc_t* outputVector, _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); outputVectorPtr += 8; - } + } - for(unsigned int i = 0; i < (num_points%4)*2; i++) + for(unsigned int i = sse_iters * 8; i < num_points*2; i++) { if(inputVectorPtr[i] > max_val) inputVectorPtr[i] = max_val; else if(inputVectorPtr[i] < min_val) inputVectorPtr[i] = min_val; - outputVectorPtr[i] = (int16_t)rintf(inputVectorPtr[i]); + *outputVectorPtr++ = (int16_t)rintf(*inputVectorPtr++); } } #endif /* LV_HAVE_SSE */ @@ -147,7 +151,8 @@ static inline void volk_gnsssdr_32fc_convert_16ic_u_sse(lv_16sc_t* outputVector, \param outputVector The 16 bit output data buffer \param num_points The number of data values to be converted */ -static inline void volk_gnsssdr_32fc_convert_16ic_generic(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){ +static inline void volk_gnsssdr_32fc_convert_16ic_generic(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points) +{ float* inputVectorPtr = (float*)inputVector; int16_t* outputVectorPtr = (int16_t*)outputVector; float min_val = -32768; @@ -178,8 +183,9 @@ static inline void volk_gnsssdr_32fc_convert_16ic_generic(lv_16sc_t* outputVecto \param outputVector The 16 bit output data buffer \param num_points The number of data values to be converted */ -static inline void volk_gnsssdr_32fc_convert_16ic_a_sse2(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){ - const unsigned int sse_iters = num_points/4; +static inline void volk_gnsssdr_32fc_convert_16ic_a_sse2(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points) +{ + const unsigned int sse_iters = num_points / 4; float* inputVectorPtr = (float*)inputVector; int16_t* outputVectorPtr = (int16_t*)outputVector; @@ -193,7 +199,7 @@ static inline void volk_gnsssdr_32fc_convert_16ic_a_sse2(lv_16sc_t* outputVector __m128 vmin_val = _mm_set_ps1(min_val); __m128 vmax_val = _mm_set_ps1(max_val); - for(unsigned int i = 0;i < sse_iters; i++) + for(unsigned int i = 0; i < sse_iters; i++) { inputVal1 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; inputVal2 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; @@ -211,13 +217,13 @@ static inline void volk_gnsssdr_32fc_convert_16ic_a_sse2(lv_16sc_t* outputVector outputVectorPtr += 8; } - for(unsigned int i = 0; i < (num_points%4)*2; i++) + for(unsigned int i = sse_iters * 8; i < num_points * 2; i++) { if(inputVectorPtr[i] > max_val) inputVectorPtr[i] = max_val; else if(inputVectorPtr[i] < min_val) inputVectorPtr[i] = min_val; - outputVectorPtr[i] = (int16_t)rintf(inputVectorPtr[i]); + *outputVectorPtr++ = (int16_t)rintf(*inputVectorPtr++); } } #endif /* LV_HAVE_SSE2 */ @@ -264,13 +270,13 @@ static inline void volk_gnsssdr_32fc_convert_16ic_a_sse(lv_16sc_t* outputVector, outputVectorPtr += 8; } - for(unsigned int i = 0; i < (num_points%4)*2; i++) + for(unsigned int i = sse_iters * 8; i < num_points * 2; i++) { if(inputVectorPtr[i] > max_val) inputVectorPtr[i] = max_val; else if(inputVectorPtr[i] < min_val) inputVectorPtr[i] = min_val; - outputVectorPtr[i] = (int16_t)rintf(inputVectorPtr[i]); + *outputVectorPtr++ = (int16_t)rintf(*inputVectorPtr++); } } #endif /* LV_HAVE_SSE */ diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_8ic.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_8ic.h index 1bea02593..94cc2ecc7 100755 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_8ic.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_8ic.h @@ -49,7 +49,8 @@ */ static inline void volk_gnsssdr_32fc_convert_8ic_u_sse2(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points) { - const unsigned int sse_iters = num_points/8; + unsigned i = 0; + const unsigned int sse_iters = num_points * 2 / 16; float* inputVectorPtr = (float*)inputVector; int8_t* outputVectorPtr = (int8_t*)outputVector; @@ -64,7 +65,7 @@ static inline void volk_gnsssdr_32fc_convert_8ic_u_sse2(lv_8sc_t* outputVector, __m128 vmin_val = _mm_set_ps1(min_val); __m128 vmax_val = _mm_set_ps1(max_val); - for(unsigned int i = 0;i < sse_iters; i++) + for(;i < sse_iters; i++) { inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; @@ -90,13 +91,13 @@ static inline void volk_gnsssdr_32fc_convert_8ic_u_sse2(lv_8sc_t* outputVector, outputVectorPtr += 16; } - for(unsigned int i = 0; i < (num_points%8)*2; i++) + for(i = sse_iters * 16; i < num_points * 2; i++) { if(inputVectorPtr[i] > max_val) inputVectorPtr[i] = max_val; else if(inputVectorPtr[i] < min_val) inputVectorPtr[i] = min_val; - outputVectorPtr[i] = (int8_t)rintf(inputVectorPtr[i]); + *outputVectorPtr++ = (int8_t)rintf(*inputVectorPtr++); } } #endif /* LV_HAVE_SSE2 */ @@ -115,7 +116,7 @@ static inline void volk_gnsssdr_32fc_convert_8ic_generic(lv_8sc_t* outputVector, float min_val = -128; float max_val = 127; - for(unsigned int i = 0; i < num_points*2; i++) + for(unsigned int i = 0; i < num_points * 2; i++) { if(inputVectorPtr[i] > max_val) inputVectorPtr[i] = max_val; @@ -142,7 +143,7 @@ static inline void volk_gnsssdr_32fc_convert_8ic_generic(lv_8sc_t* outputVector, */ static inline void volk_gnsssdr_32fc_convert_8ic_a_sse2(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points) { - const unsigned int sse_iters = num_points/8; + const unsigned int sse_iters = num_points / 8; float* inputVectorPtr = (float*)inputVector; int8_t* outputVectorPtr = (int8_t*)outputVector; @@ -157,7 +158,7 @@ static inline void volk_gnsssdr_32fc_convert_8ic_a_sse2(lv_8sc_t* outputVector, __m128 vmin_val = _mm_set_ps1(min_val); __m128 vmax_val = _mm_set_ps1(max_val); - for(unsigned int i = 0;i < sse_iters; i++) + for(unsigned int i = 0; i < sse_iters; i++) { inputVal1 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; inputVal2 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; @@ -183,13 +184,13 @@ static inline void volk_gnsssdr_32fc_convert_8ic_a_sse2(lv_8sc_t* outputVector, outputVectorPtr += 16; } - for(unsigned int i = 0; i < (num_points%8)*2; i++) + for(unsigned int i = sse_iters * 16; i < num_points * 2; i++) { if(inputVectorPtr[i] > max_val) inputVectorPtr[i] = max_val; else if(inputVectorPtr[i] < min_val) inputVectorPtr[i] = min_val; - outputVectorPtr[i] = (int8_t)rintf(inputVectorPtr[i]); + *outputVectorPtr++ = (int8_t)rintf(*inputVectorPtr++); } } #endif /* LV_HAVE_SSE2 */