1
0
mirror of https://github.com/gnss-sdr/gnss-sdr synced 2024-12-15 04:30:33 +00:00

fix sse implementations

This commit is contained in:
Carles Fernandez 2016-01-16 22:39:15 +01:00
parent 38d4d8aa9a
commit 46e3ce5ec2
2 changed files with 37 additions and 30 deletions

View File

@ -46,8 +46,9 @@
\param outputVector The 16 bit output data buffer \param outputVector The 16 bit output data buffer
\param num_points The number of data values to be converted \param num_points The number of data values to be converted
*/ */
static inline void volk_gnsssdr_32fc_convert_16ic_u_sse2(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){ static inline void volk_gnsssdr_32fc_convert_16ic_u_sse2(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
const unsigned int sse_iters = num_points/4; {
const unsigned int sse_iters = num_points / 4;
float* inputVectorPtr = (float*)inputVector; float* inputVectorPtr = (float*)inputVector;
int16_t* outputVectorPtr = (int16_t*)outputVector; int16_t* outputVectorPtr = (int16_t*)outputVector;
@ -61,7 +62,8 @@ static inline void volk_gnsssdr_32fc_convert_16ic_u_sse2(lv_16sc_t* outputVector
__m128 vmin_val = _mm_set_ps1(min_val); __m128 vmin_val = _mm_set_ps1(min_val);
__m128 vmax_val = _mm_set_ps1(max_val); __m128 vmax_val = _mm_set_ps1(max_val);
for(unsigned int i = 0;i < sse_iters; i++){ for(unsigned int i = 0; i < sse_iters; i++)
{
inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
@ -76,29 +78,30 @@ static inline void volk_gnsssdr_32fc_convert_16ic_u_sse2(lv_16sc_t* outputVector
_mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
outputVectorPtr += 8; outputVectorPtr += 8;
} }
for(unsigned int i = 0; i < (num_points%4)*2; i++) for(unsigned int i = sse_iters * 8; i < num_points * 2; i++)
{ {
if(inputVectorPtr[i] > max_val) if(inputVectorPtr[i] > max_val)
inputVectorPtr[i] = max_val; inputVectorPtr[i] = max_val;
else if(inputVectorPtr[i] < min_val) else if(inputVectorPtr[i] < min_val)
inputVectorPtr[i] = min_val; inputVectorPtr[i] = min_val;
outputVectorPtr[i] = (int16_t)rintf(inputVectorPtr[i]); *outputVectorPtr++ = (int16_t)rintf(*inputVectorPtr++);
} }
} }
#endif /* LV_HAVE_SSE2 */ #endif /* LV_HAVE_SSE2 */
#ifdef LV_HAVE_SSE #ifdef LV_HAVE_SSE
#include <xmmintrin.h> #include <xmmintrin.h> // __m64, __m128 ??
/*! /*!
\brief Converts a float vector of 64 bits (32 bits each part) into a 32 integer vector (16 bits each part) \brief Converts a float vector of 64 bits (32 bits each part) into a 32 integer vector (16 bits each part)
\param inputVector The floating point input data buffer \param inputVector The floating point input data buffer
\param outputVector The 16 bit output data buffer \param outputVector The 16 bit output data buffer
\param num_points The number of data values to be converted \param num_points The number of data values to be converted
*/ */
static inline void volk_gnsssdr_32fc_convert_16ic_u_sse(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){ static inline void volk_gnsssdr_32fc_convert_16ic_u_sse(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
const unsigned int sse_iters = num_points/4; {
const unsigned int sse_iters = num_points / 4;
float* inputVectorPtr = (float*)inputVector; float* inputVectorPtr = (float*)inputVector;
int16_t* outputVectorPtr = (int16_t*)outputVector; int16_t* outputVectorPtr = (int16_t*)outputVector;
@ -112,7 +115,8 @@ static inline void volk_gnsssdr_32fc_convert_16ic_u_sse(lv_16sc_t* outputVector,
__m128 vmin_val = _mm_set_ps1(min_val); __m128 vmin_val = _mm_set_ps1(min_val);
__m128 vmax_val = _mm_set_ps1(max_val); __m128 vmax_val = _mm_set_ps1(max_val);
for(unsigned int i = 0;i < sse_iters; i++){ for(unsigned int i = 0;i < sse_iters; i++)
{
inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
@ -127,15 +131,15 @@ static inline void volk_gnsssdr_32fc_convert_16ic_u_sse(lv_16sc_t* outputVector,
_mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
outputVectorPtr += 8; outputVectorPtr += 8;
} }
for(unsigned int i = 0; i < (num_points%4)*2; i++) for(unsigned int i = sse_iters * 8; i < num_points*2; i++)
{ {
if(inputVectorPtr[i] > max_val) if(inputVectorPtr[i] > max_val)
inputVectorPtr[i] = max_val; inputVectorPtr[i] = max_val;
else if(inputVectorPtr[i] < min_val) else if(inputVectorPtr[i] < min_val)
inputVectorPtr[i] = min_val; inputVectorPtr[i] = min_val;
outputVectorPtr[i] = (int16_t)rintf(inputVectorPtr[i]); *outputVectorPtr++ = (int16_t)rintf(*inputVectorPtr++);
} }
} }
#endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_SSE */
@ -147,7 +151,8 @@ static inline void volk_gnsssdr_32fc_convert_16ic_u_sse(lv_16sc_t* outputVector,
\param outputVector The 16 bit output data buffer \param outputVector The 16 bit output data buffer
\param num_points The number of data values to be converted \param num_points The number of data values to be converted
*/ */
static inline void volk_gnsssdr_32fc_convert_16ic_generic(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){ static inline void volk_gnsssdr_32fc_convert_16ic_generic(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
{
float* inputVectorPtr = (float*)inputVector; float* inputVectorPtr = (float*)inputVector;
int16_t* outputVectorPtr = (int16_t*)outputVector; int16_t* outputVectorPtr = (int16_t*)outputVector;
float min_val = -32768; float min_val = -32768;
@ -178,8 +183,9 @@ static inline void volk_gnsssdr_32fc_convert_16ic_generic(lv_16sc_t* outputVecto
\param outputVector The 16 bit output data buffer \param outputVector The 16 bit output data buffer
\param num_points The number of data values to be converted \param num_points The number of data values to be converted
*/ */
static inline void volk_gnsssdr_32fc_convert_16ic_a_sse2(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){ static inline void volk_gnsssdr_32fc_convert_16ic_a_sse2(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
const unsigned int sse_iters = num_points/4; {
const unsigned int sse_iters = num_points / 4;
float* inputVectorPtr = (float*)inputVector; float* inputVectorPtr = (float*)inputVector;
int16_t* outputVectorPtr = (int16_t*)outputVector; int16_t* outputVectorPtr = (int16_t*)outputVector;
@ -193,7 +199,7 @@ static inline void volk_gnsssdr_32fc_convert_16ic_a_sse2(lv_16sc_t* outputVector
__m128 vmin_val = _mm_set_ps1(min_val); __m128 vmin_val = _mm_set_ps1(min_val);
__m128 vmax_val = _mm_set_ps1(max_val); __m128 vmax_val = _mm_set_ps1(max_val);
for(unsigned int i = 0;i < sse_iters; i++) for(unsigned int i = 0; i < sse_iters; i++)
{ {
inputVal1 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; inputVal1 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
inputVal2 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; inputVal2 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
@ -211,13 +217,13 @@ static inline void volk_gnsssdr_32fc_convert_16ic_a_sse2(lv_16sc_t* outputVector
outputVectorPtr += 8; outputVectorPtr += 8;
} }
for(unsigned int i = 0; i < (num_points%4)*2; i++) for(unsigned int i = sse_iters * 8; i < num_points * 2; i++)
{ {
if(inputVectorPtr[i] > max_val) if(inputVectorPtr[i] > max_val)
inputVectorPtr[i] = max_val; inputVectorPtr[i] = max_val;
else if(inputVectorPtr[i] < min_val) else if(inputVectorPtr[i] < min_val)
inputVectorPtr[i] = min_val; inputVectorPtr[i] = min_val;
outputVectorPtr[i] = (int16_t)rintf(inputVectorPtr[i]); *outputVectorPtr++ = (int16_t)rintf(*inputVectorPtr++);
} }
} }
#endif /* LV_HAVE_SSE2 */ #endif /* LV_HAVE_SSE2 */
@ -264,13 +270,13 @@ static inline void volk_gnsssdr_32fc_convert_16ic_a_sse(lv_16sc_t* outputVector,
outputVectorPtr += 8; outputVectorPtr += 8;
} }
for(unsigned int i = 0; i < (num_points%4)*2; i++) for(unsigned int i = sse_iters * 8; i < num_points * 2; i++)
{ {
if(inputVectorPtr[i] > max_val) if(inputVectorPtr[i] > max_val)
inputVectorPtr[i] = max_val; inputVectorPtr[i] = max_val;
else if(inputVectorPtr[i] < min_val) else if(inputVectorPtr[i] < min_val)
inputVectorPtr[i] = min_val; inputVectorPtr[i] = min_val;
outputVectorPtr[i] = (int16_t)rintf(inputVectorPtr[i]); *outputVectorPtr++ = (int16_t)rintf(*inputVectorPtr++);
} }
} }
#endif /* LV_HAVE_SSE */ #endif /* LV_HAVE_SSE */

View File

@ -49,7 +49,8 @@
*/ */
static inline void volk_gnsssdr_32fc_convert_8ic_u_sse2(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points) static inline void volk_gnsssdr_32fc_convert_8ic_u_sse2(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
{ {
const unsigned int sse_iters = num_points/8; unsigned i = 0;
const unsigned int sse_iters = num_points * 2 / 16;
float* inputVectorPtr = (float*)inputVector; float* inputVectorPtr = (float*)inputVector;
int8_t* outputVectorPtr = (int8_t*)outputVector; int8_t* outputVectorPtr = (int8_t*)outputVector;
@ -64,7 +65,7 @@ static inline void volk_gnsssdr_32fc_convert_8ic_u_sse2(lv_8sc_t* outputVector,
__m128 vmin_val = _mm_set_ps1(min_val); __m128 vmin_val = _mm_set_ps1(min_val);
__m128 vmax_val = _mm_set_ps1(max_val); __m128 vmax_val = _mm_set_ps1(max_val);
for(unsigned int i = 0;i < sse_iters; i++) for(;i < sse_iters; i++)
{ {
inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
@ -90,13 +91,13 @@ static inline void volk_gnsssdr_32fc_convert_8ic_u_sse2(lv_8sc_t* outputVector,
outputVectorPtr += 16; outputVectorPtr += 16;
} }
for(unsigned int i = 0; i < (num_points%8)*2; i++) for(i = sse_iters * 16; i < num_points * 2; i++)
{ {
if(inputVectorPtr[i] > max_val) if(inputVectorPtr[i] > max_val)
inputVectorPtr[i] = max_val; inputVectorPtr[i] = max_val;
else if(inputVectorPtr[i] < min_val) else if(inputVectorPtr[i] < min_val)
inputVectorPtr[i] = min_val; inputVectorPtr[i] = min_val;
outputVectorPtr[i] = (int8_t)rintf(inputVectorPtr[i]); *outputVectorPtr++ = (int8_t)rintf(*inputVectorPtr++);
} }
} }
#endif /* LV_HAVE_SSE2 */ #endif /* LV_HAVE_SSE2 */
@ -115,7 +116,7 @@ static inline void volk_gnsssdr_32fc_convert_8ic_generic(lv_8sc_t* outputVector,
float min_val = -128; float min_val = -128;
float max_val = 127; float max_val = 127;
for(unsigned int i = 0; i < num_points*2; i++) for(unsigned int i = 0; i < num_points * 2; i++)
{ {
if(inputVectorPtr[i] > max_val) if(inputVectorPtr[i] > max_val)
inputVectorPtr[i] = max_val; inputVectorPtr[i] = max_val;
@ -142,7 +143,7 @@ static inline void volk_gnsssdr_32fc_convert_8ic_generic(lv_8sc_t* outputVector,
*/ */
static inline void volk_gnsssdr_32fc_convert_8ic_a_sse2(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points) static inline void volk_gnsssdr_32fc_convert_8ic_a_sse2(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
{ {
const unsigned int sse_iters = num_points/8; const unsigned int sse_iters = num_points / 8;
float* inputVectorPtr = (float*)inputVector; float* inputVectorPtr = (float*)inputVector;
int8_t* outputVectorPtr = (int8_t*)outputVector; int8_t* outputVectorPtr = (int8_t*)outputVector;
@ -157,7 +158,7 @@ static inline void volk_gnsssdr_32fc_convert_8ic_a_sse2(lv_8sc_t* outputVector,
__m128 vmin_val = _mm_set_ps1(min_val); __m128 vmin_val = _mm_set_ps1(min_val);
__m128 vmax_val = _mm_set_ps1(max_val); __m128 vmax_val = _mm_set_ps1(max_val);
for(unsigned int i = 0;i < sse_iters; i++) for(unsigned int i = 0; i < sse_iters; i++)
{ {
inputVal1 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; inputVal1 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
inputVal2 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; inputVal2 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
@ -183,13 +184,13 @@ static inline void volk_gnsssdr_32fc_convert_8ic_a_sse2(lv_8sc_t* outputVector,
outputVectorPtr += 16; outputVectorPtr += 16;
} }
for(unsigned int i = 0; i < (num_points%8)*2; i++) for(unsigned int i = sse_iters * 16; i < num_points * 2; i++)
{ {
if(inputVectorPtr[i] > max_val) if(inputVectorPtr[i] > max_val)
inputVectorPtr[i] = max_val; inputVectorPtr[i] = max_val;
else if(inputVectorPtr[i] < min_val) else if(inputVectorPtr[i] < min_val)
inputVectorPtr[i] = min_val; inputVectorPtr[i] = min_val;
outputVectorPtr[i] = (int8_t)rintf(inputVectorPtr[i]); *outputVectorPtr++ = (int8_t)rintf(*inputVectorPtr++);
} }
} }
#endif /* LV_HAVE_SSE2 */ #endif /* LV_HAVE_SSE2 */