mirror of
https://github.com/gnss-sdr/gnss-sdr
synced 2024-12-14 12:10:34 +00:00
fix sse implementations
This commit is contained in:
parent
38d4d8aa9a
commit
46e3ce5ec2
@ -46,7 +46,8 @@
|
||||
\param outputVector The 16 bit output data buffer
|
||||
\param num_points The number of data values to be converted
|
||||
*/
|
||||
static inline void volk_gnsssdr_32fc_convert_16ic_u_sse2(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){
|
||||
static inline void volk_gnsssdr_32fc_convert_16ic_u_sse2(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
|
||||
{
|
||||
const unsigned int sse_iters = num_points / 4;
|
||||
|
||||
float* inputVectorPtr = (float*)inputVector;
|
||||
@ -61,7 +62,8 @@ static inline void volk_gnsssdr_32fc_convert_16ic_u_sse2(lv_16sc_t* outputVector
|
||||
__m128 vmin_val = _mm_set_ps1(min_val);
|
||||
__m128 vmax_val = _mm_set_ps1(max_val);
|
||||
|
||||
for(unsigned int i = 0;i < sse_iters; i++){
|
||||
for(unsigned int i = 0; i < sse_iters; i++)
|
||||
{
|
||||
inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
|
||||
inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
|
||||
|
||||
@ -78,26 +80,27 @@ static inline void volk_gnsssdr_32fc_convert_16ic_u_sse2(lv_16sc_t* outputVector
|
||||
outputVectorPtr += 8;
|
||||
}
|
||||
|
||||
for(unsigned int i = 0; i < (num_points%4)*2; i++)
|
||||
for(unsigned int i = sse_iters * 8; i < num_points * 2; i++)
|
||||
{
|
||||
if(inputVectorPtr[i] > max_val)
|
||||
inputVectorPtr[i] = max_val;
|
||||
else if(inputVectorPtr[i] < min_val)
|
||||
inputVectorPtr[i] = min_val;
|
||||
outputVectorPtr[i] = (int16_t)rintf(inputVectorPtr[i]);
|
||||
*outputVectorPtr++ = (int16_t)rintf(*inputVectorPtr++);
|
||||
}
|
||||
}
|
||||
#endif /* LV_HAVE_SSE2 */
|
||||
|
||||
#ifdef LV_HAVE_SSE
|
||||
#include <xmmintrin.h>
|
||||
#include <xmmintrin.h> // __m64, __m128 ??
|
||||
/*!
|
||||
\brief Converts a complex vector of 32-bit floats (64 bits per sample) into a complex vector of 16-bit integers (32 bits per sample)
|
||||
\param inputVector The floating point input data buffer
|
||||
\param outputVector The 16 bit output data buffer
|
||||
\param num_points The number of data values to be converted
|
||||
*/
|
||||
static inline void volk_gnsssdr_32fc_convert_16ic_u_sse(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){
|
||||
static inline void volk_gnsssdr_32fc_convert_16ic_u_sse(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
|
||||
{
|
||||
const unsigned int sse_iters = num_points / 4;
|
||||
|
||||
float* inputVectorPtr = (float*)inputVector;
|
||||
@ -112,7 +115,8 @@ static inline void volk_gnsssdr_32fc_convert_16ic_u_sse(lv_16sc_t* outputVector,
|
||||
__m128 vmin_val = _mm_set_ps1(min_val);
|
||||
__m128 vmax_val = _mm_set_ps1(max_val);
|
||||
|
||||
for(unsigned int i = 0;i < sse_iters; i++){
|
||||
for(unsigned int i = 0;i < sse_iters; i++)
|
||||
{
|
||||
inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
|
||||
inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
|
||||
|
||||
@ -129,13 +133,13 @@ static inline void volk_gnsssdr_32fc_convert_16ic_u_sse(lv_16sc_t* outputVector,
|
||||
outputVectorPtr += 8;
|
||||
}
|
||||
|
||||
for(unsigned int i = 0; i < (num_points%4)*2; i++)
|
||||
for(unsigned int i = sse_iters * 8; i < num_points*2; i++)
|
||||
{
|
||||
if(inputVectorPtr[i] > max_val)
|
||||
inputVectorPtr[i] = max_val;
|
||||
else if(inputVectorPtr[i] < min_val)
|
||||
inputVectorPtr[i] = min_val;
|
||||
outputVectorPtr[i] = (int16_t)rintf(inputVectorPtr[i]);
|
||||
*outputVectorPtr++ = (int16_t)rintf(*inputVectorPtr++);
|
||||
}
|
||||
}
|
||||
#endif /* LV_HAVE_SSE */
|
||||
@ -147,7 +151,8 @@ static inline void volk_gnsssdr_32fc_convert_16ic_u_sse(lv_16sc_t* outputVector,
|
||||
\param outputVector The 16 bit output data buffer
|
||||
\param num_points The number of data values to be converted
|
||||
*/
|
||||
static inline void volk_gnsssdr_32fc_convert_16ic_generic(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){
|
||||
static inline void volk_gnsssdr_32fc_convert_16ic_generic(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
|
||||
{
|
||||
float* inputVectorPtr = (float*)inputVector;
|
||||
int16_t* outputVectorPtr = (int16_t*)outputVector;
|
||||
float min_val = -32768;
|
||||
@ -178,7 +183,8 @@ static inline void volk_gnsssdr_32fc_convert_16ic_generic(lv_16sc_t* outputVecto
|
||||
\param outputVector The 16 bit output data buffer
|
||||
\param num_points The number of data values to be converted
|
||||
*/
|
||||
static inline void volk_gnsssdr_32fc_convert_16ic_a_sse2(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){
|
||||
static inline void volk_gnsssdr_32fc_convert_16ic_a_sse2(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
|
||||
{
|
||||
const unsigned int sse_iters = num_points / 4;
|
||||
|
||||
float* inputVectorPtr = (float*)inputVector;
|
||||
@ -211,13 +217,13 @@ static inline void volk_gnsssdr_32fc_convert_16ic_a_sse2(lv_16sc_t* outputVector
|
||||
outputVectorPtr += 8;
|
||||
}
|
||||
|
||||
for(unsigned int i = 0; i < (num_points%4)*2; i++)
|
||||
for(unsigned int i = sse_iters * 8; i < num_points * 2; i++)
|
||||
{
|
||||
if(inputVectorPtr[i] > max_val)
|
||||
inputVectorPtr[i] = max_val;
|
||||
else if(inputVectorPtr[i] < min_val)
|
||||
inputVectorPtr[i] = min_val;
|
||||
outputVectorPtr[i] = (int16_t)rintf(inputVectorPtr[i]);
|
||||
*outputVectorPtr++ = (int16_t)rintf(*inputVectorPtr++);
|
||||
}
|
||||
}
|
||||
#endif /* LV_HAVE_SSE2 */
|
||||
@ -264,13 +270,13 @@ static inline void volk_gnsssdr_32fc_convert_16ic_a_sse(lv_16sc_t* outputVector,
|
||||
outputVectorPtr += 8;
|
||||
}
|
||||
|
||||
for(unsigned int i = 0; i < (num_points%4)*2; i++)
|
||||
for(unsigned int i = sse_iters * 8; i < num_points * 2; i++)
|
||||
{
|
||||
if(inputVectorPtr[i] > max_val)
|
||||
inputVectorPtr[i] = max_val;
|
||||
else if(inputVectorPtr[i] < min_val)
|
||||
inputVectorPtr[i] = min_val;
|
||||
outputVectorPtr[i] = (int16_t)rintf(inputVectorPtr[i]);
|
||||
*outputVectorPtr++ = (int16_t)rintf(*inputVectorPtr++);
|
||||
}
|
||||
}
|
||||
#endif /* LV_HAVE_SSE */
|
||||
|
@ -49,7 +49,8 @@
|
||||
*/
|
||||
static inline void volk_gnsssdr_32fc_convert_8ic_u_sse2(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
|
||||
{
|
||||
const unsigned int sse_iters = num_points/8;
|
||||
unsigned i = 0;
|
||||
const unsigned int sse_iters = num_points * 2 / 16;
|
||||
|
||||
float* inputVectorPtr = (float*)inputVector;
|
||||
int8_t* outputVectorPtr = (int8_t*)outputVector;
|
||||
@ -64,7 +65,7 @@ static inline void volk_gnsssdr_32fc_convert_8ic_u_sse2(lv_8sc_t* outputVector,
|
||||
__m128 vmin_val = _mm_set_ps1(min_val);
|
||||
__m128 vmax_val = _mm_set_ps1(max_val);
|
||||
|
||||
for(unsigned int i = 0;i < sse_iters; i++)
|
||||
for(;i < sse_iters; i++)
|
||||
{
|
||||
inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
|
||||
inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
|
||||
@ -90,13 +91,13 @@ static inline void volk_gnsssdr_32fc_convert_8ic_u_sse2(lv_8sc_t* outputVector,
|
||||
outputVectorPtr += 16;
|
||||
}
|
||||
|
||||
for(unsigned int i = 0; i < (num_points%8)*2; i++)
|
||||
for(i = sse_iters * 16; i < num_points * 2; i++)
|
||||
{
|
||||
if(inputVectorPtr[i] > max_val)
|
||||
inputVectorPtr[i] = max_val;
|
||||
else if(inputVectorPtr[i] < min_val)
|
||||
inputVectorPtr[i] = min_val;
|
||||
outputVectorPtr[i] = (int8_t)rintf(inputVectorPtr[i]);
|
||||
*outputVectorPtr++ = (int8_t)rintf(*inputVectorPtr++);
|
||||
}
|
||||
}
|
||||
#endif /* LV_HAVE_SSE2 */
|
||||
@ -183,13 +184,13 @@ static inline void volk_gnsssdr_32fc_convert_8ic_a_sse2(lv_8sc_t* outputVector,
|
||||
outputVectorPtr += 16;
|
||||
}
|
||||
|
||||
for(unsigned int i = 0; i < (num_points%8)*2; i++)
|
||||
for(unsigned int i = sse_iters * 16; i < num_points * 2; i++)
|
||||
{
|
||||
if(inputVectorPtr[i] > max_val)
|
||||
inputVectorPtr[i] = max_val;
|
||||
else if(inputVectorPtr[i] < min_val)
|
||||
inputVectorPtr[i] = min_val;
|
||||
outputVectorPtr[i] = (int8_t)rintf(inputVectorPtr[i]);
|
||||
*outputVectorPtr++ = (int8_t)rintf(*inputVectorPtr++);
|
||||
}
|
||||
}
|
||||
#endif /* LV_HAVE_SSE2 */
|
||||
|
Loading…
Reference in New Issue
Block a user