mirror of
https://github.com/gnss-sdr/gnss-sdr
synced 2025-01-18 21:23:02 +00:00
fix sse implementations
This commit is contained in:
parent
38d4d8aa9a
commit
46e3ce5ec2
@ -46,8 +46,9 @@
|
|||||||
\param outputVector The 16 bit output data buffer
|
\param outputVector The 16 bit output data buffer
|
||||||
\param num_points The number of data values to be converted
|
\param num_points The number of data values to be converted
|
||||||
*/
|
*/
|
||||||
static inline void volk_gnsssdr_32fc_convert_16ic_u_sse2(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){
|
static inline void volk_gnsssdr_32fc_convert_16ic_u_sse2(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
|
||||||
const unsigned int sse_iters = num_points/4;
|
{
|
||||||
|
const unsigned int sse_iters = num_points / 4;
|
||||||
|
|
||||||
float* inputVectorPtr = (float*)inputVector;
|
float* inputVectorPtr = (float*)inputVector;
|
||||||
int16_t* outputVectorPtr = (int16_t*)outputVector;
|
int16_t* outputVectorPtr = (int16_t*)outputVector;
|
||||||
@ -61,7 +62,8 @@ static inline void volk_gnsssdr_32fc_convert_16ic_u_sse2(lv_16sc_t* outputVector
|
|||||||
__m128 vmin_val = _mm_set_ps1(min_val);
|
__m128 vmin_val = _mm_set_ps1(min_val);
|
||||||
__m128 vmax_val = _mm_set_ps1(max_val);
|
__m128 vmax_val = _mm_set_ps1(max_val);
|
||||||
|
|
||||||
for(unsigned int i = 0;i < sse_iters; i++){
|
for(unsigned int i = 0; i < sse_iters; i++)
|
||||||
|
{
|
||||||
inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
|
inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
|
||||||
inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
|
inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
|
||||||
|
|
||||||
@ -76,29 +78,30 @@ static inline void volk_gnsssdr_32fc_convert_16ic_u_sse2(lv_16sc_t* outputVector
|
|||||||
|
|
||||||
_mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
|
_mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
|
||||||
outputVectorPtr += 8;
|
outputVectorPtr += 8;
|
||||||
}
|
}
|
||||||
|
|
||||||
for(unsigned int i = 0; i < (num_points%4)*2; i++)
|
for(unsigned int i = sse_iters * 8; i < num_points * 2; i++)
|
||||||
{
|
{
|
||||||
if(inputVectorPtr[i] > max_val)
|
if(inputVectorPtr[i] > max_val)
|
||||||
inputVectorPtr[i] = max_val;
|
inputVectorPtr[i] = max_val;
|
||||||
else if(inputVectorPtr[i] < min_val)
|
else if(inputVectorPtr[i] < min_val)
|
||||||
inputVectorPtr[i] = min_val;
|
inputVectorPtr[i] = min_val;
|
||||||
outputVectorPtr[i] = (int16_t)rintf(inputVectorPtr[i]);
|
*outputVectorPtr++ = (int16_t)rintf(*inputVectorPtr++);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif /* LV_HAVE_SSE2 */
|
#endif /* LV_HAVE_SSE2 */
|
||||||
|
|
||||||
#ifdef LV_HAVE_SSE
|
#ifdef LV_HAVE_SSE
|
||||||
#include <xmmintrin.h>
|
#include <xmmintrin.h> // __m64, __m128 ??
|
||||||
/*!
|
/*!
|
||||||
\brief Converts a float vector of 64 bits (32 bits each part) into a 32 integer vector (16 bits each part)
|
\brief Converts a float vector of 64 bits (32 bits each part) into a 32 integer vector (16 bits each part)
|
||||||
\param inputVector The floating point input data buffer
|
\param inputVector The floating point input data buffer
|
||||||
\param outputVector The 16 bit output data buffer
|
\param outputVector The 16 bit output data buffer
|
||||||
\param num_points The number of data values to be converted
|
\param num_points The number of data values to be converted
|
||||||
*/
|
*/
|
||||||
static inline void volk_gnsssdr_32fc_convert_16ic_u_sse(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){
|
static inline void volk_gnsssdr_32fc_convert_16ic_u_sse(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
|
||||||
const unsigned int sse_iters = num_points/4;
|
{
|
||||||
|
const unsigned int sse_iters = num_points / 4;
|
||||||
|
|
||||||
float* inputVectorPtr = (float*)inputVector;
|
float* inputVectorPtr = (float*)inputVector;
|
||||||
int16_t* outputVectorPtr = (int16_t*)outputVector;
|
int16_t* outputVectorPtr = (int16_t*)outputVector;
|
||||||
@ -112,7 +115,8 @@ static inline void volk_gnsssdr_32fc_convert_16ic_u_sse(lv_16sc_t* outputVector,
|
|||||||
__m128 vmin_val = _mm_set_ps1(min_val);
|
__m128 vmin_val = _mm_set_ps1(min_val);
|
||||||
__m128 vmax_val = _mm_set_ps1(max_val);
|
__m128 vmax_val = _mm_set_ps1(max_val);
|
||||||
|
|
||||||
for(unsigned int i = 0;i < sse_iters; i++){
|
for(unsigned int i = 0;i < sse_iters; i++)
|
||||||
|
{
|
||||||
inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
|
inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
|
||||||
inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
|
inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
|
||||||
|
|
||||||
@ -127,15 +131,15 @@ static inline void volk_gnsssdr_32fc_convert_16ic_u_sse(lv_16sc_t* outputVector,
|
|||||||
|
|
||||||
_mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
|
_mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
|
||||||
outputVectorPtr += 8;
|
outputVectorPtr += 8;
|
||||||
}
|
}
|
||||||
|
|
||||||
for(unsigned int i = 0; i < (num_points%4)*2; i++)
|
for(unsigned int i = sse_iters * 8; i < num_points*2; i++)
|
||||||
{
|
{
|
||||||
if(inputVectorPtr[i] > max_val)
|
if(inputVectorPtr[i] > max_val)
|
||||||
inputVectorPtr[i] = max_val;
|
inputVectorPtr[i] = max_val;
|
||||||
else if(inputVectorPtr[i] < min_val)
|
else if(inputVectorPtr[i] < min_val)
|
||||||
inputVectorPtr[i] = min_val;
|
inputVectorPtr[i] = min_val;
|
||||||
outputVectorPtr[i] = (int16_t)rintf(inputVectorPtr[i]);
|
*outputVectorPtr++ = (int16_t)rintf(*inputVectorPtr++);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif /* LV_HAVE_SSE */
|
#endif /* LV_HAVE_SSE */
|
||||||
@ -147,7 +151,8 @@ static inline void volk_gnsssdr_32fc_convert_16ic_u_sse(lv_16sc_t* outputVector,
|
|||||||
\param outputVector The 16 bit output data buffer
|
\param outputVector The 16 bit output data buffer
|
||||||
\param num_points The number of data values to be converted
|
\param num_points The number of data values to be converted
|
||||||
*/
|
*/
|
||||||
static inline void volk_gnsssdr_32fc_convert_16ic_generic(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){
|
static inline void volk_gnsssdr_32fc_convert_16ic_generic(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
|
||||||
|
{
|
||||||
float* inputVectorPtr = (float*)inputVector;
|
float* inputVectorPtr = (float*)inputVector;
|
||||||
int16_t* outputVectorPtr = (int16_t*)outputVector;
|
int16_t* outputVectorPtr = (int16_t*)outputVector;
|
||||||
float min_val = -32768;
|
float min_val = -32768;
|
||||||
@ -178,8 +183,9 @@ static inline void volk_gnsssdr_32fc_convert_16ic_generic(lv_16sc_t* outputVecto
|
|||||||
\param outputVector The 16 bit output data buffer
|
\param outputVector The 16 bit output data buffer
|
||||||
\param num_points The number of data values to be converted
|
\param num_points The number of data values to be converted
|
||||||
*/
|
*/
|
||||||
static inline void volk_gnsssdr_32fc_convert_16ic_a_sse2(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){
|
static inline void volk_gnsssdr_32fc_convert_16ic_a_sse2(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
|
||||||
const unsigned int sse_iters = num_points/4;
|
{
|
||||||
|
const unsigned int sse_iters = num_points / 4;
|
||||||
|
|
||||||
float* inputVectorPtr = (float*)inputVector;
|
float* inputVectorPtr = (float*)inputVector;
|
||||||
int16_t* outputVectorPtr = (int16_t*)outputVector;
|
int16_t* outputVectorPtr = (int16_t*)outputVector;
|
||||||
@ -193,7 +199,7 @@ static inline void volk_gnsssdr_32fc_convert_16ic_a_sse2(lv_16sc_t* outputVector
|
|||||||
__m128 vmin_val = _mm_set_ps1(min_val);
|
__m128 vmin_val = _mm_set_ps1(min_val);
|
||||||
__m128 vmax_val = _mm_set_ps1(max_val);
|
__m128 vmax_val = _mm_set_ps1(max_val);
|
||||||
|
|
||||||
for(unsigned int i = 0;i < sse_iters; i++)
|
for(unsigned int i = 0; i < sse_iters; i++)
|
||||||
{
|
{
|
||||||
inputVal1 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
|
inputVal1 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
|
||||||
inputVal2 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
|
inputVal2 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
|
||||||
@ -211,13 +217,13 @@ static inline void volk_gnsssdr_32fc_convert_16ic_a_sse2(lv_16sc_t* outputVector
|
|||||||
outputVectorPtr += 8;
|
outputVectorPtr += 8;
|
||||||
}
|
}
|
||||||
|
|
||||||
for(unsigned int i = 0; i < (num_points%4)*2; i++)
|
for(unsigned int i = sse_iters * 8; i < num_points * 2; i++)
|
||||||
{
|
{
|
||||||
if(inputVectorPtr[i] > max_val)
|
if(inputVectorPtr[i] > max_val)
|
||||||
inputVectorPtr[i] = max_val;
|
inputVectorPtr[i] = max_val;
|
||||||
else if(inputVectorPtr[i] < min_val)
|
else if(inputVectorPtr[i] < min_val)
|
||||||
inputVectorPtr[i] = min_val;
|
inputVectorPtr[i] = min_val;
|
||||||
outputVectorPtr[i] = (int16_t)rintf(inputVectorPtr[i]);
|
*outputVectorPtr++ = (int16_t)rintf(*inputVectorPtr++);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif /* LV_HAVE_SSE2 */
|
#endif /* LV_HAVE_SSE2 */
|
||||||
@ -264,13 +270,13 @@ static inline void volk_gnsssdr_32fc_convert_16ic_a_sse(lv_16sc_t* outputVector,
|
|||||||
outputVectorPtr += 8;
|
outputVectorPtr += 8;
|
||||||
}
|
}
|
||||||
|
|
||||||
for(unsigned int i = 0; i < (num_points%4)*2; i++)
|
for(unsigned int i = sse_iters * 8; i < num_points * 2; i++)
|
||||||
{
|
{
|
||||||
if(inputVectorPtr[i] > max_val)
|
if(inputVectorPtr[i] > max_val)
|
||||||
inputVectorPtr[i] = max_val;
|
inputVectorPtr[i] = max_val;
|
||||||
else if(inputVectorPtr[i] < min_val)
|
else if(inputVectorPtr[i] < min_val)
|
||||||
inputVectorPtr[i] = min_val;
|
inputVectorPtr[i] = min_val;
|
||||||
outputVectorPtr[i] = (int16_t)rintf(inputVectorPtr[i]);
|
*outputVectorPtr++ = (int16_t)rintf(*inputVectorPtr++);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif /* LV_HAVE_SSE */
|
#endif /* LV_HAVE_SSE */
|
||||||
|
@ -49,7 +49,8 @@
|
|||||||
*/
|
*/
|
||||||
static inline void volk_gnsssdr_32fc_convert_8ic_u_sse2(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
|
static inline void volk_gnsssdr_32fc_convert_8ic_u_sse2(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
|
||||||
{
|
{
|
||||||
const unsigned int sse_iters = num_points/8;
|
unsigned i = 0;
|
||||||
|
const unsigned int sse_iters = num_points * 2 / 16;
|
||||||
|
|
||||||
float* inputVectorPtr = (float*)inputVector;
|
float* inputVectorPtr = (float*)inputVector;
|
||||||
int8_t* outputVectorPtr = (int8_t*)outputVector;
|
int8_t* outputVectorPtr = (int8_t*)outputVector;
|
||||||
@ -64,7 +65,7 @@ static inline void volk_gnsssdr_32fc_convert_8ic_u_sse2(lv_8sc_t* outputVector,
|
|||||||
__m128 vmin_val = _mm_set_ps1(min_val);
|
__m128 vmin_val = _mm_set_ps1(min_val);
|
||||||
__m128 vmax_val = _mm_set_ps1(max_val);
|
__m128 vmax_val = _mm_set_ps1(max_val);
|
||||||
|
|
||||||
for(unsigned int i = 0;i < sse_iters; i++)
|
for(;i < sse_iters; i++)
|
||||||
{
|
{
|
||||||
inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
|
inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
|
||||||
inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
|
inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
|
||||||
@ -90,13 +91,13 @@ static inline void volk_gnsssdr_32fc_convert_8ic_u_sse2(lv_8sc_t* outputVector,
|
|||||||
outputVectorPtr += 16;
|
outputVectorPtr += 16;
|
||||||
}
|
}
|
||||||
|
|
||||||
for(unsigned int i = 0; i < (num_points%8)*2; i++)
|
for(i = sse_iters * 16; i < num_points * 2; i++)
|
||||||
{
|
{
|
||||||
if(inputVectorPtr[i] > max_val)
|
if(inputVectorPtr[i] > max_val)
|
||||||
inputVectorPtr[i] = max_val;
|
inputVectorPtr[i] = max_val;
|
||||||
else if(inputVectorPtr[i] < min_val)
|
else if(inputVectorPtr[i] < min_val)
|
||||||
inputVectorPtr[i] = min_val;
|
inputVectorPtr[i] = min_val;
|
||||||
outputVectorPtr[i] = (int8_t)rintf(inputVectorPtr[i]);
|
*outputVectorPtr++ = (int8_t)rintf(*inputVectorPtr++);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif /* LV_HAVE_SSE2 */
|
#endif /* LV_HAVE_SSE2 */
|
||||||
@ -115,7 +116,7 @@ static inline void volk_gnsssdr_32fc_convert_8ic_generic(lv_8sc_t* outputVector,
|
|||||||
float min_val = -128;
|
float min_val = -128;
|
||||||
float max_val = 127;
|
float max_val = 127;
|
||||||
|
|
||||||
for(unsigned int i = 0; i < num_points*2; i++)
|
for(unsigned int i = 0; i < num_points * 2; i++)
|
||||||
{
|
{
|
||||||
if(inputVectorPtr[i] > max_val)
|
if(inputVectorPtr[i] > max_val)
|
||||||
inputVectorPtr[i] = max_val;
|
inputVectorPtr[i] = max_val;
|
||||||
@ -142,7 +143,7 @@ static inline void volk_gnsssdr_32fc_convert_8ic_generic(lv_8sc_t* outputVector,
|
|||||||
*/
|
*/
|
||||||
static inline void volk_gnsssdr_32fc_convert_8ic_a_sse2(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
|
static inline void volk_gnsssdr_32fc_convert_8ic_a_sse2(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
|
||||||
{
|
{
|
||||||
const unsigned int sse_iters = num_points/8;
|
const unsigned int sse_iters = num_points / 8;
|
||||||
|
|
||||||
float* inputVectorPtr = (float*)inputVector;
|
float* inputVectorPtr = (float*)inputVector;
|
||||||
int8_t* outputVectorPtr = (int8_t*)outputVector;
|
int8_t* outputVectorPtr = (int8_t*)outputVector;
|
||||||
@ -157,7 +158,7 @@ static inline void volk_gnsssdr_32fc_convert_8ic_a_sse2(lv_8sc_t* outputVector,
|
|||||||
__m128 vmin_val = _mm_set_ps1(min_val);
|
__m128 vmin_val = _mm_set_ps1(min_val);
|
||||||
__m128 vmax_val = _mm_set_ps1(max_val);
|
__m128 vmax_val = _mm_set_ps1(max_val);
|
||||||
|
|
||||||
for(unsigned int i = 0;i < sse_iters; i++)
|
for(unsigned int i = 0; i < sse_iters; i++)
|
||||||
{
|
{
|
||||||
inputVal1 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
|
inputVal1 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
|
||||||
inputVal2 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
|
inputVal2 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
|
||||||
@ -183,13 +184,13 @@ static inline void volk_gnsssdr_32fc_convert_8ic_a_sse2(lv_8sc_t* outputVector,
|
|||||||
outputVectorPtr += 16;
|
outputVectorPtr += 16;
|
||||||
}
|
}
|
||||||
|
|
||||||
for(unsigned int i = 0; i < (num_points%8)*2; i++)
|
for(unsigned int i = sse_iters * 16; i < num_points * 2; i++)
|
||||||
{
|
{
|
||||||
if(inputVectorPtr[i] > max_val)
|
if(inputVectorPtr[i] > max_val)
|
||||||
inputVectorPtr[i] = max_val;
|
inputVectorPtr[i] = max_val;
|
||||||
else if(inputVectorPtr[i] < min_val)
|
else if(inputVectorPtr[i] < min_val)
|
||||||
inputVectorPtr[i] = min_val;
|
inputVectorPtr[i] = min_val;
|
||||||
outputVectorPtr[i] = (int8_t)rintf(inputVectorPtr[i]);
|
*outputVectorPtr++ = (int8_t)rintf(*inputVectorPtr++);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif /* LV_HAVE_SSE2 */
|
#endif /* LV_HAVE_SSE2 */
|
||||||
|
Loading…
Reference in New Issue
Block a user