mirror of
https://github.com/gnss-sdr/gnss-sdr
synced 2025-01-29 02:14:51 +00:00
Merge branch 'next' of https://github.com/gnss-sdr/gnss-sdr into next
This commit is contained in:
commit
265caeda33
@ -164,6 +164,58 @@ static inline void volk_gnsssdr_32fc_convert_16ic_u_sse(lv_16sc_t* outputVector,
|
||||
}
|
||||
#endif /* LV_HAVE_SSE */
|
||||
|
||||
#ifdef LV_HAVE_AVX2
|
||||
#include <immintrin.h>
|
||||
|
||||
static inline void volk_gnsssdr_32fc_convert_16ic_u_avx2(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
|
||||
{
|
||||
const unsigned int avx2_iters = num_points / 8;
|
||||
|
||||
float* inputVectorPtr = (float*)inputVector;
|
||||
int16_t* outputVectorPtr = (int16_t*)outputVector;
|
||||
float aux;
|
||||
unsigned int i;
|
||||
const float min_val = (float)SHRT_MIN; ///todo Something off here, compiler does not perform right cast
|
||||
const float max_val = (float)SHRT_MAX;
|
||||
|
||||
__m256 inputVal1, inputVal2;
|
||||
__m256i intInputVal1, intInputVal2;
|
||||
__m256 ret1, ret2;
|
||||
const __m256 vmin_val = _mm256_set1_ps(min_val);
|
||||
const __m256 vmax_val = _mm256_set1_ps(max_val);
|
||||
|
||||
for(i = 0; i < avx2_iters; i++)
|
||||
{
|
||||
inputVal1 = _mm256_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 8;
|
||||
inputVal2 = _mm256_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 8;
|
||||
__VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 16);
|
||||
|
||||
// Clip
|
||||
ret1 = _mm256_max_ps(_mm256_min_ps(inputVal1, vmax_val), vmin_val);
|
||||
ret2 = _mm256_max_ps(_mm256_min_ps(inputVal2, vmax_val), vmin_val);
|
||||
|
||||
intInputVal1 = _mm256_cvtps_epi32(ret1);
|
||||
intInputVal2 = _mm256_cvtps_epi32(ret2);
|
||||
|
||||
intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
|
||||
intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
|
||||
|
||||
_mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal1);
|
||||
outputVectorPtr += 16;
|
||||
}
|
||||
|
||||
for(i = avx2_iters * 16; i < num_points * 2; i++)
|
||||
{
|
||||
aux = *inputVectorPtr++;
|
||||
if(aux > max_val)
|
||||
aux = max_val;
|
||||
else if(aux < min_val)
|
||||
aux = min_val;
|
||||
*outputVectorPtr++ = (int16_t)rintf(aux);
|
||||
}
|
||||
}
|
||||
#endif /* LV_HAVE_AVX2 */
|
||||
|
||||
|
||||
#ifdef LV_HAVE_SSE2
|
||||
#include <emmintrin.h>
|
||||
@ -269,6 +321,59 @@ static inline void volk_gnsssdr_32fc_convert_16ic_a_sse(lv_16sc_t* outputVector,
|
||||
#endif /* LV_HAVE_SSE */
|
||||
|
||||
|
||||
#ifdef LV_HAVE_AVX2
|
||||
#include <immintrin.h>
|
||||
|
||||
static inline void volk_gnsssdr_32fc_convert_16ic_a_avx2(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
|
||||
{
|
||||
const unsigned int avx2_iters = num_points / 8;
|
||||
|
||||
float* inputVectorPtr = (float*)inputVector;
|
||||
int16_t* outputVectorPtr = (int16_t*)outputVector;
|
||||
float aux;
|
||||
unsigned int i;
|
||||
const float min_val = (float)SHRT_MIN; ///todo Something off here, compiler does not perform right cast
|
||||
const float max_val = (float)SHRT_MAX;
|
||||
|
||||
__m256 inputVal1, inputVal2;
|
||||
__m256i intInputVal1, intInputVal2;
|
||||
__m256 ret1, ret2;
|
||||
const __m256 vmin_val = _mm256_set1_ps(min_val);
|
||||
const __m256 vmax_val = _mm256_set1_ps(max_val);
|
||||
|
||||
for(i = 0; i < avx2_iters; i++)
|
||||
{
|
||||
inputVal1 = _mm256_load_ps((float*)inputVectorPtr); inputVectorPtr += 8;
|
||||
inputVal2 = _mm256_load_ps((float*)inputVectorPtr); inputVectorPtr += 8;
|
||||
__VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 16);
|
||||
|
||||
// Clip
|
||||
ret1 = _mm256_max_ps(_mm256_min_ps(inputVal1, vmax_val), vmin_val);
|
||||
ret2 = _mm256_max_ps(_mm256_min_ps(inputVal2, vmax_val), vmin_val);
|
||||
|
||||
intInputVal1 = _mm256_cvtps_epi32(ret1);
|
||||
intInputVal2 = _mm256_cvtps_epi32(ret2);
|
||||
|
||||
intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
|
||||
intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
|
||||
|
||||
_mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1);
|
||||
outputVectorPtr += 16;
|
||||
}
|
||||
|
||||
for(i = avx2_iters * 16; i < num_points * 2; i++)
|
||||
{
|
||||
aux = *inputVectorPtr++;
|
||||
if(aux > max_val)
|
||||
aux = max_val;
|
||||
else if(aux < min_val)
|
||||
aux = min_val;
|
||||
*outputVectorPtr++ = (int16_t)rintf(aux);
|
||||
}
|
||||
}
|
||||
#endif /* LV_HAVE_AVX2 */
|
||||
|
||||
|
||||
#ifdef LV_HAVE_NEON
|
||||
#include <arm_neon.h>
|
||||
|
||||
|
@ -85,6 +85,146 @@ static inline void volk_gnsssdr_32fc_convert_8ic_generic(lv_8sc_t* outputVector,
|
||||
#endif /* LV_HAVE_GENERIC */
|
||||
|
||||
|
||||
#ifdef LV_HAVE_AVX2
|
||||
#include <immintrin.h>
|
||||
|
||||
static inline void volk_gnsssdr_32fc_convert_8ic_u_avx2(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
|
||||
{
|
||||
const unsigned int avx2_iters = num_points / 16;
|
||||
|
||||
float* inputVectorPtr = (float*)inputVector;
|
||||
int8_t* outputVectorPtr = (int8_t*)outputVector;
|
||||
|
||||
const float min_val = (float)SCHAR_MIN;
|
||||
const float max_val = (float)SCHAR_MAX;
|
||||
float aux;
|
||||
unsigned int i;
|
||||
|
||||
__m256 inputVal1, inputVal2, inputVal3, inputVal4;
|
||||
__m256i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
|
||||
__m256i int8InputVal;
|
||||
__m256 ret1, ret2, ret3, ret4;
|
||||
const __m256 vmin_val = _mm256_set1_ps(min_val);
|
||||
const __m256 vmax_val = _mm256_set1_ps(max_val);
|
||||
|
||||
for(i = 0; i < avx2_iters; i++)
|
||||
{
|
||||
inputVal1 = _mm256_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 8;
|
||||
inputVal2 = _mm256_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 8;
|
||||
inputVal3 = _mm256_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 8;
|
||||
inputVal4 = _mm256_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 8;
|
||||
__VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 32);
|
||||
|
||||
inputVal1 = _mm256_mul_ps(inputVal1, vmax_val);
|
||||
inputVal2 = _mm256_mul_ps(inputVal2, vmax_val);
|
||||
inputVal3 = _mm256_mul_ps(inputVal3, vmax_val);
|
||||
inputVal4 = _mm256_mul_ps(inputVal4, vmax_val);
|
||||
|
||||
// Clip
|
||||
ret1 = _mm256_max_ps(_mm256_min_ps(inputVal1, vmax_val), vmin_val);
|
||||
ret2 = _mm256_max_ps(_mm256_min_ps(inputVal2, vmax_val), vmin_val);
|
||||
ret3 = _mm256_max_ps(_mm256_min_ps(inputVal3, vmax_val), vmin_val);
|
||||
ret4 = _mm256_max_ps(_mm256_min_ps(inputVal4, vmax_val), vmin_val);
|
||||
|
||||
intInputVal1 = _mm256_cvtps_epi32(ret1);
|
||||
intInputVal2 = _mm256_cvtps_epi32(ret2);
|
||||
intInputVal3 = _mm256_cvtps_epi32(ret3);
|
||||
intInputVal4 = _mm256_cvtps_epi32(ret4);
|
||||
|
||||
intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
|
||||
intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
|
||||
intInputVal2 = _mm256_packs_epi32(intInputVal3, intInputVal4);
|
||||
intInputVal2 = _mm256_permute4x64_epi64(intInputVal2, 0b11011000);
|
||||
int8InputVal = _mm256_packs_epi16(intInputVal1, intInputVal2);
|
||||
int8InputVal = _mm256_permute4x64_epi64(int8InputVal, 0b11011000);
|
||||
|
||||
_mm256_storeu_si256((__m256i*)outputVectorPtr, int8InputVal);
|
||||
outputVectorPtr += 32;
|
||||
}
|
||||
|
||||
for(i = avx2_iters * 32; i < num_points * 2; i++)
|
||||
{
|
||||
aux = *inputVectorPtr++ * max_val;
|
||||
if(aux > max_val)
|
||||
aux = max_val;
|
||||
else if(aux < min_val)
|
||||
aux = min_val;
|
||||
*outputVectorPtr++ = (int8_t)rintf(aux);
|
||||
}
|
||||
}
|
||||
#endif /* LV_HAVE_AVX2 */
|
||||
|
||||
|
||||
#ifdef LV_HAVE_AVX2
|
||||
#include <immintrin.h>
|
||||
|
||||
static inline void volk_gnsssdr_32fc_convert_8ic_a_avx2(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
|
||||
{
|
||||
const unsigned int avx2_iters = num_points / 16;
|
||||
|
||||
float* inputVectorPtr = (float*)inputVector;
|
||||
int8_t* outputVectorPtr = (int8_t*)outputVector;
|
||||
|
||||
const float min_val = (float)SCHAR_MIN;
|
||||
const float max_val = (float)SCHAR_MAX;
|
||||
float aux;
|
||||
unsigned int i;
|
||||
|
||||
__m256 inputVal1, inputVal2, inputVal3, inputVal4;
|
||||
__m256i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
|
||||
__m256i int8InputVal;
|
||||
__m256 ret1, ret2, ret3, ret4;
|
||||
const __m256 vmin_val = _mm256_set1_ps(min_val);
|
||||
const __m256 vmax_val = _mm256_set1_ps(max_val);
|
||||
|
||||
for(i = 0; i < avx2_iters; i++)
|
||||
{
|
||||
inputVal1 = _mm256_load_ps((float*)inputVectorPtr); inputVectorPtr += 8;
|
||||
inputVal2 = _mm256_load_ps((float*)inputVectorPtr); inputVectorPtr += 8;
|
||||
inputVal3 = _mm256_load_ps((float*)inputVectorPtr); inputVectorPtr += 8;
|
||||
inputVal4 = _mm256_load_ps((float*)inputVectorPtr); inputVectorPtr += 8;
|
||||
__VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 32);
|
||||
|
||||
inputVal1 = _mm256_mul_ps(inputVal1, vmax_val);
|
||||
inputVal2 = _mm256_mul_ps(inputVal2, vmax_val);
|
||||
inputVal3 = _mm256_mul_ps(inputVal3, vmax_val);
|
||||
inputVal4 = _mm256_mul_ps(inputVal4, vmax_val);
|
||||
|
||||
// Clip
|
||||
ret1 = _mm256_max_ps(_mm256_min_ps(inputVal1, vmax_val), vmin_val);
|
||||
ret2 = _mm256_max_ps(_mm256_min_ps(inputVal2, vmax_val), vmin_val);
|
||||
ret3 = _mm256_max_ps(_mm256_min_ps(inputVal3, vmax_val), vmin_val);
|
||||
ret4 = _mm256_max_ps(_mm256_min_ps(inputVal4, vmax_val), vmin_val);
|
||||
|
||||
intInputVal1 = _mm256_cvtps_epi32(ret1);
|
||||
intInputVal2 = _mm256_cvtps_epi32(ret2);
|
||||
intInputVal3 = _mm256_cvtps_epi32(ret3);
|
||||
intInputVal4 = _mm256_cvtps_epi32(ret4);
|
||||
|
||||
intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
|
||||
intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
|
||||
intInputVal2 = _mm256_packs_epi32(intInputVal3, intInputVal4);
|
||||
intInputVal2 = _mm256_permute4x64_epi64(intInputVal2, 0b11011000);
|
||||
int8InputVal = _mm256_packs_epi16(intInputVal1, intInputVal2);
|
||||
int8InputVal = _mm256_permute4x64_epi64(int8InputVal, 0b11011000);
|
||||
|
||||
_mm256_store_si256((__m256i*)outputVectorPtr, int8InputVal);
|
||||
outputVectorPtr += 32;
|
||||
}
|
||||
|
||||
for(i = avx2_iters * 32; i < num_points * 2; i++)
|
||||
{
|
||||
aux = *inputVectorPtr++ * max_val;
|
||||
if(aux > max_val)
|
||||
aux = max_val;
|
||||
else if(aux < min_val)
|
||||
aux = min_val;
|
||||
*outputVectorPtr++ = (int8_t)rintf(aux);
|
||||
}
|
||||
}
|
||||
#endif /* LV_HAVE_AVX2 */
|
||||
|
||||
|
||||
#ifdef LV_HAVE_SSE2
|
||||
#include <emmintrin.h>
|
||||
|
||||
|
@ -71,7 +71,7 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_generic(lv_32fc_t* r
|
||||
}
|
||||
|
||||
#endif /* LV_HAVE_GENERIC */
|
||||
|
||||
|
||||
|
||||
#ifdef LV_HAVE_SSE3
|
||||
static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_a_sse3(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points)
|
||||
@ -248,6 +248,64 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_u_avx(lv_32fc_t* res
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef LV_HAVE_AVX2
|
||||
static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_a_avx2(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points)
|
||||
{
|
||||
float code_phase_step_chips = -0.6;
|
||||
int code_length_chips = 1023;
|
||||
int num_out_vectors = 3;
|
||||
float rem_code_phase_chips = -0.234;
|
||||
unsigned int n;
|
||||
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
|
||||
|
||||
lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
|
||||
for(n = 0; n < num_out_vectors; n++)
|
||||
{
|
||||
result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
|
||||
}
|
||||
|
||||
volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx2(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
|
||||
|
||||
memcpy((lv_32fc_t*)result, (lv_32fc_t*)result_aux[0], sizeof(lv_32fc_t) * num_points);
|
||||
|
||||
for(n = 0; n < num_out_vectors; n++)
|
||||
{
|
||||
volk_gnsssdr_free(result_aux[n]);
|
||||
}
|
||||
volk_gnsssdr_free(result_aux);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef LV_HAVE_AVX2
|
||||
static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_u_avx2(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points)
|
||||
{
|
||||
float code_phase_step_chips = -0.6;
|
||||
int code_length_chips = 1023;
|
||||
int num_out_vectors = 3;
|
||||
float rem_code_phase_chips = -0.234;
|
||||
unsigned int n;
|
||||
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
|
||||
|
||||
lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
|
||||
for(n = 0; n < num_out_vectors; n++)
|
||||
{
|
||||
result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
|
||||
}
|
||||
|
||||
volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx2(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
|
||||
|
||||
memcpy((lv_32fc_t*)result, (lv_32fc_t*)result_aux[0], sizeof(lv_32fc_t) * num_points);
|
||||
|
||||
for(n = 0; n < num_out_vectors; n++)
|
||||
{
|
||||
volk_gnsssdr_free(result_aux[n]);
|
||||
}
|
||||
volk_gnsssdr_free(result_aux);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef LV_HAVE_NEON
|
||||
static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_neon(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points)
|
||||
{
|
||||
|
@ -160,7 +160,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_sse3(lv_32fc_t** res
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef LV_HAVE_SSE3
|
||||
@ -295,7 +295,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_sse4_1(lv_32fc_t** r
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef LV_HAVE_SSE4_1
|
||||
@ -518,6 +518,162 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx(lv_32fc_t** resu
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef LV_HAVE_AVX2
|
||||
#include <immintrin.h>
|
||||
static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx2(lv_32fc_t** result, const lv_32fc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, float* shifts_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_points)
|
||||
{
|
||||
lv_32fc_t** _result = result;
|
||||
const unsigned int avx_iters = num_points / 8;
|
||||
int current_correlator_tap;
|
||||
unsigned int n;
|
||||
unsigned int k;
|
||||
const __m256 eights = _mm256_set1_ps(8.0f);
|
||||
const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips);
|
||||
const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips);
|
||||
|
||||
__VOLK_ATTR_ALIGNED(32) int local_code_chip_index[8];
|
||||
int local_code_chip_index_;
|
||||
|
||||
const __m256 zeros = _mm256_setzero_ps();
|
||||
const __m256 code_length_chips_reg_f = _mm256_set1_ps((float)code_length_chips);
|
||||
const __m256 n0 = _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f);
|
||||
|
||||
__m256i local_code_chip_index_reg, i;
|
||||
__m256 aux, aux2, aux3, shifts_chips_reg, c, cTrunc, base, negatives, indexn;
|
||||
|
||||
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
|
||||
{
|
||||
shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]);
|
||||
aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
|
||||
indexn = n0;
|
||||
for(n = 0; n < avx_iters; n++)
|
||||
{
|
||||
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0);
|
||||
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3);
|
||||
aux = _mm256_fmadd_ps(code_phase_step_chips_reg, indexn, aux2);
|
||||
//aux = _mm256_add_ps(aux, aux2);
|
||||
// floor
|
||||
aux = _mm256_floor_ps(aux);
|
||||
|
||||
// fmod
|
||||
c = _mm256_div_ps(aux, code_length_chips_reg_f);
|
||||
//_mm_fmsub_ps(c, code_length_chips_reg_f, aux)
|
||||
i = _mm256_cvttps_epi32(c);
|
||||
cTrunc = _mm256_cvtepi32_ps(i);
|
||||
base = _mm256_fnmadd_ps(cTrunc, code_length_chips_reg_f, aux);
|
||||
local_code_chip_index_reg = _mm256_cvttps_epi32(base);
|
||||
|
||||
// no negatives
|
||||
c = _mm256_cvtepi32_ps(local_code_chip_index_reg);
|
||||
negatives = _mm256_cmp_ps(c, zeros, 0x01 );
|
||||
aux3 = _mm256_and_ps(code_length_chips_reg_f, negatives);
|
||||
aux = _mm256_add_ps(c, aux3);
|
||||
local_code_chip_index_reg = _mm256_cvttps_epi32(aux);
|
||||
|
||||
_mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg);
|
||||
for(k = 0; k < 8; ++k)
|
||||
{
|
||||
_result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]];
|
||||
}
|
||||
indexn = _mm256_add_ps(indexn, eights);
|
||||
}
|
||||
}
|
||||
_mm256_zeroupper();
|
||||
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
|
||||
{
|
||||
for(n = avx_iters * 8; n < num_points; n++)
|
||||
{
|
||||
// resample code for current tap
|
||||
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
|
||||
//Take into account that in multitap correlators, the shifts can be negative!
|
||||
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1) ;
|
||||
local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
|
||||
_result[current_correlator_tap][n] = local_code[local_code_chip_index_];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef LV_HAVE_AVX2
|
||||
#include <immintrin.h>
|
||||
static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx2(lv_32fc_t** result, const lv_32fc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, float* shifts_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_points)
|
||||
{
|
||||
lv_32fc_t** _result = result;
|
||||
const unsigned int avx_iters = num_points / 8;
|
||||
int current_correlator_tap;
|
||||
unsigned int n;
|
||||
unsigned int k;
|
||||
const __m256 eights = _mm256_set1_ps(8.0f);
|
||||
const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips);
|
||||
const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips);
|
||||
|
||||
__VOLK_ATTR_ALIGNED(32) int local_code_chip_index[8];
|
||||
int local_code_chip_index_;
|
||||
|
||||
const __m256 zeros = _mm256_setzero_ps();
|
||||
const __m256 code_length_chips_reg_f = _mm256_set1_ps((float)code_length_chips);
|
||||
const __m256 n0 = _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f);
|
||||
|
||||
__m256i local_code_chip_index_reg, i;
|
||||
__m256 aux, aux2, aux3, shifts_chips_reg, c, cTrunc, base, negatives, indexn;
|
||||
|
||||
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
|
||||
{
|
||||
shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]);
|
||||
aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
|
||||
indexn = n0;
|
||||
for(n = 0; n < avx_iters; n++)
|
||||
{
|
||||
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0);
|
||||
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3);
|
||||
aux = _mm256_fmadd_ps(code_phase_step_chips_reg, indexn, aux2);
|
||||
//aux = _mm256_mul_ps(code_phase_step_chips_reg, indexn);
|
||||
//aux = _mm256_add_ps(aux, aux2);
|
||||
// floor
|
||||
aux = _mm256_floor_ps(aux);
|
||||
|
||||
// fmod
|
||||
c = _mm256_div_ps(aux, code_length_chips_reg_f);
|
||||
i = _mm256_cvttps_epi32(c);
|
||||
cTrunc = _mm256_cvtepi32_ps(i);
|
||||
base = _mm256_fnmadd_ps(cTrunc, code_length_chips_reg_f, aux);
|
||||
local_code_chip_index_reg = _mm256_cvttps_epi32(base);
|
||||
|
||||
// no negatives
|
||||
c = _mm256_cvtepi32_ps(local_code_chip_index_reg);
|
||||
negatives = _mm256_cmp_ps(c, zeros, 0x01 );
|
||||
aux3 = _mm256_and_ps(code_length_chips_reg_f, negatives);
|
||||
aux = _mm256_add_ps(c, aux3);
|
||||
local_code_chip_index_reg = _mm256_cvttps_epi32(aux);
|
||||
|
||||
_mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg);
|
||||
for(k = 0; k < 8; ++k)
|
||||
{
|
||||
_result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]];
|
||||
}
|
||||
indexn = _mm256_add_ps(indexn, eights);
|
||||
}
|
||||
}
|
||||
_mm256_zeroupper();
|
||||
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
|
||||
{
|
||||
for(n = avx_iters * 8; n < num_points; n++)
|
||||
{
|
||||
// resample code for current tap
|
||||
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
|
||||
//Take into account that in multitap correlators, the shifts can be negative!
|
||||
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1) ;
|
||||
local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
|
||||
_result[current_correlator_tap][n] = local_code[local_code_chip_index_];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef LV_HAVE_NEON
|
||||
#include <arm_neon.h>
|
||||
|
||||
@ -604,4 +760,3 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_neon(lv_32fc_t** resul
|
||||
|
||||
|
||||
#endif /*INCLUDED_volk_gnsssdr_32fc_xn_resampler_32fc_xn_H*/
|
||||
|
||||
|
@ -152,6 +152,83 @@ static inline void volk_gnsssdr_8i_accumulator_s8i_a_sse3(char* result, const ch
|
||||
#endif /* LV_HAVE_SSE3 */
|
||||
|
||||
|
||||
#ifdef LV_HAVE_AVX2
|
||||
#include <immintrin.h>
|
||||
|
||||
static inline void volk_gnsssdr_8i_accumulator_s8i_a_avx2(char* result, const char* inputBuffer, unsigned int num_points)
|
||||
{
|
||||
char returnValue = 0;
|
||||
const unsigned int sse_iters = num_points / 32;
|
||||
unsigned int number;
|
||||
unsigned int i;
|
||||
|
||||
const char* aPtr = inputBuffer;
|
||||
|
||||
__VOLK_ATTR_ALIGNED(32) char tempBuffer[32];
|
||||
__m256i accumulator = _mm256_setzero_si256();
|
||||
__m256i aVal = _mm256_setzero_si256();
|
||||
|
||||
for(number = 0; number < sse_iters; number++)
|
||||
{
|
||||
aVal = _mm256_load_si256((__m256i*)aPtr);
|
||||
accumulator = _mm256_add_epi8(accumulator, aVal);
|
||||
aPtr += 32;
|
||||
}
|
||||
_mm256_store_si256((__m256i*)tempBuffer,accumulator);
|
||||
|
||||
for(i = 0; i < 32; ++i)
|
||||
{
|
||||
returnValue += tempBuffer[i];
|
||||
}
|
||||
|
||||
for(i = 0; i < (num_points % 32); ++i)
|
||||
{
|
||||
returnValue += (*aPtr++);
|
||||
}
|
||||
|
||||
*result = returnValue;
|
||||
}
|
||||
#endif /* LV_HAVE_SSE3 */
|
||||
|
||||
|
||||
#ifdef LV_HAVE_AVX2
|
||||
#include <pmmintrin.h>
|
||||
|
||||
static inline void volk_gnsssdr_8i_accumulator_s8i_u_avx2(char* result, const char* inputBuffer, unsigned int num_points)
|
||||
{
|
||||
char returnValue = 0;
|
||||
const unsigned int sse_iters = num_points / 32;
|
||||
unsigned int number;
|
||||
unsigned int i;
|
||||
const char* aPtr = inputBuffer;
|
||||
|
||||
__VOLK_ATTR_ALIGNED(32) char tempBuffer[32];
|
||||
__m256i accumulator = _mm256_setzero_si256();
|
||||
__m256i aVal = _mm256_setzero_si256();
|
||||
|
||||
for(number = 0; number < sse_iters; number++)
|
||||
{
|
||||
aVal = _mm256_lddqu_si256((__m256i*)aPtr);
|
||||
accumulator = _mm256_add_epi8(accumulator, aVal);
|
||||
aPtr += 32;
|
||||
}
|
||||
_mm256_storeu_si256((__m256i*)tempBuffer, accumulator);
|
||||
|
||||
for(i = 0; i < 32; ++i)
|
||||
{
|
||||
returnValue += tempBuffer[i];
|
||||
}
|
||||
|
||||
for(i = 0; i < (num_points % 32); ++i)
|
||||
{
|
||||
returnValue += (*aPtr++);
|
||||
}
|
||||
|
||||
*result = returnValue;
|
||||
}
|
||||
#endif /* LV_HAVE_SSE3 */
|
||||
|
||||
|
||||
#ifdef LV_HAVE_ORC
|
||||
|
||||
extern void volk_gnsssdr_8i_accumulator_s8i_a_orc_impl(short* result, const char* inputBuffer, unsigned int num_points);
|
||||
@ -169,4 +246,3 @@ static inline void volk_gnsssdr_8i_accumulator_s8i_u_orc(char* result, const cha
|
||||
#endif /* LV_HAVE_ORC */
|
||||
|
||||
#endif /* INCLUDED_volk_gnsssdr_8i_accumulator_s8i_H */
|
||||
|
||||
|
@ -58,6 +58,71 @@
|
||||
|
||||
#include <volk_gnsssdr/volk_gnsssdr_common.h>
|
||||
|
||||
|
||||
#ifdef LV_HAVE_AVX2
|
||||
#include<immintrin.h>
|
||||
|
||||
static inline void volk_gnsssdr_8i_index_max_16u_u_avx2(unsigned int* target, const char* src0, unsigned int num_points)
|
||||
{
|
||||
if(num_points > 0)
|
||||
{
|
||||
const unsigned int avx2_iters = num_points / 32;
|
||||
unsigned int number;
|
||||
unsigned int i;
|
||||
char* basePtr = (char*)src0;
|
||||
char* inputPtr = (char*)src0;
|
||||
char max = src0[0];
|
||||
unsigned int index = 0;
|
||||
unsigned int mask;
|
||||
__VOLK_ATTR_ALIGNED(32) char currentValuesBuffer[32];
|
||||
__m256i maxValues, compareResults, currentValues;
|
||||
|
||||
maxValues = _mm256_set1_epi8(max);
|
||||
|
||||
for(number = 0; number < avx2_iters; number++)
|
||||
{
|
||||
currentValues = _mm256_loadu_si256((__m256i*)inputPtr);
|
||||
compareResults = _mm256_cmpgt_epi8(maxValues, currentValues);
|
||||
mask = _mm256_movemask_epi8(compareResults);
|
||||
|
||||
if (mask != 0xFFFFFFFF)
|
||||
{
|
||||
_mm256_storeu_si256((__m256i*)¤tValuesBuffer, currentValues);
|
||||
mask = ~mask;
|
||||
i = 0;
|
||||
while (mask > 0)
|
||||
{
|
||||
if ((mask & 1) == 1)
|
||||
{
|
||||
if(currentValuesBuffer[i] > max)
|
||||
{
|
||||
index = inputPtr - basePtr + i;
|
||||
max = currentValuesBuffer[i];
|
||||
}
|
||||
}
|
||||
i++;
|
||||
mask >>= 1;
|
||||
}
|
||||
maxValues = _mm256_set1_epi8(max);
|
||||
}
|
||||
inputPtr += 32;
|
||||
}
|
||||
|
||||
for(i = 0; i<(num_points % 32); ++i)
|
||||
{
|
||||
if(src0[i] > max)
|
||||
{
|
||||
index = i;
|
||||
max = src0[i];
|
||||
}
|
||||
}
|
||||
target[0] = index;
|
||||
}
|
||||
}
|
||||
|
||||
#endif /*LV_HAVE_AVX2*/
|
||||
|
||||
|
||||
#ifdef LV_HAVE_AVX
|
||||
#include <immintrin.h>
|
||||
|
||||
@ -271,6 +336,70 @@ static inline void volk_gnsssdr_8i_index_max_16u_generic(unsigned int* target, c
|
||||
#endif /*LV_HAVE_GENERIC*/
|
||||
|
||||
|
||||
#ifdef LV_HAVE_AVX2
|
||||
#include<immintrin.h>
|
||||
|
||||
static inline void volk_gnsssdr_8i_index_max_16u_a_avx2(unsigned int* target, const char* src0, unsigned int num_points)
|
||||
{
|
||||
if(num_points > 0)
|
||||
{
|
||||
const unsigned int avx2_iters = num_points / 32;
|
||||
unsigned int number;
|
||||
unsigned int i;
|
||||
char* basePtr = (char*)src0;
|
||||
char* inputPtr = (char*)src0;
|
||||
char max = src0[0];
|
||||
unsigned int index = 0;
|
||||
unsigned int mask;
|
||||
__VOLK_ATTR_ALIGNED(32) char currentValuesBuffer[32];
|
||||
__m256i maxValues, compareResults, currentValues;
|
||||
|
||||
maxValues = _mm256_set1_epi8(max);
|
||||
|
||||
for(number = 0; number < avx2_iters; number++)
|
||||
{
|
||||
currentValues = _mm256_load_si256((__m256i*)inputPtr);
|
||||
compareResults = _mm256_cmpgt_epi8(maxValues, currentValues);
|
||||
mask = _mm256_movemask_epi8(compareResults);
|
||||
|
||||
if (mask != 0xFFFFFFFF)
|
||||
{
|
||||
_mm256_store_si256((__m256i*)¤tValuesBuffer, currentValues);
|
||||
mask = ~mask;
|
||||
i = 0;
|
||||
while (mask > 0)
|
||||
{
|
||||
if ((mask & 1) == 1)
|
||||
{
|
||||
if(currentValuesBuffer[i] > max)
|
||||
{
|
||||
index = inputPtr - basePtr + i;
|
||||
max = currentValuesBuffer[i];
|
||||
}
|
||||
}
|
||||
i++;
|
||||
mask >>= 1;
|
||||
}
|
||||
maxValues = _mm256_set1_epi8(max);
|
||||
}
|
||||
inputPtr += 32;
|
||||
}
|
||||
|
||||
for(i = 0; i<(num_points % 32); ++i)
|
||||
{
|
||||
if(src0[i] > max)
|
||||
{
|
||||
index = i;
|
||||
max = src0[i];
|
||||
}
|
||||
}
|
||||
target[0] = index;
|
||||
}
|
||||
}
|
||||
|
||||
#endif /*LV_HAVE_AVX2*/
|
||||
|
||||
|
||||
#ifdef LV_HAVE_AVX
|
||||
#include <immintrin.h>
|
||||
|
||||
|
@ -58,6 +58,55 @@
|
||||
|
||||
#include <volk_gnsssdr/volk_gnsssdr_common.h>
|
||||
|
||||
#ifdef LV_HAVE_AVX2
|
||||
#include <immintrin.h>
|
||||
|
||||
static inline void volk_gnsssdr_8i_max_s8i_u_avx2(char* target, const char* src0, unsigned int num_points)
|
||||
{
|
||||
if(num_points > 0)
|
||||
{
|
||||
const unsigned int avx_iters = num_points / 32;
|
||||
unsigned int number;
|
||||
unsigned int i;
|
||||
char* inputPtr = (char*)src0;
|
||||
char max = src0[0];
|
||||
__VOLK_ATTR_ALIGNED(32) char maxValuesBuffer[32];
|
||||
__m256i maxValues, compareResults, currentValues;
|
||||
|
||||
maxValues = _mm256_set1_epi8(max);
|
||||
|
||||
for(number = 0; number < avx_iters; number++)
|
||||
{
|
||||
currentValues = _mm256_loadu_si256((__m256i*)inputPtr);
|
||||
compareResults = _mm256_max_epi8(maxValues, currentValues);
|
||||
maxValues = compareResults;
|
||||
inputPtr += 32;
|
||||
}
|
||||
|
||||
_mm256_storeu_si256((__m256i*)maxValuesBuffer, maxValues);
|
||||
|
||||
for(i = 0; i < 32; ++i)
|
||||
{
|
||||
if(maxValuesBuffer[i] > max)
|
||||
{
|
||||
max = maxValuesBuffer[i];
|
||||
}
|
||||
}
|
||||
|
||||
for(i = avx_iters * 32; i < num_points; ++i)
|
||||
{
|
||||
if(src0[i] > max)
|
||||
{
|
||||
max = src0[i];
|
||||
}
|
||||
}
|
||||
target[0] = max;
|
||||
}
|
||||
}
|
||||
|
||||
#endif /*LV_HAVE_SSE4_1*/
|
||||
|
||||
|
||||
#ifdef LV_HAVE_SSE4_1
|
||||
#include <smmintrin.h>
|
||||
|
||||
@ -238,6 +287,55 @@ static inline void volk_gnsssdr_8i_max_s8i_a_sse4_1(char* target, const char* sr
|
||||
#endif /*LV_HAVE_SSE4_1*/
|
||||
|
||||
|
||||
#ifdef LV_HAVE_AVX2
|
||||
#include <immintrin.h>
|
||||
|
||||
static inline void volk_gnsssdr_8i_max_s8i_a_avx2(char* target, const char* src0, unsigned int num_points)
|
||||
{
|
||||
if(num_points > 0)
|
||||
{
|
||||
const unsigned int avx_iters = num_points / 32;
|
||||
unsigned int number;
|
||||
unsigned int i;
|
||||
char* inputPtr = (char*)src0;
|
||||
char max = src0[0];
|
||||
__VOLK_ATTR_ALIGNED(32) char maxValuesBuffer[32];
|
||||
__m256i maxValues, compareResults, currentValues;
|
||||
|
||||
maxValues = _mm256_set1_epi8(max);
|
||||
|
||||
for(number = 0; number < avx_iters; number++)
|
||||
{
|
||||
currentValues = _mm256_load_si256((__m256i*)inputPtr);
|
||||
compareResults = _mm256_max_epi8(maxValues, currentValues);
|
||||
maxValues = compareResults; //_mm256_blendv_epi8(currentValues, maxValues, compareResults);
|
||||
inputPtr += 32;
|
||||
}
|
||||
|
||||
_mm256_store_si256((__m256i*)maxValuesBuffer, maxValues);
|
||||
|
||||
for(i = 0; i < 32; ++i)
|
||||
{
|
||||
if(maxValuesBuffer[i] > max)
|
||||
{
|
||||
max = maxValuesBuffer[i];
|
||||
}
|
||||
}
|
||||
|
||||
for(i = avx_iters * 32; i < num_points; ++i)
|
||||
{
|
||||
if(src0[i] > max)
|
||||
{
|
||||
max = src0[i];
|
||||
}
|
||||
}
|
||||
target[0] = max;
|
||||
}
|
||||
}
|
||||
|
||||
#endif /*LV_HAVE_SSE4_1*/
|
||||
|
||||
|
||||
#ifdef LV_HAVE_SSE2
|
||||
#include <emmintrin.h>
|
||||
|
||||
|
@ -94,6 +94,42 @@ static inline void volk_gnsssdr_8i_x2_add_8i_u_sse2(char* cVector, const char* a
|
||||
#endif /* LV_HAVE_SSE2 */
|
||||
|
||||
|
||||
#ifdef LV_HAVE_AVX2
|
||||
#include <immintrin.h>
|
||||
|
||||
static inline void volk_gnsssdr_8i_x2_add_8i_u_avx2(char* cVector, const char* aVector, const char* bVector, unsigned int num_points)
|
||||
{
|
||||
const unsigned int avx_iters = num_points / 32;
|
||||
unsigned int number;
|
||||
unsigned int i;
|
||||
char* cPtr = cVector;
|
||||
const char* aPtr = aVector;
|
||||
const char* bPtr = bVector;
|
||||
|
||||
__m256i aVal, bVal, cVal;
|
||||
|
||||
for(number = 0; number < avx_iters; number++)
|
||||
{
|
||||
aVal = _mm256_loadu_si256((__m256i*)aPtr);
|
||||
bVal = _mm256_loadu_si256((__m256i*)bPtr);
|
||||
|
||||
cVal = _mm256_add_epi8(aVal, bVal);
|
||||
|
||||
_mm256_storeu_si256((__m256i*)cPtr, cVal); // Store the results back into the C container
|
||||
|
||||
aPtr += 32;
|
||||
bPtr += 32;
|
||||
cPtr += 32;
|
||||
}
|
||||
|
||||
for(i = avx_iters * 32; i < num_points; ++i)
|
||||
{
|
||||
*cPtr++ = (*aPtr++) + (*bPtr++);
|
||||
}
|
||||
}
|
||||
#endif /* LV_HAVE_SSE2 */
|
||||
|
||||
|
||||
#ifdef LV_HAVE_GENERIC
|
||||
|
||||
static inline void volk_gnsssdr_8i_x2_add_8i_generic(char* cVector, const char* aVector, const char* bVector, unsigned int num_points)
|
||||
@ -147,6 +183,42 @@ static inline void volk_gnsssdr_8i_x2_add_8i_a_sse2(char* cVector, const char* a
|
||||
#endif /* LV_HAVE_SSE2 */
|
||||
|
||||
|
||||
#ifdef LV_HAVE_AVX2
|
||||
#include <immintrin.h>
|
||||
|
||||
static inline void volk_gnsssdr_8i_x2_add_8i_a_avx2(char* cVector, const char* aVector, const char* bVector, unsigned int num_points)
|
||||
{
|
||||
const unsigned int avx_iters = num_points / 32;
|
||||
unsigned int number;
|
||||
unsigned int i;
|
||||
char* cPtr = cVector;
|
||||
const char* aPtr = aVector;
|
||||
const char* bPtr = bVector;
|
||||
|
||||
__m256i aVal, bVal, cVal;
|
||||
|
||||
for(number = 0; number < avx_iters; number++)
|
||||
{
|
||||
aVal = _mm256_load_si256((__m256i*)aPtr);
|
||||
bVal = _mm256_load_si256((__m256i*)bPtr);
|
||||
|
||||
cVal = _mm256_add_epi8(aVal, bVal);
|
||||
|
||||
_mm256_store_si256((__m256i*)cPtr, cVal); // Store the results back into the C container
|
||||
|
||||
aPtr += 32;
|
||||
bPtr += 32;
|
||||
cPtr += 32;
|
||||
}
|
||||
|
||||
for(i = avx_iters * 32; i < num_points; ++i)
|
||||
{
|
||||
*cPtr++ = (*aPtr++) + (*bPtr++);
|
||||
}
|
||||
}
|
||||
#endif /* LV_HAVE_SSE2 */
|
||||
|
||||
|
||||
#ifdef LV_HAVE_ORC
|
||||
|
||||
extern void volk_gnsssdr_8i_x2_add_8i_a_orc_impl(char* cVector, const char* aVector, const char* bVector, unsigned int num_points);
|
||||
|
@ -59,6 +59,37 @@
|
||||
|
||||
#include <volk_gnsssdr/volk_gnsssdr_complex.h>
|
||||
|
||||
#ifdef LV_HAVE_AVX2
|
||||
#include <immintrin.h>
|
||||
|
||||
static inline void volk_gnsssdr_8ic_conjugate_8ic_u_avx2(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points)
|
||||
{
|
||||
const unsigned int avx2_iters = num_points / 16;
|
||||
unsigned int i;
|
||||
lv_8sc_t* c = cVector;
|
||||
const lv_8sc_t* a = aVector;
|
||||
|
||||
__m256i tmp;
|
||||
__m256i conjugator = _mm256_setr_epi8(1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1);
|
||||
|
||||
for (i = 0; i < avx2_iters; ++i)
|
||||
{
|
||||
tmp = _mm256_loadu_si256((__m256i*)a);
|
||||
tmp = _mm256_sign_epi8(tmp, conjugator);
|
||||
_mm256_storeu_si256((__m256i*)c, tmp);
|
||||
|
||||
a += 16;
|
||||
c += 16;
|
||||
}
|
||||
|
||||
for (i = avx2_iters * 16; i < num_points; ++i)
|
||||
{
|
||||
*c++ = lv_conj(*a++);
|
||||
}
|
||||
}
|
||||
#endif /* LV_HAVE_AVX2 */
|
||||
|
||||
|
||||
#ifdef LV_HAVE_AVX
|
||||
#include <immintrin.h>
|
||||
|
||||
@ -217,6 +248,37 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_a_avx(lv_8sc_t* cVector, const
|
||||
#endif /* LV_HAVE_AVX */
|
||||
|
||||
|
||||
#ifdef LV_HAVE_AVX2
|
||||
#include <immintrin.h>
|
||||
|
||||
static inline void volk_gnsssdr_8ic_conjugate_8ic_a_avx2(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points)
|
||||
{
|
||||
const unsigned int avx2_iters = num_points / 16;
|
||||
unsigned int i;
|
||||
lv_8sc_t* c = cVector;
|
||||
const lv_8sc_t* a = aVector;
|
||||
|
||||
__m256i tmp;
|
||||
__m256i conjugator = _mm256_setr_epi8(1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1);
|
||||
|
||||
for (i = 0; i < avx2_iters; ++i)
|
||||
{
|
||||
tmp = _mm256_load_si256((__m256i*)a);
|
||||
tmp = _mm256_sign_epi8(tmp, conjugator);
|
||||
_mm256_store_si256((__m256i*)c, tmp);
|
||||
|
||||
a += 16;
|
||||
c += 16;
|
||||
}
|
||||
|
||||
for (i = avx2_iters * 16; i < num_points; ++i)
|
||||
{
|
||||
*c++ = lv_conj(*a++);
|
||||
}
|
||||
}
|
||||
#endif /* LV_HAVE_AVX2 */
|
||||
|
||||
|
||||
#ifdef LV_HAVE_SSSE3
|
||||
#include <tmmintrin.h>
|
||||
|
||||
|
@ -58,6 +58,56 @@
|
||||
#ifndef INCLUDED_volk_gnsssdr_8u_x2_multiply_8u_H
|
||||
#define INCLUDED_volk_gnsssdr_8u_x2_multiply_8u_H
|
||||
|
||||
#ifdef LV_HAVE_AVX2
|
||||
#include <immintrin.h>
|
||||
|
||||
static inline void volk_gnsssdr_8u_x2_multiply_8u_u_avx2(unsigned char* cChar, const unsigned char* aChar, const unsigned char* bChar, unsigned int num_points)
|
||||
{
|
||||
const unsigned int avx2_iters = num_points / 32;
|
||||
unsigned int number;
|
||||
unsigned int i;
|
||||
|
||||
__m256i x, y, x1, x2, y1, y2, mult1, x1_mult_y1, x2_mult_y2, tmp, tmp1, tmp2, totalc;
|
||||
unsigned char* c = cChar;
|
||||
const unsigned char* a = aChar;
|
||||
const unsigned char* b = bChar;
|
||||
|
||||
for(number = 0; number < avx2_iters; number++)
|
||||
{
|
||||
x = _mm256_loadu_si256((__m256i*)a);
|
||||
y = _mm256_loadu_si256((__m256i*)b);
|
||||
|
||||
mult1 = _mm256_set_epi8(0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF);
|
||||
x1 = _mm256_srli_si256(x, 1);
|
||||
x1 = _mm256_and_si256(x1, mult1);
|
||||
x2 = _mm256_and_si256(x, mult1);
|
||||
|
||||
y1 = _mm256_srli_si256(y, 1);
|
||||
y1 = _mm256_and_si256(y1, mult1);
|
||||
y2 = _mm256_and_si256(y, mult1);
|
||||
|
||||
x1_mult_y1 = _mm256_mullo_epi16(x1, y1);
|
||||
x2_mult_y2 = _mm256_mullo_epi16(x2, y2);
|
||||
|
||||
tmp = _mm256_and_si256(x1_mult_y1, mult1);
|
||||
tmp1 = _mm256_slli_si256(tmp, 1);
|
||||
tmp2 = _mm256_and_si256(x2_mult_y2, mult1);
|
||||
totalc = _mm256_or_si256(tmp1, tmp2);
|
||||
|
||||
_mm256_storeu_si256((__m256i*)c, totalc);
|
||||
|
||||
a += 32;
|
||||
b += 32;
|
||||
c += 32;
|
||||
}
|
||||
|
||||
for (i = avx2_iters * 32; i < num_points ; ++i)
|
||||
{
|
||||
*c++ = (*a++) * (*b++);
|
||||
}
|
||||
}
|
||||
#endif /* LV_HAVE_SSE3 */
|
||||
|
||||
|
||||
#ifdef LV_HAVE_SSE3
|
||||
#include <pmmintrin.h>
|
||||
@ -176,6 +226,57 @@ static inline void volk_gnsssdr_8u_x2_multiply_8u_a_sse3(unsigned char* cChar, c
|
||||
#endif /* LV_HAVE_SSE */
|
||||
|
||||
|
||||
#ifdef LV_HAVE_AVX2
|
||||
#include <immintrin.h>
|
||||
|
||||
static inline void volk_gnsssdr_8u_x2_multiply_8u_a_avx2(unsigned char* cChar, const unsigned char* aChar, const unsigned char* bChar, unsigned int num_points)
|
||||
{
|
||||
const unsigned int avx2_iters = num_points / 32;
|
||||
unsigned int number;
|
||||
unsigned int i;
|
||||
|
||||
__m256i x, y, x1, x2, y1, y2, mult1, x1_mult_y1, x2_mult_y2, tmp, tmp1, tmp2, totalc;
|
||||
unsigned char* c = cChar;
|
||||
const unsigned char* a = aChar;
|
||||
const unsigned char* b = bChar;
|
||||
|
||||
for(number = 0; number < avx2_iters; number++)
|
||||
{
|
||||
x = _mm256_load_si256((__m256i*)a);
|
||||
y = _mm256_load_si256((__m256i*)b);
|
||||
|
||||
mult1 = _mm256_set_epi8(0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF);
|
||||
x1 = _mm256_srli_si256(x, 1);
|
||||
x1 = _mm256_and_si256(x1, mult1);
|
||||
x2 = _mm256_and_si256(x, mult1);
|
||||
|
||||
y1 = _mm256_srli_si256(y, 1);
|
||||
y1 = _mm256_and_si256(y1, mult1);
|
||||
y2 = _mm256_and_si256(y, mult1);
|
||||
|
||||
x1_mult_y1 = _mm256_mullo_epi16(x1, y1);
|
||||
x2_mult_y2 = _mm256_mullo_epi16(x2, y2);
|
||||
|
||||
tmp = _mm256_and_si256(x1_mult_y1, mult1);
|
||||
tmp1 = _mm256_slli_si256(tmp, 1);
|
||||
tmp2 = _mm256_and_si256(x2_mult_y2, mult1);
|
||||
totalc = _mm256_or_si256(tmp1, tmp2);
|
||||
|
||||
_mm256_store_si256((__m256i*)c, totalc);
|
||||
|
||||
a += 32;
|
||||
b += 32;
|
||||
c += 32;
|
||||
}
|
||||
|
||||
for (i = avx2_iters * 32; i < num_points ; ++i)
|
||||
{
|
||||
*c++ = (*a++) * (*b++);
|
||||
}
|
||||
}
|
||||
#endif /* LV_HAVE_SSE3 */
|
||||
|
||||
|
||||
#ifdef LV_HAVE_ORC
|
||||
|
||||
extern void volk_gnsssdr_8u_x2_multiply_8u_a_orc_impl(unsigned char* cVector, const unsigned char* aVector, const unsigned char* bVector, unsigned int num_points);
|
||||
|
Loading…
Reference in New Issue
Block a user