1
0
mirror of https://github.com/gnss-sdr/gnss-sdr synced 2024-06-26 06:53:14 +00:00

avx2: Adds support for AVX2 intrinsic to some kernels

Adds AVX2 intrinsic to several kernels ranging from multiply and add to
more detailed operations such as convert. New peotokernels also may take
advantage of the fused multiply add (fma)
This commit is contained in:
Damian Miralles 2017-08-08 08:19:42 -07:00 committed by Carles Fernandez
parent 6e65705b47
commit f4c221609c
10 changed files with 1001 additions and 5 deletions

View File

@ -164,6 +164,58 @@ static inline void volk_gnsssdr_32fc_convert_16ic_u_sse(lv_16sc_t* outputVector,
}
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
static inline void volk_gnsssdr_32fc_convert_16ic_u_avx2(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
{
const unsigned int avx2_iters = num_points / 8;
float* inputVectorPtr = (float*)inputVector;
int16_t* outputVectorPtr = (int16_t*)outputVector;
float aux;
unsigned int i;
const float min_val = (float)SHRT_MIN; ///todo Something off here, compiler does not perform right cast
const float max_val = (float)SHRT_MAX;
__m256 inputVal1, inputVal2;
__m256i intInputVal1, intInputVal2;
__m256 ret1, ret2;
const __m256 vmin_val = _mm256_set1_ps(min_val);
const __m256 vmax_val = _mm256_set1_ps(max_val);
for(i = 0; i < avx2_iters; i++)
{
inputVal1 = _mm256_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 8;
inputVal2 = _mm256_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 8;
__VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 16);
// Clip
ret1 = _mm256_max_ps(_mm256_min_ps(inputVal1, vmax_val), vmin_val);
ret2 = _mm256_max_ps(_mm256_min_ps(inputVal2, vmax_val), vmin_val);
intInputVal1 = _mm256_cvtps_epi32(ret1);
intInputVal2 = _mm256_cvtps_epi32(ret2);
intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
_mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal1);
outputVectorPtr += 16;
}
for(i = avx2_iters * 16; i < num_points * 2; i++)
{
aux = *inputVectorPtr++;
if(aux > max_val)
aux = max_val;
else if(aux < min_val)
aux = min_val;
*outputVectorPtr++ = (int16_t)rintf(aux);
}
}
#endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>
@ -269,6 +321,59 @@ static inline void volk_gnsssdr_32fc_convert_16ic_a_sse(lv_16sc_t* outputVector,
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
static inline void volk_gnsssdr_32fc_convert_16ic_a_avx2(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
{
const unsigned int avx2_iters = num_points / 8;
float* inputVectorPtr = (float*)inputVector;
int16_t* outputVectorPtr = (int16_t*)outputVector;
float aux;
unsigned int i;
const float min_val = (float)SHRT_MIN; ///todo Something off here, compiler does not perform right cast
const float max_val = (float)SHRT_MAX;
__m256 inputVal1, inputVal2;
__m256i intInputVal1, intInputVal2;
__m256 ret1, ret2;
const __m256 vmin_val = _mm256_set1_ps(min_val);
const __m256 vmax_val = _mm256_set1_ps(max_val);
for(i = 0; i < avx2_iters; i++)
{
inputVal1 = _mm256_load_ps((float*)inputVectorPtr); inputVectorPtr += 8;
inputVal2 = _mm256_load_ps((float*)inputVectorPtr); inputVectorPtr += 8;
__VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 16);
// Clip
ret1 = _mm256_max_ps(_mm256_min_ps(inputVal1, vmax_val), vmin_val);
ret2 = _mm256_max_ps(_mm256_min_ps(inputVal2, vmax_val), vmin_val);
intInputVal1 = _mm256_cvtps_epi32(ret1);
intInputVal2 = _mm256_cvtps_epi32(ret2);
intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
_mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1);
outputVectorPtr += 16;
}
for(i = avx2_iters * 16; i < num_points * 2; i++)
{
aux = *inputVectorPtr++;
if(aux > max_val)
aux = max_val;
else if(aux < min_val)
aux = min_val;
*outputVectorPtr++ = (int16_t)rintf(aux);
}
}
#endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_NEON
#include <arm_neon.h>

View File

@ -85,6 +85,146 @@ static inline void volk_gnsssdr_32fc_convert_8ic_generic(lv_8sc_t* outputVector,
#endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
static inline void volk_gnsssdr_32fc_convert_8ic_u_avx2(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
{
const unsigned int avx2_iters = num_points / 16;
float* inputVectorPtr = (float*)inputVector;
int8_t* outputVectorPtr = (int8_t*)outputVector;
const float min_val = (float)SCHAR_MIN;
const float max_val = (float)SCHAR_MAX;
float aux;
unsigned int i;
__m256 inputVal1, inputVal2, inputVal3, inputVal4;
__m256i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
__m256i int8InputVal;
__m256 ret1, ret2, ret3, ret4;
const __m256 vmin_val = _mm256_set1_ps(min_val);
const __m256 vmax_val = _mm256_set1_ps(max_val);
for(i = 0; i < avx2_iters; i++)
{
inputVal1 = _mm256_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 8;
inputVal2 = _mm256_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 8;
inputVal3 = _mm256_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 8;
inputVal4 = _mm256_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 8;
__VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 32);
inputVal1 = _mm256_mul_ps(inputVal1, vmax_val);
inputVal2 = _mm256_mul_ps(inputVal2, vmax_val);
inputVal3 = _mm256_mul_ps(inputVal3, vmax_val);
inputVal4 = _mm256_mul_ps(inputVal4, vmax_val);
// Clip
ret1 = _mm256_max_ps(_mm256_min_ps(inputVal1, vmax_val), vmin_val);
ret2 = _mm256_max_ps(_mm256_min_ps(inputVal2, vmax_val), vmin_val);
ret3 = _mm256_max_ps(_mm256_min_ps(inputVal3, vmax_val), vmin_val);
ret4 = _mm256_max_ps(_mm256_min_ps(inputVal4, vmax_val), vmin_val);
intInputVal1 = _mm256_cvtps_epi32(ret1);
intInputVal2 = _mm256_cvtps_epi32(ret2);
intInputVal3 = _mm256_cvtps_epi32(ret3);
intInputVal4 = _mm256_cvtps_epi32(ret4);
intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
intInputVal2 = _mm256_packs_epi32(intInputVal3, intInputVal4);
intInputVal2 = _mm256_permute4x64_epi64(intInputVal2, 0b11011000);
int8InputVal = _mm256_packs_epi16(intInputVal1, intInputVal2);
int8InputVal = _mm256_permute4x64_epi64(int8InputVal, 0b11011000);
_mm256_storeu_si256((__m256i*)outputVectorPtr, int8InputVal);
outputVectorPtr += 32;
}
for(i = avx2_iters * 32; i < num_points * 2; i++)
{
aux = *inputVectorPtr++ * max_val;
if(aux > max_val)
aux = max_val;
else if(aux < min_val)
aux = min_val;
*outputVectorPtr++ = (int8_t)rintf(aux);
}
}
#endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
static inline void volk_gnsssdr_32fc_convert_8ic_a_avx2(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
{
const unsigned int avx2_iters = num_points / 16;
float* inputVectorPtr = (float*)inputVector;
int8_t* outputVectorPtr = (int8_t*)outputVector;
const float min_val = (float)SCHAR_MIN;
const float max_val = (float)SCHAR_MAX;
float aux;
unsigned int i;
__m256 inputVal1, inputVal2, inputVal3, inputVal4;
__m256i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
__m256i int8InputVal;
__m256 ret1, ret2, ret3, ret4;
const __m256 vmin_val = _mm256_set1_ps(min_val);
const __m256 vmax_val = _mm256_set1_ps(max_val);
for(i = 0; i < avx2_iters; i++)
{
inputVal1 = _mm256_load_ps((float*)inputVectorPtr); inputVectorPtr += 8;
inputVal2 = _mm256_load_ps((float*)inputVectorPtr); inputVectorPtr += 8;
inputVal3 = _mm256_load_ps((float*)inputVectorPtr); inputVectorPtr += 8;
inputVal4 = _mm256_load_ps((float*)inputVectorPtr); inputVectorPtr += 8;
__VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 32);
inputVal1 = _mm256_mul_ps(inputVal1, vmax_val);
inputVal2 = _mm256_mul_ps(inputVal2, vmax_val);
inputVal3 = _mm256_mul_ps(inputVal3, vmax_val);
inputVal4 = _mm256_mul_ps(inputVal4, vmax_val);
// Clip
ret1 = _mm256_max_ps(_mm256_min_ps(inputVal1, vmax_val), vmin_val);
ret2 = _mm256_max_ps(_mm256_min_ps(inputVal2, vmax_val), vmin_val);
ret3 = _mm256_max_ps(_mm256_min_ps(inputVal3, vmax_val), vmin_val);
ret4 = _mm256_max_ps(_mm256_min_ps(inputVal4, vmax_val), vmin_val);
intInputVal1 = _mm256_cvtps_epi32(ret1);
intInputVal2 = _mm256_cvtps_epi32(ret2);
intInputVal3 = _mm256_cvtps_epi32(ret3);
intInputVal4 = _mm256_cvtps_epi32(ret4);
intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
intInputVal2 = _mm256_packs_epi32(intInputVal3, intInputVal4);
intInputVal2 = _mm256_permute4x64_epi64(intInputVal2, 0b11011000);
int8InputVal = _mm256_packs_epi16(intInputVal1, intInputVal2);
int8InputVal = _mm256_permute4x64_epi64(int8InputVal, 0b11011000);
_mm256_store_si256((__m256i*)outputVectorPtr, int8InputVal);
outputVectorPtr += 32;
}
for(i = avx2_iters * 32; i < num_points * 2; i++)
{
aux = *inputVectorPtr++ * max_val;
if(aux > max_val)
aux = max_val;
else if(aux < min_val)
aux = min_val;
*outputVectorPtr++ = (int8_t)rintf(aux);
}
}
#endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>

View File

@ -71,7 +71,7 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_generic(lv_32fc_t* r
}
#endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_SSE3
static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_a_sse3(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points)
@ -248,6 +248,64 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_u_avx(lv_32fc_t* res
#endif
#ifdef LV_HAVE_AVX2
static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_a_avx2(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points)
{
float code_phase_step_chips = -0.6;
int code_length_chips = 1023;
int num_out_vectors = 3;
float rem_code_phase_chips = -0.234;
unsigned int n;
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
for(n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
}
volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx2(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
memcpy((lv_32fc_t*)result, (lv_32fc_t*)result_aux[0], sizeof(lv_32fc_t) * num_points);
for(n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
volk_gnsssdr_free(result_aux);
}
#endif
#ifdef LV_HAVE_AVX2
static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_u_avx2(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points)
{
float code_phase_step_chips = -0.6;
int code_length_chips = 1023;
int num_out_vectors = 3;
float rem_code_phase_chips = -0.234;
unsigned int n;
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
for(n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
}
volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx2(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
memcpy((lv_32fc_t*)result, (lv_32fc_t*)result_aux[0], sizeof(lv_32fc_t) * num_points);
for(n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
volk_gnsssdr_free(result_aux);
}
#endif
#ifdef LV_HAVE_NEON
static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_neon(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points)
{

View File

@ -160,7 +160,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_sse3(lv_32fc_t** res
}
}
#endif
#endif
#ifdef LV_HAVE_SSE3
@ -295,7 +295,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_sse4_1(lv_32fc_t** r
}
}
#endif
#endif
#ifdef LV_HAVE_SSE4_1
@ -518,6 +518,162 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx(lv_32fc_t** resu
#endif
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx2(lv_32fc_t** result, const lv_32fc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, float* shifts_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_points)
{
lv_32fc_t** _result = result;
const unsigned int avx_iters = num_points / 8;
int current_correlator_tap;
unsigned int n;
unsigned int k;
const __m256 eights = _mm256_set1_ps(8.0f);
const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips);
const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips);
__VOLK_ATTR_ALIGNED(32) int local_code_chip_index[8];
int local_code_chip_index_;
const __m256 zeros = _mm256_setzero_ps();
const __m256 code_length_chips_reg_f = _mm256_set1_ps((float)code_length_chips);
const __m256 n0 = _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f);
__m256i local_code_chip_index_reg, i;
__m256 aux, aux2, aux3, shifts_chips_reg, c, cTrunc, base, negatives, indexn;
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
{
shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]);
aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
indexn = n0;
for(n = 0; n < avx_iters; n++)
{
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0);
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3);
aux = _mm256_fmadd_ps(code_phase_step_chips_reg, indexn, aux2);
//aux = _mm256_add_ps(aux, aux2);
// floor
aux = _mm256_floor_ps(aux);
// fmod
c = _mm256_div_ps(aux, code_length_chips_reg_f);
//_mm_fmsub_ps(c, code_length_chips_reg_f, aux)
i = _mm256_cvttps_epi32(c);
cTrunc = _mm256_cvtepi32_ps(i);
base = _mm256_fnmadd_ps(cTrunc, code_length_chips_reg_f, aux);
local_code_chip_index_reg = _mm256_cvttps_epi32(base);
// no negatives
c = _mm256_cvtepi32_ps(local_code_chip_index_reg);
negatives = _mm256_cmp_ps(c, zeros, 0x01 );
aux3 = _mm256_and_ps(code_length_chips_reg_f, negatives);
aux = _mm256_add_ps(c, aux3);
local_code_chip_index_reg = _mm256_cvttps_epi32(aux);
_mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg);
for(k = 0; k < 8; ++k)
{
_result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]];
}
indexn = _mm256_add_ps(indexn, eights);
}
}
_mm256_zeroupper();
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
{
for(n = avx_iters * 8; n < num_points; n++)
{
// resample code for current tap
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
//Take into account that in multitap correlators, the shifts can be negative!
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1) ;
local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
_result[current_correlator_tap][n] = local_code[local_code_chip_index_];
}
}
}
#endif
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx2(lv_32fc_t** result, const lv_32fc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, float* shifts_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_points)
{
lv_32fc_t** _result = result;
const unsigned int avx_iters = num_points / 8;
int current_correlator_tap;
unsigned int n;
unsigned int k;
const __m256 eights = _mm256_set1_ps(8.0f);
const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips);
const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips);
__VOLK_ATTR_ALIGNED(32) int local_code_chip_index[8];
int local_code_chip_index_;
const __m256 zeros = _mm256_setzero_ps();
const __m256 code_length_chips_reg_f = _mm256_set1_ps((float)code_length_chips);
const __m256 n0 = _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f);
__m256i local_code_chip_index_reg, i;
__m256 aux, aux2, aux3, shifts_chips_reg, c, cTrunc, base, negatives, indexn;
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
{
shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]);
aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
indexn = n0;
for(n = 0; n < avx_iters; n++)
{
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0);
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3);
aux = _mm256_fmadd_ps(code_phase_step_chips_reg, indexn, aux2);
//aux = _mm256_mul_ps(code_phase_step_chips_reg, indexn);
//aux = _mm256_add_ps(aux, aux2);
// floor
aux = _mm256_floor_ps(aux);
// fmod
c = _mm256_div_ps(aux, code_length_chips_reg_f);
i = _mm256_cvttps_epi32(c);
cTrunc = _mm256_cvtepi32_ps(i);
base = _mm256_fnmadd_ps(cTrunc, code_length_chips_reg_f, aux);
local_code_chip_index_reg = _mm256_cvttps_epi32(base);
// no negatives
c = _mm256_cvtepi32_ps(local_code_chip_index_reg);
negatives = _mm256_cmp_ps(c, zeros, 0x01 );
aux3 = _mm256_and_ps(code_length_chips_reg_f, negatives);
aux = _mm256_add_ps(c, aux3);
local_code_chip_index_reg = _mm256_cvttps_epi32(aux);
_mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg);
for(k = 0; k < 8; ++k)
{
_result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]];
}
indexn = _mm256_add_ps(indexn, eights);
}
}
_mm256_zeroupper();
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
{
for(n = avx_iters * 8; n < num_points; n++)
{
// resample code for current tap
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
//Take into account that in multitap correlators, the shifts can be negative!
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1) ;
local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
_result[current_correlator_tap][n] = local_code[local_code_chip_index_];
}
}
}
#endif
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
@ -604,4 +760,3 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_neon(lv_32fc_t** resul
#endif /*INCLUDED_volk_gnsssdr_32fc_xn_resampler_32fc_xn_H*/

View File

@ -152,6 +152,83 @@ static inline void volk_gnsssdr_8i_accumulator_s8i_a_sse3(char* result, const ch
#endif /* LV_HAVE_SSE3 */
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
static inline void volk_gnsssdr_8i_accumulator_s8i_a_avx2(char* result, const char* inputBuffer, unsigned int num_points)
{
char returnValue = 0;
const unsigned int sse_iters = num_points / 32;
unsigned int number;
unsigned int i;
const char* aPtr = inputBuffer;
__VOLK_ATTR_ALIGNED(32) char tempBuffer[32];
__m256i accumulator = _mm256_setzero_si256();
__m256i aVal = _mm256_setzero_si256();
for(number = 0; number < sse_iters; number++)
{
aVal = _mm256_load_si256((__m256i*)aPtr);
accumulator = _mm256_add_epi8(accumulator, aVal);
aPtr += 32;
}
_mm256_store_si256((__m256i*)tempBuffer,accumulator);
for(i = 0; i < 32; ++i)
{
returnValue += tempBuffer[i];
}
for(i = 0; i < (num_points % 32); ++i)
{
returnValue += (*aPtr++);
}
*result = returnValue;
}
#endif /* LV_HAVE_SSE3 */
#ifdef LV_HAVE_AVX2
#include <pmmintrin.h>
static inline void volk_gnsssdr_8i_accumulator_s8i_u_avx2(char* result, const char* inputBuffer, unsigned int num_points)
{
char returnValue = 0;
const unsigned int sse_iters = num_points / 32;
unsigned int number;
unsigned int i;
const char* aPtr = inputBuffer;
__VOLK_ATTR_ALIGNED(32) char tempBuffer[32];
__m256i accumulator = _mm256_setzero_si256();
__m256i aVal = _mm256_setzero_si256();
for(number = 0; number < sse_iters; number++)
{
aVal = _mm256_lddqu_si256((__m256i*)aPtr);
accumulator = _mm256_add_epi8(accumulator, aVal);
aPtr += 32;
}
_mm256_storeu_si256((__m256i*)tempBuffer, accumulator);
for(i = 0; i < 32; ++i)
{
returnValue += tempBuffer[i];
}
for(i = 0; i < (num_points % 32); ++i)
{
returnValue += (*aPtr++);
}
*result = returnValue;
}
#endif /* LV_HAVE_SSE3 */
#ifdef LV_HAVE_ORC
extern void volk_gnsssdr_8i_accumulator_s8i_a_orc_impl(short* result, const char* inputBuffer, unsigned int num_points);
@ -169,4 +246,3 @@ static inline void volk_gnsssdr_8i_accumulator_s8i_u_orc(char* result, const cha
#endif /* LV_HAVE_ORC */
#endif /* INCLUDED_volk_gnsssdr_8i_accumulator_s8i_H */

View File

@ -58,6 +58,71 @@
#include <volk_gnsssdr/volk_gnsssdr_common.h>
#ifdef LV_HAVE_AVX2
#include<immintrin.h>
static inline void volk_gnsssdr_8i_index_max_16u_u_avx2(unsigned int* target, const char* src0, unsigned int num_points)
{
if(num_points > 0)
{
const unsigned int avx2_iters = num_points / 32;
unsigned int number;
unsigned int i;
char* basePtr = (char*)src0;
char* inputPtr = (char*)src0;
char max = src0[0];
unsigned int index = 0;
unsigned int mask;
__VOLK_ATTR_ALIGNED(32) char currentValuesBuffer[32];
__m256i maxValues, compareResults, currentValues;
maxValues = _mm256_set1_epi8(max);
for(number = 0; number < avx2_iters; number++)
{
currentValues = _mm256_loadu_si256((__m256i*)inputPtr);
compareResults = _mm256_cmpgt_epi8(maxValues, currentValues);
mask = _mm256_movemask_epi8(compareResults);
if (mask != 0xFFFFFFFF)
{
_mm256_storeu_si256((__m256i*)&currentValuesBuffer, currentValues);
mask = ~mask;
i = 0;
while (mask > 0)
{
if ((mask & 1) == 1)
{
if(currentValuesBuffer[i] > max)
{
index = inputPtr - basePtr + i;
max = currentValuesBuffer[i];
}
}
i++;
mask >>= 1;
}
maxValues = _mm256_set1_epi8(max);
}
inputPtr += 32;
}
for(i = 0; i<(num_points % 32); ++i)
{
if(src0[i] > max)
{
index = i;
max = src0[i];
}
}
target[0] = index;
}
}
#endif /*LV_HAVE_AVX2*/
#ifdef LV_HAVE_AVX
#include <immintrin.h>
@ -271,6 +336,70 @@ static inline void volk_gnsssdr_8i_index_max_16u_generic(unsigned int* target, c
#endif /*LV_HAVE_GENERIC*/
#ifdef LV_HAVE_AVX2
#include<immintrin.h>
static inline void volk_gnsssdr_8i_index_max_16u_a_avx2(unsigned int* target, const char* src0, unsigned int num_points)
{
if(num_points > 0)
{
const unsigned int avx2_iters = num_points / 32;
unsigned int number;
unsigned int i;
char* basePtr = (char*)src0;
char* inputPtr = (char*)src0;
char max = src0[0];
unsigned int index = 0;
unsigned int mask;
__VOLK_ATTR_ALIGNED(32) char currentValuesBuffer[32];
__m256i maxValues, compareResults, currentValues;
maxValues = _mm256_set1_epi8(max);
for(number = 0; number < avx2_iters; number++)
{
currentValues = _mm256_load_si256((__m256i*)inputPtr);
compareResults = _mm256_cmpgt_epi8(maxValues, currentValues);
mask = _mm256_movemask_epi8(compareResults);
if (mask != 0xFFFFFFFF)
{
_mm256_store_si256((__m256i*)&currentValuesBuffer, currentValues);
mask = ~mask;
i = 0;
while (mask > 0)
{
if ((mask & 1) == 1)
{
if(currentValuesBuffer[i] > max)
{
index = inputPtr - basePtr + i;
max = currentValuesBuffer[i];
}
}
i++;
mask >>= 1;
}
maxValues = _mm256_set1_epi8(max);
}
inputPtr += 32;
}
for(i = 0; i<(num_points % 32); ++i)
{
if(src0[i] > max)
{
index = i;
max = src0[i];
}
}
target[0] = index;
}
}
#endif /*LV_HAVE_AVX2*/
#ifdef LV_HAVE_AVX
#include <immintrin.h>

View File

@ -58,6 +58,55 @@
#include <volk_gnsssdr/volk_gnsssdr_common.h>
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
static inline void volk_gnsssdr_8i_max_s8i_u_avx2(char* target, const char* src0, unsigned int num_points)
{
if(num_points > 0)
{
const unsigned int avx_iters = num_points / 32;
unsigned int number;
unsigned int i;
char* inputPtr = (char*)src0;
char max = src0[0];
__VOLK_ATTR_ALIGNED(32) char maxValuesBuffer[32];
__m256i maxValues, compareResults, currentValues;
maxValues = _mm256_set1_epi8(max);
for(number = 0; number < avx_iters; number++)
{
currentValues = _mm256_loadu_si256((__m256i*)inputPtr);
compareResults = _mm256_max_epi8(maxValues, currentValues);
maxValues = compareResults;
inputPtr += 32;
}
_mm256_storeu_si256((__m256i*)maxValuesBuffer, maxValues);
for(i = 0; i < 32; ++i)
{
if(maxValuesBuffer[i] > max)
{
max = maxValuesBuffer[i];
}
}
for(i = avx_iters * 32; i < num_points; ++i)
{
if(src0[i] > max)
{
max = src0[i];
}
}
target[0] = max;
}
}
#endif /*LV_HAVE_SSE4_1*/
#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>
@ -238,6 +287,55 @@ static inline void volk_gnsssdr_8i_max_s8i_a_sse4_1(char* target, const char* sr
#endif /*LV_HAVE_SSE4_1*/
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
static inline void volk_gnsssdr_8i_max_s8i_a_avx2(char* target, const char* src0, unsigned int num_points)
{
if(num_points > 0)
{
const unsigned int avx_iters = num_points / 32;
unsigned int number;
unsigned int i;
char* inputPtr = (char*)src0;
char max = src0[0];
__VOLK_ATTR_ALIGNED(32) char maxValuesBuffer[32];
__m256i maxValues, compareResults, currentValues;
maxValues = _mm256_set1_epi8(max);
for(number = 0; number < avx_iters; number++)
{
currentValues = _mm256_load_si256((__m256i*)inputPtr);
compareResults = _mm256_max_epi8(maxValues, currentValues);
maxValues = compareResults; //_mm256_blendv_epi8(currentValues, maxValues, compareResults);
inputPtr += 32;
}
_mm256_store_si256((__m256i*)maxValuesBuffer, maxValues);
for(i = 0; i < 32; ++i)
{
if(maxValuesBuffer[i] > max)
{
max = maxValuesBuffer[i];
}
}
for(i = avx_iters * 32; i < num_points; ++i)
{
if(src0[i] > max)
{
max = src0[i];
}
}
target[0] = max;
}
}
#endif /*LV_HAVE_SSE4_1*/
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>

View File

@ -94,6 +94,42 @@ static inline void volk_gnsssdr_8i_x2_add_8i_u_sse2(char* cVector, const char* a
#endif /* LV_HAVE_SSE2 */
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
static inline void volk_gnsssdr_8i_x2_add_8i_u_avx2(char* cVector, const char* aVector, const char* bVector, unsigned int num_points)
{
const unsigned int avx_iters = num_points / 32;
unsigned int number;
unsigned int i;
char* cPtr = cVector;
const char* aPtr = aVector;
const char* bPtr = bVector;
__m256i aVal, bVal, cVal;
for(number = 0; number < avx_iters; number++)
{
aVal = _mm256_loadu_si256((__m256i*)aPtr);
bVal = _mm256_loadu_si256((__m256i*)bPtr);
cVal = _mm256_add_epi8(aVal, bVal);
_mm256_storeu_si256((__m256i*)cPtr, cVal); // Store the results back into the C container
aPtr += 32;
bPtr += 32;
cPtr += 32;
}
for(i = avx_iters * 32; i < num_points; ++i)
{
*cPtr++ = (*aPtr++) + (*bPtr++);
}
}
#endif /* LV_HAVE_SSE2 */
#ifdef LV_HAVE_GENERIC
static inline void volk_gnsssdr_8i_x2_add_8i_generic(char* cVector, const char* aVector, const char* bVector, unsigned int num_points)
@ -147,6 +183,42 @@ static inline void volk_gnsssdr_8i_x2_add_8i_a_sse2(char* cVector, const char* a
#endif /* LV_HAVE_SSE2 */
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
static inline void volk_gnsssdr_8i_x2_add_8i_a_avx2(char* cVector, const char* aVector, const char* bVector, unsigned int num_points)
{
const unsigned int avx_iters = num_points / 32;
unsigned int number;
unsigned int i;
char* cPtr = cVector;
const char* aPtr = aVector;
const char* bPtr = bVector;
__m256i aVal, bVal, cVal;
for(number = 0; number < avx_iters; number++)
{
aVal = _mm256_load_si256((__m256i*)aPtr);
bVal = _mm256_load_si256((__m256i*)bPtr);
cVal = _mm256_add_epi8(aVal, bVal);
_mm256_store_si256((__m256i*)cPtr, cVal); // Store the results back into the C container
aPtr += 32;
bPtr += 32;
cPtr += 32;
}
for(i = avx_iters * 32; i < num_points; ++i)
{
*cPtr++ = (*aPtr++) + (*bPtr++);
}
}
#endif /* LV_HAVE_SSE2 */
#ifdef LV_HAVE_ORC
extern void volk_gnsssdr_8i_x2_add_8i_a_orc_impl(char* cVector, const char* aVector, const char* bVector, unsigned int num_points);

View File

@ -59,6 +59,37 @@
#include <volk_gnsssdr/volk_gnsssdr_complex.h>
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
static inline void volk_gnsssdr_8ic_conjugate_8ic_u_avx2(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points)
{
const unsigned int avx2_iters = num_points / 16;
unsigned int i;
lv_8sc_t* c = cVector;
const lv_8sc_t* a = aVector;
__m256i tmp;
__m256i conjugator = _mm256_setr_epi8(1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1);
for (i = 0; i < avx2_iters; ++i)
{
tmp = _mm256_loadu_si256((__m256i*)a);
tmp = _mm256_sign_epi8(tmp, conjugator);
_mm256_storeu_si256((__m256i*)c, tmp);
a += 16;
c += 16;
}
for (i = avx2_iters * 16; i < num_points; ++i)
{
*c++ = lv_conj(*a++);
}
}
#endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_AVX
#include <immintrin.h>
@ -217,6 +248,37 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_a_avx(lv_8sc_t* cVector, const
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
static inline void volk_gnsssdr_8ic_conjugate_8ic_a_avx2(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points)
{
const unsigned int avx2_iters = num_points / 16;
unsigned int i;
lv_8sc_t* c = cVector;
const lv_8sc_t* a = aVector;
__m256i tmp;
__m256i conjugator = _mm256_setr_epi8(1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1);
for (i = 0; i < avx2_iters; ++i)
{
tmp = _mm256_load_si256((__m256i*)a);
tmp = _mm256_sign_epi8(tmp, conjugator);
_mm256_store_si256((__m256i*)c, tmp);
a += 16;
c += 16;
}
for (i = avx2_iters * 16; i < num_points; ++i)
{
*c++ = lv_conj(*a++);
}
}
#endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_SSSE3
#include <tmmintrin.h>

View File

@ -58,6 +58,56 @@
#ifndef INCLUDED_volk_gnsssdr_8u_x2_multiply_8u_H
#define INCLUDED_volk_gnsssdr_8u_x2_multiply_8u_H
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
static inline void volk_gnsssdr_8u_x2_multiply_8u_u_avx2(unsigned char* cChar, const unsigned char* aChar, const unsigned char* bChar, unsigned int num_points)
{
const unsigned int avx2_iters = num_points / 32;
unsigned int number;
unsigned int i;
__m256i x, y, x1, x2, y1, y2, mult1, x1_mult_y1, x2_mult_y2, tmp, tmp1, tmp2, totalc;
unsigned char* c = cChar;
const unsigned char* a = aChar;
const unsigned char* b = bChar;
for(number = 0; number < avx2_iters; number++)
{
x = _mm256_loadu_si256((__m256i*)a);
y = _mm256_loadu_si256((__m256i*)b);
mult1 = _mm256_set_epi8(0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF);
x1 = _mm256_srli_si256(x, 1);
x1 = _mm256_and_si256(x1, mult1);
x2 = _mm256_and_si256(x, mult1);
y1 = _mm256_srli_si256(y, 1);
y1 = _mm256_and_si256(y1, mult1);
y2 = _mm256_and_si256(y, mult1);
x1_mult_y1 = _mm256_mullo_epi16(x1, y1);
x2_mult_y2 = _mm256_mullo_epi16(x2, y2);
tmp = _mm256_and_si256(x1_mult_y1, mult1);
tmp1 = _mm256_slli_si256(tmp, 1);
tmp2 = _mm256_and_si256(x2_mult_y2, mult1);
totalc = _mm256_or_si256(tmp1, tmp2);
_mm256_storeu_si256((__m256i*)c, totalc);
a += 32;
b += 32;
c += 32;
}
for (i = avx2_iters * 32; i < num_points ; ++i)
{
*c++ = (*a++) * (*b++);
}
}
#endif /* LV_HAVE_SSE3 */
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
@ -176,6 +226,57 @@ static inline void volk_gnsssdr_8u_x2_multiply_8u_a_sse3(unsigned char* cChar, c
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
static inline void volk_gnsssdr_8u_x2_multiply_8u_a_avx2(unsigned char* cChar, const unsigned char* aChar, const unsigned char* bChar, unsigned int num_points)
{
const unsigned int avx2_iters = num_points / 32;
unsigned int number;
unsigned int i;
__m256i x, y, x1, x2, y1, y2, mult1, x1_mult_y1, x2_mult_y2, tmp, tmp1, tmp2, totalc;
unsigned char* c = cChar;
const unsigned char* a = aChar;
const unsigned char* b = bChar;
for(number = 0; number < avx2_iters; number++)
{
x = _mm256_load_si256((__m256i*)a);
y = _mm256_load_si256((__m256i*)b);
mult1 = _mm256_set_epi8(0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF);
x1 = _mm256_srli_si256(x, 1);
x1 = _mm256_and_si256(x1, mult1);
x2 = _mm256_and_si256(x, mult1);
y1 = _mm256_srli_si256(y, 1);
y1 = _mm256_and_si256(y1, mult1);
y2 = _mm256_and_si256(y, mult1);
x1_mult_y1 = _mm256_mullo_epi16(x1, y1);
x2_mult_y2 = _mm256_mullo_epi16(x2, y2);
tmp = _mm256_and_si256(x1_mult_y1, mult1);
tmp1 = _mm256_slli_si256(tmp, 1);
tmp2 = _mm256_and_si256(x2_mult_y2, mult1);
totalc = _mm256_or_si256(tmp1, tmp2);
_mm256_store_si256((__m256i*)c, totalc);
a += 32;
b += 32;
c += 32;
}
for (i = avx2_iters * 32; i < num_points ; ++i)
{
*c++ = (*a++) * (*b++);
}
}
#endif /* LV_HAVE_SSE3 */
#ifdef LV_HAVE_ORC
extern void volk_gnsssdr_8u_x2_multiply_8u_a_orc_impl(unsigned char* cVector, const unsigned char* aVector, const unsigned char* bVector, unsigned int num_points);