From f4c221609c50cd9062ad436822d44cd540946b44 Mon Sep 17 00:00:00 2001 From: Damian Miralles Date: Tue, 8 Aug 2017 08:19:42 -0700 Subject: [PATCH] avx2: Adds support for AVX2 intrinsic to some kernels Adds AVX2 intrinsic to several kernels ranging from multiply and add to more detailed operations such as convert. New peotokernels also may take advantage of the fused multiply add (fma) --- .../volk_gnsssdr_32fc_convert_16ic.h | 105 ++++++++++++ .../volk_gnsssdr_32fc_convert_8ic.h | 140 +++++++++++++++ ...volk_gnsssdr_32fc_resamplerxnpuppet_32fc.h | 60 ++++++- .../volk_gnsssdr_32fc_xn_resampler_32fc_xn.h | 161 +++++++++++++++++- .../volk_gnsssdr_8i_accumulator_s8i.h | 78 ++++++++- .../volk_gnsssdr_8i_index_max_16u.h | 129 ++++++++++++++ .../volk_gnsssdr/volk_gnsssdr_8i_max_s8i.h | 98 +++++++++++ .../volk_gnsssdr/volk_gnsssdr_8i_x2_add_8i.h | 72 ++++++++ .../volk_gnsssdr_8ic_conjugate_8ic.h | 62 +++++++ .../volk_gnsssdr_8u_x2_multiply_8u.h | 101 +++++++++++ 10 files changed, 1001 insertions(+), 5 deletions(-) diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_16ic.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_16ic.h index c4aeea1a2..b04a93c4b 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_16ic.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_16ic.h @@ -164,6 +164,58 @@ static inline void volk_gnsssdr_32fc_convert_16ic_u_sse(lv_16sc_t* outputVector, } #endif /* LV_HAVE_SSE */ +#ifdef LV_HAVE_AVX2 +#include + +static inline void volk_gnsssdr_32fc_convert_16ic_u_avx2(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points) +{ + const unsigned int avx2_iters = num_points / 8; + + float* inputVectorPtr = (float*)inputVector; + int16_t* outputVectorPtr = (int16_t*)outputVector; + float aux; + unsigned int i; + const float min_val = (float)SHRT_MIN; ///todo Something off here, compiler does not perform right cast + const float max_val = (float)SHRT_MAX; + + __m256 inputVal1, inputVal2; + __m256i intInputVal1, intInputVal2; + __m256 ret1, ret2; + const __m256 vmin_val = _mm256_set1_ps(min_val); + const __m256 vmax_val = _mm256_set1_ps(max_val); + + for(i = 0; i < avx2_iters; i++) + { + inputVal1 = _mm256_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 8; + inputVal2 = _mm256_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 8; + __VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 16); + + // Clip + ret1 = _mm256_max_ps(_mm256_min_ps(inputVal1, vmax_val), vmin_val); + ret2 = _mm256_max_ps(_mm256_min_ps(inputVal2, vmax_val), vmin_val); + + intInputVal1 = _mm256_cvtps_epi32(ret1); + intInputVal2 = _mm256_cvtps_epi32(ret2); + + intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2); + intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000); + + _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 16; + } + + for(i = avx2_iters * 16; i < num_points * 2; i++) + { + aux = *inputVectorPtr++; + if(aux > max_val) + aux = max_val; + else if(aux < min_val) + aux = min_val; + *outputVectorPtr++ = (int16_t)rintf(aux); + } +} +#endif /* LV_HAVE_AVX2 */ + #ifdef LV_HAVE_SSE2 #include @@ -269,6 +321,59 @@ static inline void volk_gnsssdr_32fc_convert_16ic_a_sse(lv_16sc_t* outputVector, #endif /* LV_HAVE_SSE */ +#ifdef LV_HAVE_AVX2 +#include + +static inline void volk_gnsssdr_32fc_convert_16ic_a_avx2(lv_16sc_t* 
outputVector, const lv_32fc_t* inputVector, unsigned int num_points) +{ + const unsigned int avx2_iters = num_points / 8; + + float* inputVectorPtr = (float*)inputVector; + int16_t* outputVectorPtr = (int16_t*)outputVector; + float aux; + unsigned int i; + const float min_val = (float)SHRT_MIN; ///todo Something off here, compiler does not perform right cast + const float max_val = (float)SHRT_MAX; + + __m256 inputVal1, inputVal2; + __m256i intInputVal1, intInputVal2; + __m256 ret1, ret2; + const __m256 vmin_val = _mm256_set1_ps(min_val); + const __m256 vmax_val = _mm256_set1_ps(max_val); + + for(i = 0; i < avx2_iters; i++) + { + inputVal1 = _mm256_load_ps((float*)inputVectorPtr); inputVectorPtr += 8; + inputVal2 = _mm256_load_ps((float*)inputVectorPtr); inputVectorPtr += 8; + __VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 16); + + // Clip + ret1 = _mm256_max_ps(_mm256_min_ps(inputVal1, vmax_val), vmin_val); + ret2 = _mm256_max_ps(_mm256_min_ps(inputVal2, vmax_val), vmin_val); + + intInputVal1 = _mm256_cvtps_epi32(ret1); + intInputVal2 = _mm256_cvtps_epi32(ret2); + + intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2); + intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000); + + _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 16; + } + + for(i = avx2_iters * 16; i < num_points * 2; i++) + { + aux = *inputVectorPtr++; + if(aux > max_val) + aux = max_val; + else if(aux < min_val) + aux = min_val; + *outputVectorPtr++ = (int16_t)rintf(aux); + } +} +#endif /* LV_HAVE_AVX2 */ + + #ifdef LV_HAVE_NEON #include diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_8ic.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_8ic.h index b04b1072f..ca5f13f22 100755 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_8ic.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_8ic.h @@ -85,6 +85,146 @@ static inline void volk_gnsssdr_32fc_convert_8ic_generic(lv_8sc_t* outputVector, #endif /* LV_HAVE_GENERIC */ +#ifdef LV_HAVE_AVX2 +#include + +static inline void volk_gnsssdr_32fc_convert_8ic_u_avx2(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points) +{ + const unsigned int avx2_iters = num_points / 16; + + float* inputVectorPtr = (float*)inputVector; + int8_t* outputVectorPtr = (int8_t*)outputVector; + + const float min_val = (float)SCHAR_MIN; + const float max_val = (float)SCHAR_MAX; + float aux; + unsigned int i; + + __m256 inputVal1, inputVal2, inputVal3, inputVal4; + __m256i intInputVal1, intInputVal2, intInputVal3, intInputVal4; + __m256i int8InputVal; + __m256 ret1, ret2, ret3, ret4; + const __m256 vmin_val = _mm256_set1_ps(min_val); + const __m256 vmax_val = _mm256_set1_ps(max_val); + + for(i = 0; i < avx2_iters; i++) + { + inputVal1 = _mm256_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 8; + inputVal2 = _mm256_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 8; + inputVal3 = _mm256_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 8; + inputVal4 = _mm256_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 8; + __VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 32); + + inputVal1 = _mm256_mul_ps(inputVal1, vmax_val); + inputVal2 = _mm256_mul_ps(inputVal2, vmax_val); + inputVal3 = _mm256_mul_ps(inputVal3, vmax_val); + inputVal4 = _mm256_mul_ps(inputVal4, vmax_val); + + // Clip + ret1 = 
_mm256_max_ps(_mm256_min_ps(inputVal1, vmax_val), vmin_val); + ret2 = _mm256_max_ps(_mm256_min_ps(inputVal2, vmax_val), vmin_val); + ret3 = _mm256_max_ps(_mm256_min_ps(inputVal3, vmax_val), vmin_val); + ret4 = _mm256_max_ps(_mm256_min_ps(inputVal4, vmax_val), vmin_val); + + intInputVal1 = _mm256_cvtps_epi32(ret1); + intInputVal2 = _mm256_cvtps_epi32(ret2); + intInputVal3 = _mm256_cvtps_epi32(ret3); + intInputVal4 = _mm256_cvtps_epi32(ret4); + + intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2); + intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000); + intInputVal2 = _mm256_packs_epi32(intInputVal3, intInputVal4); + intInputVal2 = _mm256_permute4x64_epi64(intInputVal2, 0b11011000); + int8InputVal = _mm256_packs_epi16(intInputVal1, intInputVal2); + int8InputVal = _mm256_permute4x64_epi64(int8InputVal, 0b11011000); + + _mm256_storeu_si256((__m256i*)outputVectorPtr, int8InputVal); + outputVectorPtr += 32; + } + + for(i = avx2_iters * 32; i < num_points * 2; i++) + { + aux = *inputVectorPtr++ * max_val; + if(aux > max_val) + aux = max_val; + else if(aux < min_val) + aux = min_val; + *outputVectorPtr++ = (int8_t)rintf(aux); + } +} +#endif /* LV_HAVE_AVX2 */ + + +#ifdef LV_HAVE_AVX2 +#include + +static inline void volk_gnsssdr_32fc_convert_8ic_a_avx2(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points) +{ + const unsigned int avx2_iters = num_points / 16; + + float* inputVectorPtr = (float*)inputVector; + int8_t* outputVectorPtr = (int8_t*)outputVector; + + const float min_val = (float)SCHAR_MIN; + const float max_val = (float)SCHAR_MAX; + float aux; + unsigned int i; + + __m256 inputVal1, inputVal2, inputVal3, inputVal4; + __m256i intInputVal1, intInputVal2, intInputVal3, intInputVal4; + __m256i int8InputVal; + __m256 ret1, ret2, ret3, ret4; + const __m256 vmin_val = _mm256_set1_ps(min_val); + const __m256 vmax_val = _mm256_set1_ps(max_val); + + for(i = 0; i < avx2_iters; i++) + { + inputVal1 = _mm256_load_ps((float*)inputVectorPtr); inputVectorPtr += 8; + inputVal2 = _mm256_load_ps((float*)inputVectorPtr); inputVectorPtr += 8; + inputVal3 = _mm256_load_ps((float*)inputVectorPtr); inputVectorPtr += 8; + inputVal4 = _mm256_load_ps((float*)inputVectorPtr); inputVectorPtr += 8; + __VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 32); + + inputVal1 = _mm256_mul_ps(inputVal1, vmax_val); + inputVal2 = _mm256_mul_ps(inputVal2, vmax_val); + inputVal3 = _mm256_mul_ps(inputVal3, vmax_val); + inputVal4 = _mm256_mul_ps(inputVal4, vmax_val); + + // Clip + ret1 = _mm256_max_ps(_mm256_min_ps(inputVal1, vmax_val), vmin_val); + ret2 = _mm256_max_ps(_mm256_min_ps(inputVal2, vmax_val), vmin_val); + ret3 = _mm256_max_ps(_mm256_min_ps(inputVal3, vmax_val), vmin_val); + ret4 = _mm256_max_ps(_mm256_min_ps(inputVal4, vmax_val), vmin_val); + + intInputVal1 = _mm256_cvtps_epi32(ret1); + intInputVal2 = _mm256_cvtps_epi32(ret2); + intInputVal3 = _mm256_cvtps_epi32(ret3); + intInputVal4 = _mm256_cvtps_epi32(ret4); + + intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2); + intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000); + intInputVal2 = _mm256_packs_epi32(intInputVal3, intInputVal4); + intInputVal2 = _mm256_permute4x64_epi64(intInputVal2, 0b11011000); + int8InputVal = _mm256_packs_epi16(intInputVal1, intInputVal2); + int8InputVal = _mm256_permute4x64_epi64(int8InputVal, 0b11011000); + + _mm256_store_si256((__m256i*)outputVectorPtr, int8InputVal); + outputVectorPtr += 32; + } + + for(i = avx2_iters * 32; i < num_points * 2; i++) + { + aux = 
*inputVectorPtr++ * max_val; + if(aux > max_val) + aux = max_val; + else if(aux < min_val) + aux = min_val; + *outputVectorPtr++ = (int8_t)rintf(aux); + } +} +#endif /* LV_HAVE_AVX2 */ + + #ifdef LV_HAVE_SSE2 #include diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_resamplerxnpuppet_32fc.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_resamplerxnpuppet_32fc.h index c109a1be3..f89a1461b 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_resamplerxnpuppet_32fc.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_resamplerxnpuppet_32fc.h @@ -71,7 +71,7 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_generic(lv_32fc_t* r } #endif /* LV_HAVE_GENERIC */ - + #ifdef LV_HAVE_SSE3 static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_a_sse3(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points) @@ -248,6 +248,64 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_u_avx(lv_32fc_t* res #endif +#ifdef LV_HAVE_AVX2 +static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_a_avx2(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points) +{ + float code_phase_step_chips = -0.6; + int code_length_chips = 1023; + int num_out_vectors = 3; + float rem_code_phase_chips = -0.234; + unsigned int n; + float shifts_chips[3] = { -0.1, 0.0, 0.1 }; + + lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); + for(n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment()); + } + + volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx2(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); + + memcpy((lv_32fc_t*)result, (lv_32fc_t*)result_aux[0], sizeof(lv_32fc_t) * num_points); + + for(n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } + volk_gnsssdr_free(result_aux); +} +#endif + + +#ifdef LV_HAVE_AVX2 +static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_u_avx2(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points) +{ + float code_phase_step_chips = -0.6; + int code_length_chips = 1023; + int num_out_vectors = 3; + float rem_code_phase_chips = -0.234; + unsigned int n; + float shifts_chips[3] = { -0.1, 0.0, 0.1 }; + + lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); + for(n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment()); + } + + volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx2(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); + + memcpy((lv_32fc_t*)result, (lv_32fc_t*)result_aux[0], sizeof(lv_32fc_t) * num_points); + + for(n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } + volk_gnsssdr_free(result_aux); +} +#endif + + #ifdef LV_HAVE_NEON static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_neon(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points) { diff --git 
a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_xn_resampler_32fc_xn.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_xn_resampler_32fc_xn.h index 9149a0bb9..f8db65944 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_xn_resampler_32fc_xn.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_xn_resampler_32fc_xn.h @@ -160,7 +160,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_sse3(lv_32fc_t** res } } -#endif +#endif #ifdef LV_HAVE_SSE3 @@ -295,7 +295,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_sse4_1(lv_32fc_t** r } } -#endif +#endif #ifdef LV_HAVE_SSE4_1 @@ -518,6 +518,162 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx(lv_32fc_t** resu #endif +#ifdef LV_HAVE_AVX2 +#include +static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx2(lv_32fc_t** result, const lv_32fc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, float* shifts_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_points) +{ + lv_32fc_t** _result = result; + const unsigned int avx_iters = num_points / 8; + int current_correlator_tap; + unsigned int n; + unsigned int k; + const __m256 eights = _mm256_set1_ps(8.0f); + const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips); + const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips); + + __VOLK_ATTR_ALIGNED(32) int local_code_chip_index[8]; + int local_code_chip_index_; + + const __m256 zeros = _mm256_setzero_ps(); + const __m256 code_length_chips_reg_f = _mm256_set1_ps((float)code_length_chips); + const __m256 n0 = _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f); + + __m256i local_code_chip_index_reg, i; + __m256 aux, aux2, aux3, shifts_chips_reg, c, cTrunc, base, negatives, indexn; + + for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++) + { + shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]); + aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg); + indexn = n0; + for(n = 0; n < avx_iters; n++) + { + __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0); + __VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3); + aux = _mm256_fmadd_ps(code_phase_step_chips_reg, indexn, aux2); + //aux = _mm256_add_ps(aux, aux2); + // floor + aux = _mm256_floor_ps(aux); + + // fmod + c = _mm256_div_ps(aux, code_length_chips_reg_f); + //_mm_fmsub_ps(c, code_length_chips_reg_f, aux) + i = _mm256_cvttps_epi32(c); + cTrunc = _mm256_cvtepi32_ps(i); + base = _mm256_fnmadd_ps(cTrunc, code_length_chips_reg_f, aux); + local_code_chip_index_reg = _mm256_cvttps_epi32(base); + + // no negatives + c = _mm256_cvtepi32_ps(local_code_chip_index_reg); + negatives = _mm256_cmp_ps(c, zeros, 0x01 ); + aux3 = _mm256_and_ps(code_length_chips_reg_f, negatives); + aux = _mm256_add_ps(c, aux3); + local_code_chip_index_reg = _mm256_cvttps_epi32(aux); + + _mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg); + for(k = 0; k < 8; ++k) + { + _result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]]; + } + indexn = _mm256_add_ps(indexn, eights); + } + } + _mm256_zeroupper(); + for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++) + { + for(n = 
avx_iters * 8; n < num_points; n++) + { + // resample code for current tap + local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); + //Take into account that in multitap correlators, the shifts can be negative! + if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1) ; + local_code_chip_index_ = local_code_chip_index_ % code_length_chips; + _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; + } + } +} + +#endif + + +#ifdef LV_HAVE_AVX2 +#include +static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx2(lv_32fc_t** result, const lv_32fc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, float* shifts_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_points) +{ + lv_32fc_t** _result = result; + const unsigned int avx_iters = num_points / 8; + int current_correlator_tap; + unsigned int n; + unsigned int k; + const __m256 eights = _mm256_set1_ps(8.0f); + const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips); + const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips); + + __VOLK_ATTR_ALIGNED(32) int local_code_chip_index[8]; + int local_code_chip_index_; + + const __m256 zeros = _mm256_setzero_ps(); + const __m256 code_length_chips_reg_f = _mm256_set1_ps((float)code_length_chips); + const __m256 n0 = _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f); + + __m256i local_code_chip_index_reg, i; + __m256 aux, aux2, aux3, shifts_chips_reg, c, cTrunc, base, negatives, indexn; + + for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++) + { + shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]); + aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg); + indexn = n0; + for(n = 0; n < avx_iters; n++) + { + __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0); + __VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3); + aux = _mm256_fmadd_ps(code_phase_step_chips_reg, indexn, aux2); + //aux = _mm256_mul_ps(code_phase_step_chips_reg, indexn); + //aux = _mm256_add_ps(aux, aux2); + // floor + aux = _mm256_floor_ps(aux); + + // fmod + c = _mm256_div_ps(aux, code_length_chips_reg_f); + i = _mm256_cvttps_epi32(c); + cTrunc = _mm256_cvtepi32_ps(i); + base = _mm256_fnmadd_ps(cTrunc, code_length_chips_reg_f, aux); + local_code_chip_index_reg = _mm256_cvttps_epi32(base); + + // no negatives + c = _mm256_cvtepi32_ps(local_code_chip_index_reg); + negatives = _mm256_cmp_ps(c, zeros, 0x01 ); + aux3 = _mm256_and_ps(code_length_chips_reg_f, negatives); + aux = _mm256_add_ps(c, aux3); + local_code_chip_index_reg = _mm256_cvttps_epi32(aux); + + _mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg); + for(k = 0; k < 8; ++k) + { + _result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]]; + } + indexn = _mm256_add_ps(indexn, eights); + } + } + _mm256_zeroupper(); + for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++) + { + for(n = avx_iters * 8; n < num_points; n++) + { + // resample code for current tap + local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); + //Take into account that in multitap correlators, the shifts can be negative! 
+ if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1) ; + local_code_chip_index_ = local_code_chip_index_ % code_length_chips; + _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; + } + } +} + +#endif + + #ifdef LV_HAVE_NEON #include @@ -604,4 +760,3 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_neon(lv_32fc_t** resul #endif /*INCLUDED_volk_gnsssdr_32fc_xn_resampler_32fc_xn_H*/ - diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_accumulator_s8i.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_accumulator_s8i.h index 99588fca5..8c2830cdc 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_accumulator_s8i.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_accumulator_s8i.h @@ -152,6 +152,83 @@ static inline void volk_gnsssdr_8i_accumulator_s8i_a_sse3(char* result, const ch #endif /* LV_HAVE_SSE3 */ +#ifdef LV_HAVE_AVX2 +#include + +static inline void volk_gnsssdr_8i_accumulator_s8i_a_avx2(char* result, const char* inputBuffer, unsigned int num_points) +{ + char returnValue = 0; + const unsigned int sse_iters = num_points / 32; + unsigned int number; + unsigned int i; + + const char* aPtr = inputBuffer; + + __VOLK_ATTR_ALIGNED(32) char tempBuffer[32]; + __m256i accumulator = _mm256_setzero_si256(); + __m256i aVal = _mm256_setzero_si256(); + + for(number = 0; number < sse_iters; number++) + { + aVal = _mm256_load_si256((__m256i*)aPtr); + accumulator = _mm256_add_epi8(accumulator, aVal); + aPtr += 32; + } + _mm256_store_si256((__m256i*)tempBuffer,accumulator); + + for(i = 0; i < 32; ++i) + { + returnValue += tempBuffer[i]; + } + + for(i = 0; i < (num_points % 32); ++i) + { + returnValue += (*aPtr++); + } + + *result = returnValue; +} +#endif /* LV_HAVE_SSE3 */ + + +#ifdef LV_HAVE_AVX2 +#include + +static inline void volk_gnsssdr_8i_accumulator_s8i_u_avx2(char* result, const char* inputBuffer, unsigned int num_points) +{ + char returnValue = 0; + const unsigned int sse_iters = num_points / 32; + unsigned int number; + unsigned int i; + const char* aPtr = inputBuffer; + + __VOLK_ATTR_ALIGNED(32) char tempBuffer[32]; + __m256i accumulator = _mm256_setzero_si256(); + __m256i aVal = _mm256_setzero_si256(); + + for(number = 0; number < sse_iters; number++) + { + aVal = _mm256_lddqu_si256((__m256i*)aPtr); + accumulator = _mm256_add_epi8(accumulator, aVal); + aPtr += 32; + } + _mm256_storeu_si256((__m256i*)tempBuffer, accumulator); + + for(i = 0; i < 32; ++i) + { + returnValue += tempBuffer[i]; + } + + for(i = 0; i < (num_points % 32); ++i) + { + returnValue += (*aPtr++); + } + + *result = returnValue; +} +#endif /* LV_HAVE_SSE3 */ + + #ifdef LV_HAVE_ORC extern void volk_gnsssdr_8i_accumulator_s8i_a_orc_impl(short* result, const char* inputBuffer, unsigned int num_points); @@ -169,4 +246,3 @@ static inline void volk_gnsssdr_8i_accumulator_s8i_u_orc(char* result, const cha #endif /* LV_HAVE_ORC */ #endif /* INCLUDED_volk_gnsssdr_8i_accumulator_s8i_H */ - diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_index_max_16u.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_index_max_16u.h index 75ad588d2..1f053f239 100644 --- 
a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_index_max_16u.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_index_max_16u.h @@ -58,6 +58,71 @@ #include + +#ifdef LV_HAVE_AVX2 +#include + +static inline void volk_gnsssdr_8i_index_max_16u_u_avx2(unsigned int* target, const char* src0, unsigned int num_points) +{ + if(num_points > 0) + { + const unsigned int avx2_iters = num_points / 32; + unsigned int number; + unsigned int i; + char* basePtr = (char*)src0; + char* inputPtr = (char*)src0; + char max = src0[0]; + unsigned int index = 0; + unsigned int mask; + __VOLK_ATTR_ALIGNED(32) char currentValuesBuffer[32]; + __m256i maxValues, compareResults, currentValues; + + maxValues = _mm256_set1_epi8(max); + + for(number = 0; number < avx2_iters; number++) + { + currentValues = _mm256_loadu_si256((__m256i*)inputPtr); + compareResults = _mm256_cmpgt_epi8(maxValues, currentValues); + mask = _mm256_movemask_epi8(compareResults); + + if (mask != 0xFFFFFFFF) + { + _mm256_storeu_si256((__m256i*)¤tValuesBuffer, currentValues); + mask = ~mask; + i = 0; + while (mask > 0) + { + if ((mask & 1) == 1) + { + if(currentValuesBuffer[i] > max) + { + index = inputPtr - basePtr + i; + max = currentValuesBuffer[i]; + } + } + i++; + mask >>= 1; + } + maxValues = _mm256_set1_epi8(max); + } + inputPtr += 32; + } + + for(i = 0; i<(num_points % 32); ++i) + { + if(src0[i] > max) + { + index = i; + max = src0[i]; + } + } + target[0] = index; + } +} + +#endif /*LV_HAVE_AVX2*/ + + #ifdef LV_HAVE_AVX #include @@ -271,6 +336,70 @@ static inline void volk_gnsssdr_8i_index_max_16u_generic(unsigned int* target, c #endif /*LV_HAVE_GENERIC*/ +#ifdef LV_HAVE_AVX2 +#include + +static inline void volk_gnsssdr_8i_index_max_16u_a_avx2(unsigned int* target, const char* src0, unsigned int num_points) +{ + if(num_points > 0) + { + const unsigned int avx2_iters = num_points / 32; + unsigned int number; + unsigned int i; + char* basePtr = (char*)src0; + char* inputPtr = (char*)src0; + char max = src0[0]; + unsigned int index = 0; + unsigned int mask; + __VOLK_ATTR_ALIGNED(32) char currentValuesBuffer[32]; + __m256i maxValues, compareResults, currentValues; + + maxValues = _mm256_set1_epi8(max); + + for(number = 0; number < avx2_iters; number++) + { + currentValues = _mm256_load_si256((__m256i*)inputPtr); + compareResults = _mm256_cmpgt_epi8(maxValues, currentValues); + mask = _mm256_movemask_epi8(compareResults); + + if (mask != 0xFFFFFFFF) + { + _mm256_store_si256((__m256i*)¤tValuesBuffer, currentValues); + mask = ~mask; + i = 0; + while (mask > 0) + { + if ((mask & 1) == 1) + { + if(currentValuesBuffer[i] > max) + { + index = inputPtr - basePtr + i; + max = currentValuesBuffer[i]; + } + } + i++; + mask >>= 1; + } + maxValues = _mm256_set1_epi8(max); + } + inputPtr += 32; + } + + for(i = 0; i<(num_points % 32); ++i) + { + if(src0[i] > max) + { + index = i; + max = src0[i]; + } + } + target[0] = index; + } +} + +#endif /*LV_HAVE_AVX2*/ + + #ifdef LV_HAVE_AVX #include diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_max_s8i.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_max_s8i.h index 2e3bad400..109c4f779 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_max_s8i.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_max_s8i.h @@ -58,6 +58,55 @@ #include 
+#ifdef LV_HAVE_AVX2 +#include + +static inline void volk_gnsssdr_8i_max_s8i_u_avx2(char* target, const char* src0, unsigned int num_points) +{ + if(num_points > 0) + { + const unsigned int avx_iters = num_points / 32; + unsigned int number; + unsigned int i; + char* inputPtr = (char*)src0; + char max = src0[0]; + __VOLK_ATTR_ALIGNED(32) char maxValuesBuffer[32]; + __m256i maxValues, compareResults, currentValues; + + maxValues = _mm256_set1_epi8(max); + + for(number = 0; number < avx_iters; number++) + { + currentValues = _mm256_loadu_si256((__m256i*)inputPtr); + compareResults = _mm256_max_epi8(maxValues, currentValues); + maxValues = compareResults; + inputPtr += 32; + } + + _mm256_storeu_si256((__m256i*)maxValuesBuffer, maxValues); + + for(i = 0; i < 32; ++i) + { + if(maxValuesBuffer[i] > max) + { + max = maxValuesBuffer[i]; + } + } + + for(i = avx_iters * 32; i < num_points; ++i) + { + if(src0[i] > max) + { + max = src0[i]; + } + } + target[0] = max; + } +} + +#endif /*LV_HAVE_SSE4_1*/ + + #ifdef LV_HAVE_SSE4_1 #include @@ -238,6 +287,55 @@ static inline void volk_gnsssdr_8i_max_s8i_a_sse4_1(char* target, const char* sr #endif /*LV_HAVE_SSE4_1*/ +#ifdef LV_HAVE_AVX2 +#include + +static inline void volk_gnsssdr_8i_max_s8i_a_avx2(char* target, const char* src0, unsigned int num_points) +{ + if(num_points > 0) + { + const unsigned int avx_iters = num_points / 32; + unsigned int number; + unsigned int i; + char* inputPtr = (char*)src0; + char max = src0[0]; + __VOLK_ATTR_ALIGNED(32) char maxValuesBuffer[32]; + __m256i maxValues, compareResults, currentValues; + + maxValues = _mm256_set1_epi8(max); + + for(number = 0; number < avx_iters; number++) + { + currentValues = _mm256_load_si256((__m256i*)inputPtr); + compareResults = _mm256_max_epi8(maxValues, currentValues); + maxValues = compareResults; //_mm256_blendv_epi8(currentValues, maxValues, compareResults); + inputPtr += 32; + } + + _mm256_store_si256((__m256i*)maxValuesBuffer, maxValues); + + for(i = 0; i < 32; ++i) + { + if(maxValuesBuffer[i] > max) + { + max = maxValuesBuffer[i]; + } + } + + for(i = avx_iters * 32; i < num_points; ++i) + { + if(src0[i] > max) + { + max = src0[i]; + } + } + target[0] = max; + } +} + +#endif /*LV_HAVE_SSE4_1*/ + + #ifdef LV_HAVE_SSE2 #include diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_x2_add_8i.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_x2_add_8i.h index 54460a3a2..3854319fd 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_x2_add_8i.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_x2_add_8i.h @@ -94,6 +94,42 @@ static inline void volk_gnsssdr_8i_x2_add_8i_u_sse2(char* cVector, const char* a #endif /* LV_HAVE_SSE2 */ +#ifdef LV_HAVE_AVX2 +#include + +static inline void volk_gnsssdr_8i_x2_add_8i_u_avx2(char* cVector, const char* aVector, const char* bVector, unsigned int num_points) +{ + const unsigned int avx_iters = num_points / 32; + unsigned int number; + unsigned int i; + char* cPtr = cVector; + const char* aPtr = aVector; + const char* bPtr = bVector; + + __m256i aVal, bVal, cVal; + + for(number = 0; number < avx_iters; number++) + { + aVal = _mm256_loadu_si256((__m256i*)aPtr); + bVal = _mm256_loadu_si256((__m256i*)bPtr); + + cVal = _mm256_add_epi8(aVal, bVal); + + _mm256_storeu_si256((__m256i*)cPtr, cVal); // Store the results back into the C container + + aPtr += 32; + bPtr += 32; + 
cPtr += 32; + } + + for(i = avx_iters * 32; i < num_points; ++i) + { + *cPtr++ = (*aPtr++) + (*bPtr++); + } +} +#endif /* LV_HAVE_SSE2 */ + + #ifdef LV_HAVE_GENERIC static inline void volk_gnsssdr_8i_x2_add_8i_generic(char* cVector, const char* aVector, const char* bVector, unsigned int num_points) @@ -147,6 +183,42 @@ static inline void volk_gnsssdr_8i_x2_add_8i_a_sse2(char* cVector, const char* a #endif /* LV_HAVE_SSE2 */ +#ifdef LV_HAVE_AVX2 +#include + +static inline void volk_gnsssdr_8i_x2_add_8i_a_avx2(char* cVector, const char* aVector, const char* bVector, unsigned int num_points) +{ + const unsigned int avx_iters = num_points / 32; + unsigned int number; + unsigned int i; + char* cPtr = cVector; + const char* aPtr = aVector; + const char* bPtr = bVector; + + __m256i aVal, bVal, cVal; + + for(number = 0; number < avx_iters; number++) + { + aVal = _mm256_load_si256((__m256i*)aPtr); + bVal = _mm256_load_si256((__m256i*)bPtr); + + cVal = _mm256_add_epi8(aVal, bVal); + + _mm256_store_si256((__m256i*)cPtr, cVal); // Store the results back into the C container + + aPtr += 32; + bPtr += 32; + cPtr += 32; + } + + for(i = avx_iters * 32; i < num_points; ++i) + { + *cPtr++ = (*aPtr++) + (*bPtr++); + } +} +#endif /* LV_HAVE_SSE2 */ + + #ifdef LV_HAVE_ORC extern void volk_gnsssdr_8i_x2_add_8i_a_orc_impl(char* cVector, const char* aVector, const char* bVector, unsigned int num_points); diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_conjugate_8ic.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_conjugate_8ic.h index 7e89fe5d1..830128a83 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_conjugate_8ic.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_conjugate_8ic.h @@ -59,6 +59,37 @@ #include +#ifdef LV_HAVE_AVX2 +#include + +static inline void volk_gnsssdr_8ic_conjugate_8ic_u_avx2(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points) +{ + const unsigned int avx2_iters = num_points / 16; + unsigned int i; + lv_8sc_t* c = cVector; + const lv_8sc_t* a = aVector; + + __m256i tmp; + __m256i conjugator = _mm256_setr_epi8(1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1); + + for (i = 0; i < avx2_iters; ++i) + { + tmp = _mm256_loadu_si256((__m256i*)a); + tmp = _mm256_sign_epi8(tmp, conjugator); + _mm256_storeu_si256((__m256i*)c, tmp); + + a += 16; + c += 16; + } + + for (i = avx2_iters * 16; i < num_points; ++i) + { + *c++ = lv_conj(*a++); + } +} +#endif /* LV_HAVE_AVX2 */ + + #ifdef LV_HAVE_AVX #include @@ -217,6 +248,37 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_a_avx(lv_8sc_t* cVector, const #endif /* LV_HAVE_AVX */ +#ifdef LV_HAVE_AVX2 +#include + +static inline void volk_gnsssdr_8ic_conjugate_8ic_a_avx2(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points) +{ + const unsigned int avx2_iters = num_points / 16; + unsigned int i; + lv_8sc_t* c = cVector; + const lv_8sc_t* a = aVector; + + __m256i tmp; + __m256i conjugator = _mm256_setr_epi8(1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1); + + for (i = 0; i < avx2_iters; ++i) + { + tmp = _mm256_load_si256((__m256i*)a); + tmp = _mm256_sign_epi8(tmp, conjugator); + _mm256_store_si256((__m256i*)c, tmp); + + a += 16; + c += 16; + } + + for (i = avx2_iters * 16; i < num_points; ++i) + { + 
*c++ = lv_conj(*a++); + } +} +#endif /* LV_HAVE_AVX2 */ + + #ifdef LV_HAVE_SSSE3 #include diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8u_x2_multiply_8u.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8u_x2_multiply_8u.h index df7a0cd7d..8457b7f14 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8u_x2_multiply_8u.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8u_x2_multiply_8u.h @@ -58,6 +58,56 @@ #ifndef INCLUDED_volk_gnsssdr_8u_x2_multiply_8u_H #define INCLUDED_volk_gnsssdr_8u_x2_multiply_8u_H +#ifdef LV_HAVE_AVX2 +#include + +static inline void volk_gnsssdr_8u_x2_multiply_8u_u_avx2(unsigned char* cChar, const unsigned char* aChar, const unsigned char* bChar, unsigned int num_points) +{ + const unsigned int avx2_iters = num_points / 32; + unsigned int number; + unsigned int i; + + __m256i x, y, x1, x2, y1, y2, mult1, x1_mult_y1, x2_mult_y2, tmp, tmp1, tmp2, totalc; + unsigned char* c = cChar; + const unsigned char* a = aChar; + const unsigned char* b = bChar; + + for(number = 0; number < avx2_iters; number++) + { + x = _mm256_loadu_si256((__m256i*)a); + y = _mm256_loadu_si256((__m256i*)b); + + mult1 = _mm256_set_epi8(0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF); + x1 = _mm256_srli_si256(x, 1); + x1 = _mm256_and_si256(x1, mult1); + x2 = _mm256_and_si256(x, mult1); + + y1 = _mm256_srli_si256(y, 1); + y1 = _mm256_and_si256(y1, mult1); + y2 = _mm256_and_si256(y, mult1); + + x1_mult_y1 = _mm256_mullo_epi16(x1, y1); + x2_mult_y2 = _mm256_mullo_epi16(x2, y2); + + tmp = _mm256_and_si256(x1_mult_y1, mult1); + tmp1 = _mm256_slli_si256(tmp, 1); + tmp2 = _mm256_and_si256(x2_mult_y2, mult1); + totalc = _mm256_or_si256(tmp1, tmp2); + + _mm256_storeu_si256((__m256i*)c, totalc); + + a += 32; + b += 32; + c += 32; + } + + for (i = avx2_iters * 32; i < num_points ; ++i) + { + *c++ = (*a++) * (*b++); + } +} +#endif /* LV_HAVE_SSE3 */ + #ifdef LV_HAVE_SSE3 #include @@ -176,6 +226,57 @@ static inline void volk_gnsssdr_8u_x2_multiply_8u_a_sse3(unsigned char* cChar, c #endif /* LV_HAVE_SSE */ +#ifdef LV_HAVE_AVX2 +#include + +static inline void volk_gnsssdr_8u_x2_multiply_8u_a_avx2(unsigned char* cChar, const unsigned char* aChar, const unsigned char* bChar, unsigned int num_points) +{ + const unsigned int avx2_iters = num_points / 32; + unsigned int number; + unsigned int i; + + __m256i x, y, x1, x2, y1, y2, mult1, x1_mult_y1, x2_mult_y2, tmp, tmp1, tmp2, totalc; + unsigned char* c = cChar; + const unsigned char* a = aChar; + const unsigned char* b = bChar; + + for(number = 0; number < avx2_iters; number++) + { + x = _mm256_load_si256((__m256i*)a); + y = _mm256_load_si256((__m256i*)b); + + mult1 = _mm256_set_epi8(0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF); + x1 = _mm256_srli_si256(x, 1); + x1 = _mm256_and_si256(x1, mult1); + x2 = _mm256_and_si256(x, mult1); + + y1 = _mm256_srli_si256(y, 1); + y1 = _mm256_and_si256(y1, mult1); + y2 = _mm256_and_si256(y, mult1); + + x1_mult_y1 = _mm256_mullo_epi16(x1, y1); + x2_mult_y2 = _mm256_mullo_epi16(x2, y2); + + tmp = _mm256_and_si256(x1_mult_y1, mult1); + tmp1 = _mm256_slli_si256(tmp, 1); + tmp2 = _mm256_and_si256(x2_mult_y2, mult1); + totalc = _mm256_or_si256(tmp1, tmp2); + + 
_mm256_store_si256((__m256i*)c, totalc); + + a += 32; + b += 32; + c += 32; + } + + for (i = avx2_iters * 32; i < num_points; ++i) + { + *c++ = (*a++) * (*b++); + } +} +#endif /* LV_HAVE_AVX2 */ + + #ifdef LV_HAVE_ORC extern void volk_gnsssdr_8u_x2_multiply_8u_a_orc_impl(unsigned char* cVector, const unsigned char* aVector, const unsigned char* bVector, unsigned int num_points);
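
Reference note (not part of the patch): a minimal scalar sketch of the even/odd 16-bit-lane trick that both volk_gnsssdr_8u_x2_multiply_8u_*_avx2 kernels above vectorize. AVX2 has no 8-bit multiply, so the kernels widen the bytes into 16-bit lanes, multiply with _mm256_mullo_epi16, and keep only the low byte of each product; the helper name multiply_8u_reference below is hypothetical and only for illustration.

#include <stdint.h>
#include <stddef.h>

/* Scalar equivalent of the AVX2 byte multiply: widen each byte to a 16-bit
   lane, multiply, and keep the low 8 bits of the product, which is what the
   mult1 mask plus shift/or recombination above does 32 bytes at a time. */
static void multiply_8u_reference(unsigned char* c, const unsigned char* a,
                                  const unsigned char* b, size_t n)
{
    size_t i;
    for (i = 0; i < n; i++)
        {
            uint16_t prod = (uint16_t)a[i] * (uint16_t)b[i];
            c[i] = (unsigned char)(prod & 0xFF);
        }
}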