avx2: Adds support for AVX2 intrinsic to some kernels

Adds AVX2 intrinsic to several kernels ranging from multiply and add to more detailed operations such as convert. New peotokernels also may take advantage of the fused multiply add (fma)
2025-05-05 00:44:11 +00:00 · 2017-08-08 08:19:42 -07:00 · 2017-08-08 08:19:42 -07:00 · f4c221609c
commit f4c221609c
parent 6e65705b47
10 changed files with 1001 additions and 5 deletions
--- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_16ic.h
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_16ic.h
@ -164,6 +164,58 @@ static inline void volk_gnsssdr_32fc_convert_16ic_u_sse(lv_16sc_t* outputVector,
 }
 #endif /* LV_HAVE_SSE */

+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void volk_gnsssdr_32fc_convert_16ic_u_avx2(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
+{
+    const unsigned int avx2_iters = num_points / 8;
+
+    float* inputVectorPtr = (float*)inputVector;
+    int16_t* outputVectorPtr = (int16_t*)outputVector;
+    float aux;
+    unsigned int i;
+    const float min_val = (float)SHRT_MIN; ///todo Something off here, compiler does not perform right cast
+    const float max_val = (float)SHRT_MAX;
+
+    __m256 inputVal1, inputVal2;
+    __m256i intInputVal1, intInputVal2;
+    __m256 ret1, ret2;
+    const __m256 vmin_val = _mm256_set1_ps(min_val);
+    const __m256 vmax_val = _mm256_set1_ps(max_val);
+
+    for(i = 0; i < avx2_iters; i++)
+        {
+            inputVal1 = _mm256_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 8;
+            inputVal2 = _mm256_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 8;
+            __VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 16);
+
+            // Clip
+            ret1 = _mm256_max_ps(_mm256_min_ps(inputVal1, vmax_val), vmin_val);
+            ret2 = _mm256_max_ps(_mm256_min_ps(inputVal2, vmax_val), vmin_val);
+
+            intInputVal1 = _mm256_cvtps_epi32(ret1);
+            intInputVal2 = _mm256_cvtps_epi32(ret2);
+
+            intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
+            intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
+
+            _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal1);
+            outputVectorPtr += 16;
+        }
+
+    for(i = avx2_iters * 16; i < num_points * 2; i++)
+        {
+            aux = *inputVectorPtr++;
+            if(aux > max_val)
+                aux = max_val;
+            else if(aux < min_val)
+                aux = min_val;
+            *outputVectorPtr++ = (int16_t)rintf(aux);
+        }
+}
+#endif /* LV_HAVE_AVX2 */
+

 #ifdef LV_HAVE_SSE2
 #include <emmintrin.h>
@ -269,6 +321,59 @@ static inline void volk_gnsssdr_32fc_convert_16ic_a_sse(lv_16sc_t* outputVector,
 #endif /* LV_HAVE_SSE */


+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void volk_gnsssdr_32fc_convert_16ic_a_avx2(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
+{
+    const unsigned int avx2_iters = num_points / 8;
+
+    float* inputVectorPtr = (float*)inputVector;
+    int16_t* outputVectorPtr = (int16_t*)outputVector;
+    float aux;
+    unsigned int i;
+    const float min_val = (float)SHRT_MIN; ///todo Something off here, compiler does not perform right cast
+    const float max_val = (float)SHRT_MAX;
+
+    __m256 inputVal1, inputVal2;
+    __m256i intInputVal1, intInputVal2;
+    __m256 ret1, ret2;
+    const __m256 vmin_val = _mm256_set1_ps(min_val);
+    const __m256 vmax_val = _mm256_set1_ps(max_val);
+
+    for(i = 0; i < avx2_iters; i++)
+        {
+            inputVal1 = _mm256_load_ps((float*)inputVectorPtr); inputVectorPtr += 8;
+            inputVal2 = _mm256_load_ps((float*)inputVectorPtr); inputVectorPtr += 8;
+            __VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 16);
+
+            // Clip
+            ret1 = _mm256_max_ps(_mm256_min_ps(inputVal1, vmax_val), vmin_val);
+            ret2 = _mm256_max_ps(_mm256_min_ps(inputVal2, vmax_val), vmin_val);
+
+            intInputVal1 = _mm256_cvtps_epi32(ret1);
+            intInputVal2 = _mm256_cvtps_epi32(ret2);
+
+            intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
+            intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
+
+            _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1);
+            outputVectorPtr += 16;
+        }
+
+    for(i = avx2_iters * 16; i < num_points * 2; i++)
+        {
+            aux = *inputVectorPtr++;
+            if(aux > max_val)
+                aux = max_val;
+            else if(aux < min_val)
+                aux = min_val;
+            *outputVectorPtr++ = (int16_t)rintf(aux);
+        }
+}
+#endif /* LV_HAVE_AVX2 */
+
+
 #ifdef LV_HAVE_NEON
 #include <arm_neon.h>

--- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_8ic.h
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_8ic.h
@ -85,6 +85,146 @@ static inline void volk_gnsssdr_32fc_convert_8ic_generic(lv_8sc_t* outputVector,
 #endif /* LV_HAVE_GENERIC */


+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void volk_gnsssdr_32fc_convert_8ic_u_avx2(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
+{
+    const unsigned int avx2_iters = num_points / 16;
+
+    float* inputVectorPtr = (float*)inputVector;
+    int8_t* outputVectorPtr = (int8_t*)outputVector;
+
+    const float min_val = (float)SCHAR_MIN;
+    const float max_val = (float)SCHAR_MAX;
+    float aux;
+    unsigned int i;
+
+    __m256 inputVal1, inputVal2, inputVal3, inputVal4;
+    __m256i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
+    __m256i int8InputVal;
+    __m256 ret1, ret2, ret3, ret4;
+    const __m256 vmin_val = _mm256_set1_ps(min_val);
+    const __m256 vmax_val = _mm256_set1_ps(max_val);
+
+    for(i = 0; i < avx2_iters; i++)
+        {
+            inputVal1 = _mm256_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 8;
+            inputVal2 = _mm256_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 8;
+            inputVal3 = _mm256_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 8;
+            inputVal4 = _mm256_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 8;
+            __VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 32);
+
+            inputVal1 = _mm256_mul_ps(inputVal1, vmax_val);
+            inputVal2 = _mm256_mul_ps(inputVal2, vmax_val);
+            inputVal3 = _mm256_mul_ps(inputVal3, vmax_val);
+            inputVal4 = _mm256_mul_ps(inputVal4, vmax_val);
+
+            // Clip
+            ret1 = _mm256_max_ps(_mm256_min_ps(inputVal1, vmax_val), vmin_val);
+            ret2 = _mm256_max_ps(_mm256_min_ps(inputVal2, vmax_val), vmin_val);
+            ret3 = _mm256_max_ps(_mm256_min_ps(inputVal3, vmax_val), vmin_val);
+            ret4 = _mm256_max_ps(_mm256_min_ps(inputVal4, vmax_val), vmin_val);
+
+            intInputVal1 = _mm256_cvtps_epi32(ret1);
+            intInputVal2 = _mm256_cvtps_epi32(ret2);
+            intInputVal3 = _mm256_cvtps_epi32(ret3);
+            intInputVal4 = _mm256_cvtps_epi32(ret4);
+
+            intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
+            intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
+            intInputVal2 = _mm256_packs_epi32(intInputVal3, intInputVal4);
+            intInputVal2 = _mm256_permute4x64_epi64(intInputVal2, 0b11011000);
+            int8InputVal = _mm256_packs_epi16(intInputVal1, intInputVal2);
+            int8InputVal = _mm256_permute4x64_epi64(int8InputVal, 0b11011000);
+
+            _mm256_storeu_si256((__m256i*)outputVectorPtr, int8InputVal);
+            outputVectorPtr += 32;
+        }
+
+    for(i = avx2_iters * 32; i < num_points * 2; i++)
+        {
+            aux = *inputVectorPtr++ * max_val;
+            if(aux > max_val)
+                aux = max_val;
+            else if(aux < min_val)
+                aux = min_val;
+            *outputVectorPtr++ = (int8_t)rintf(aux);
+        }
+}
+#endif /* LV_HAVE_AVX2 */
+
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void volk_gnsssdr_32fc_convert_8ic_a_avx2(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
+{
+    const unsigned int avx2_iters = num_points / 16;
+
+    float* inputVectorPtr = (float*)inputVector;
+    int8_t* outputVectorPtr = (int8_t*)outputVector;
+
+    const float min_val = (float)SCHAR_MIN;
+    const float max_val = (float)SCHAR_MAX;
+    float aux;
+    unsigned int i;
+
+    __m256 inputVal1, inputVal2, inputVal3, inputVal4;
+    __m256i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
+    __m256i int8InputVal;
+    __m256 ret1, ret2, ret3, ret4;
+    const __m256 vmin_val = _mm256_set1_ps(min_val);
+    const __m256 vmax_val = _mm256_set1_ps(max_val);
+
+    for(i = 0; i < avx2_iters; i++)
+        {
+            inputVal1 = _mm256_load_ps((float*)inputVectorPtr); inputVectorPtr += 8;
+            inputVal2 = _mm256_load_ps((float*)inputVectorPtr); inputVectorPtr += 8;
+            inputVal3 = _mm256_load_ps((float*)inputVectorPtr); inputVectorPtr += 8;
+            inputVal4 = _mm256_load_ps((float*)inputVectorPtr); inputVectorPtr += 8;
+            __VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 32);
+
+            inputVal1 = _mm256_mul_ps(inputVal1, vmax_val);
+            inputVal2 = _mm256_mul_ps(inputVal2, vmax_val);
+            inputVal3 = _mm256_mul_ps(inputVal3, vmax_val);
+            inputVal4 = _mm256_mul_ps(inputVal4, vmax_val);
+
+            // Clip
+            ret1 = _mm256_max_ps(_mm256_min_ps(inputVal1, vmax_val), vmin_val);
+            ret2 = _mm256_max_ps(_mm256_min_ps(inputVal2, vmax_val), vmin_val);
+            ret3 = _mm256_max_ps(_mm256_min_ps(inputVal3, vmax_val), vmin_val);
+            ret4 = _mm256_max_ps(_mm256_min_ps(inputVal4, vmax_val), vmin_val);
+
+            intInputVal1 = _mm256_cvtps_epi32(ret1);
+            intInputVal2 = _mm256_cvtps_epi32(ret2);
+            intInputVal3 = _mm256_cvtps_epi32(ret3);
+            intInputVal4 = _mm256_cvtps_epi32(ret4);
+
+            intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
+            intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
+            intInputVal2 = _mm256_packs_epi32(intInputVal3, intInputVal4);
+            intInputVal2 = _mm256_permute4x64_epi64(intInputVal2, 0b11011000);
+            int8InputVal = _mm256_packs_epi16(intInputVal1, intInputVal2);
+            int8InputVal = _mm256_permute4x64_epi64(int8InputVal, 0b11011000);
+
+            _mm256_store_si256((__m256i*)outputVectorPtr, int8InputVal);
+            outputVectorPtr += 32;
+        }
+
+    for(i = avx2_iters * 32; i < num_points * 2; i++)
+        {
+            aux = *inputVectorPtr++ * max_val;
+            if(aux > max_val)
+                aux = max_val;
+            else if(aux < min_val)
+                aux = min_val;
+            *outputVectorPtr++ = (int8_t)rintf(aux);
+        }
+}
+#endif /* LV_HAVE_AVX2 */
+
+
 #ifdef LV_HAVE_SSE2
 #include <emmintrin.h>

--- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_resamplerxnpuppet_32fc.h
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_resamplerxnpuppet_32fc.h
@ -71,7 +71,7 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_generic(lv_32fc_t* r
 }

 #endif /* LV_HAVE_GENERIC */
- 
+

 #ifdef LV_HAVE_SSE3
 static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_a_sse3(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points)
@ -248,6 +248,64 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_u_avx(lv_32fc_t* res
 #endif


+#ifdef LV_HAVE_AVX2
+static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_a_avx2(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points)
+{
+    float code_phase_step_chips = -0.6;
+    int code_length_chips = 1023;
+    int num_out_vectors = 3;
+    float rem_code_phase_chips = -0.234;
+    unsigned int n;
+    float shifts_chips[3] = { -0.1, 0.0, 0.1 };
+
+    lv_32fc_t** result_aux =  (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
+    for(n = 0; n < num_out_vectors; n++)
+    {
+       result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
+    }
+
+    volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx2(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
+
+    memcpy((lv_32fc_t*)result, (lv_32fc_t*)result_aux[0], sizeof(lv_32fc_t) * num_points);
+
+    for(n = 0; n < num_out_vectors; n++)
+    {
+        volk_gnsssdr_free(result_aux[n]);
+    }
+    volk_gnsssdr_free(result_aux);
+}
+#endif
+
+
+#ifdef LV_HAVE_AVX2
+static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_u_avx2(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points)
+{
+    float code_phase_step_chips = -0.6;
+    int code_length_chips = 1023;
+    int num_out_vectors = 3;
+    float rem_code_phase_chips = -0.234;
+    unsigned int n;
+    float shifts_chips[3] = { -0.1, 0.0, 0.1 };
+
+    lv_32fc_t** result_aux =  (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
+    for(n = 0; n < num_out_vectors; n++)
+    {
+       result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
+    }
+
+    volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx2(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
+
+    memcpy((lv_32fc_t*)result, (lv_32fc_t*)result_aux[0], sizeof(lv_32fc_t) * num_points);
+
+    for(n = 0; n < num_out_vectors; n++)
+    {
+        volk_gnsssdr_free(result_aux[n]);
+    }
+    volk_gnsssdr_free(result_aux);
+}
+#endif
+
+
 #ifdef LV_HAVE_NEON
 static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_neon(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points)
 {
--- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_xn_resampler_32fc_xn.h
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_xn_resampler_32fc_xn.h
@ -160,7 +160,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_sse3(lv_32fc_t** res
        }
 }

-#endif 
+#endif


 #ifdef LV_HAVE_SSE3
@ -295,7 +295,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_sse4_1(lv_32fc_t** r
        }
 }

-#endif 
+#endif


 #ifdef LV_HAVE_SSE4_1
@ -518,6 +518,162 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx(lv_32fc_t** resu
 #endif


+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx2(lv_32fc_t** result, const lv_32fc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, float* shifts_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_points)
+{
+    lv_32fc_t** _result = result;
+    const unsigned int avx_iters = num_points / 8;
+    int current_correlator_tap;
+    unsigned int n;
+    unsigned int k;
+    const __m256 eights = _mm256_set1_ps(8.0f);
+    const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips);
+    const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips);
+
+    __VOLK_ATTR_ALIGNED(32) int local_code_chip_index[8];
+    int local_code_chip_index_;
+
+    const __m256 zeros = _mm256_setzero_ps();
+    const __m256 code_length_chips_reg_f = _mm256_set1_ps((float)code_length_chips);
+    const __m256 n0 = _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f);
+
+    __m256i local_code_chip_index_reg, i;
+    __m256 aux, aux2, aux3, shifts_chips_reg, c, cTrunc, base, negatives, indexn;
+
+    for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
+        {
+            shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]);
+            aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
+            indexn = n0;
+            for(n = 0; n < avx_iters; n++)
+                {
+                    __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0);
+                    __VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3);
+                    aux = _mm256_fmadd_ps(code_phase_step_chips_reg, indexn, aux2);
+                    //aux = _mm256_add_ps(aux, aux2);
+                    // floor
+                    aux = _mm256_floor_ps(aux);
+
+                    // fmod
+                    c = _mm256_div_ps(aux, code_length_chips_reg_f);
+                    //_mm_fmsub_ps(c, code_length_chips_reg_f, aux)
+                    i = _mm256_cvttps_epi32(c);
+                    cTrunc = _mm256_cvtepi32_ps(i);
+                    base = _mm256_fnmadd_ps(cTrunc, code_length_chips_reg_f, aux);
+                    local_code_chip_index_reg = _mm256_cvttps_epi32(base);
+
+                    // no negatives
+                    c = _mm256_cvtepi32_ps(local_code_chip_index_reg);
+                    negatives = _mm256_cmp_ps(c, zeros, 0x01 );
+                    aux3 = _mm256_and_ps(code_length_chips_reg_f, negatives);
+                    aux = _mm256_add_ps(c, aux3);
+                    local_code_chip_index_reg = _mm256_cvttps_epi32(aux);
+
+                    _mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg);
+                    for(k = 0; k < 8; ++k)
+                        {
+                            _result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]];
+                        }
+                    indexn = _mm256_add_ps(indexn, eights);
+                }
+        }
+    _mm256_zeroupper();
+    for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
+        {
+            for(n = avx_iters * 8; n < num_points; n++)
+                {
+                    // resample code for current tap
+                    local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
+                    //Take into account that in multitap correlators, the shifts can be negative!
+                    if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1) ;
+                    local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
+                    _result[current_correlator_tap][n] = local_code[local_code_chip_index_];
+                }
+        }
+}
+
+#endif
+
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx2(lv_32fc_t** result, const lv_32fc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, float* shifts_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_points)
+{
+    lv_32fc_t** _result = result;
+    const unsigned int avx_iters = num_points / 8;
+    int current_correlator_tap;
+    unsigned int n;
+    unsigned int k;
+    const __m256 eights = _mm256_set1_ps(8.0f);
+    const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips);
+    const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips);
+
+    __VOLK_ATTR_ALIGNED(32) int local_code_chip_index[8];
+    int local_code_chip_index_;
+
+    const __m256 zeros = _mm256_setzero_ps();
+    const __m256 code_length_chips_reg_f = _mm256_set1_ps((float)code_length_chips);
+    const __m256 n0 = _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f);
+
+    __m256i local_code_chip_index_reg, i;
+    __m256 aux, aux2, aux3, shifts_chips_reg, c, cTrunc, base, negatives, indexn;
+
+    for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
+        {
+            shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]);
+            aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
+            indexn = n0;
+            for(n = 0; n < avx_iters; n++)
+                {
+                    __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0);
+                    __VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3);
+                    aux = _mm256_fmadd_ps(code_phase_step_chips_reg, indexn, aux2);
+                    //aux = _mm256_mul_ps(code_phase_step_chips_reg, indexn);
+                    //aux = _mm256_add_ps(aux, aux2);
+                    // floor
+                    aux = _mm256_floor_ps(aux);
+
+                    // fmod
+                    c = _mm256_div_ps(aux, code_length_chips_reg_f);
+                    i = _mm256_cvttps_epi32(c);
+                    cTrunc = _mm256_cvtepi32_ps(i);
+                    base = _mm256_fnmadd_ps(cTrunc, code_length_chips_reg_f, aux);
+                    local_code_chip_index_reg = _mm256_cvttps_epi32(base);
+
+                    // no negatives
+                    c = _mm256_cvtepi32_ps(local_code_chip_index_reg);
+                    negatives = _mm256_cmp_ps(c, zeros, 0x01 );
+                    aux3 = _mm256_and_ps(code_length_chips_reg_f, negatives);
+                    aux = _mm256_add_ps(c, aux3);
+                    local_code_chip_index_reg = _mm256_cvttps_epi32(aux);
+
+                    _mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg);
+                    for(k = 0; k < 8; ++k)
+                        {
+                            _result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]];
+                        }
+                    indexn = _mm256_add_ps(indexn, eights);
+                }
+        }
+    _mm256_zeroupper();
+    for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
+        {
+            for(n = avx_iters * 8; n < num_points; n++)
+                {
+                    // resample code for current tap
+                    local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
+                    //Take into account that in multitap correlators, the shifts can be negative!
+                    if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1) ;
+                    local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
+                    _result[current_correlator_tap][n] = local_code[local_code_chip_index_];
+                }
+        }
+}
+
+#endif
+
+
 #ifdef LV_HAVE_NEON
 #include <arm_neon.h>

@ -604,4 +760,3 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_neon(lv_32fc_t** resul


 #endif /*INCLUDED_volk_gnsssdr_32fc_xn_resampler_32fc_xn_H*/
-
--- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_accumulator_s8i.h
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_accumulator_s8i.h
@ -152,6 +152,83 @@ static inline void volk_gnsssdr_8i_accumulator_s8i_a_sse3(char* result, const ch
 #endif /* LV_HAVE_SSE3 */


+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void volk_gnsssdr_8i_accumulator_s8i_a_avx2(char* result, const char* inputBuffer, unsigned int num_points)
+{
+    char returnValue = 0;
+    const unsigned int sse_iters = num_points / 32;
+    unsigned int number;
+    unsigned int i;
+
+    const char* aPtr = inputBuffer;
+
+    __VOLK_ATTR_ALIGNED(32) char tempBuffer[32];
+    __m256i accumulator = _mm256_setzero_si256();
+    __m256i aVal = _mm256_setzero_si256();
+
+    for(number = 0; number < sse_iters; number++)
+        {
+            aVal = _mm256_load_si256((__m256i*)aPtr);
+            accumulator = _mm256_add_epi8(accumulator, aVal);
+            aPtr += 32;
+        }
+    _mm256_store_si256((__m256i*)tempBuffer,accumulator);
+
+    for(i = 0; i < 32; ++i)
+        {
+            returnValue += tempBuffer[i];
+        }
+
+    for(i = 0; i < (num_points % 32); ++i)
+        {
+            returnValue += (*aPtr++);
+        }
+
+    *result = returnValue;
+}
+#endif /* LV_HAVE_SSE3 */
+
+
+#ifdef LV_HAVE_AVX2
+#include <pmmintrin.h>
+
+static inline void volk_gnsssdr_8i_accumulator_s8i_u_avx2(char* result, const char* inputBuffer, unsigned int num_points)
+{
+    char returnValue = 0;
+    const unsigned int sse_iters = num_points / 32;
+    unsigned int number;
+    unsigned int i;
+    const char* aPtr = inputBuffer;
+
+    __VOLK_ATTR_ALIGNED(32) char tempBuffer[32];
+    __m256i accumulator = _mm256_setzero_si256();
+    __m256i aVal = _mm256_setzero_si256();
+
+    for(number = 0; number < sse_iters; number++)
+        {
+            aVal = _mm256_lddqu_si256((__m256i*)aPtr);
+            accumulator = _mm256_add_epi8(accumulator, aVal);
+            aPtr += 32;
+        }
+    _mm256_storeu_si256((__m256i*)tempBuffer, accumulator);
+
+    for(i = 0; i < 32; ++i)
+        {
+            returnValue += tempBuffer[i];
+        }
+
+    for(i = 0; i < (num_points % 32); ++i)
+        {
+            returnValue += (*aPtr++);
+        }
+
+    *result = returnValue;
+}
+#endif /* LV_HAVE_SSE3 */
+
+
 #ifdef LV_HAVE_ORC

 extern void volk_gnsssdr_8i_accumulator_s8i_a_orc_impl(short* result, const char* inputBuffer, unsigned int num_points);
@ -169,4 +246,3 @@ static inline void volk_gnsssdr_8i_accumulator_s8i_u_orc(char* result, const cha
 #endif /* LV_HAVE_ORC */

 #endif /* INCLUDED_volk_gnsssdr_8i_accumulator_s8i_H */
-
--- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_index_max_16u.h
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_index_max_16u.h
@ -58,6 +58,71 @@

 #include <volk_gnsssdr/volk_gnsssdr_common.h>

+
+#ifdef LV_HAVE_AVX2
+#include<immintrin.h>
+
+static inline void volk_gnsssdr_8i_index_max_16u_u_avx2(unsigned int* target, const char* src0, unsigned int num_points)
+{
+    if(num_points > 0)
+        {
+            const unsigned int avx2_iters = num_points / 32;
+            unsigned int number;
+            unsigned int i;
+            char* basePtr = (char*)src0;
+            char* inputPtr = (char*)src0;
+            char max = src0[0];
+            unsigned int index = 0;
+            unsigned int mask;
+            __VOLK_ATTR_ALIGNED(32) char currentValuesBuffer[32];
+            __m256i maxValues, compareResults, currentValues;
+
+            maxValues = _mm256_set1_epi8(max);
+
+            for(number = 0; number < avx2_iters; number++)
+                {
+                    currentValues  = _mm256_loadu_si256((__m256i*)inputPtr);
+                    compareResults = _mm256_cmpgt_epi8(maxValues, currentValues);
+                    mask = _mm256_movemask_epi8(compareResults);
+
+                    if (mask != 0xFFFFFFFF)
+                        {
+                            _mm256_storeu_si256((__m256i*)&currentValuesBuffer, currentValues);
+                            mask = ~mask;
+                            i = 0;
+                            while (mask > 0)
+                                {
+                                    if ((mask & 1) == 1)
+                                        {
+                                            if(currentValuesBuffer[i] > max)
+                                                {
+                                                    index = inputPtr - basePtr + i;
+                                                    max = currentValuesBuffer[i];
+                                                }
+                                        }
+                                    i++;
+                                    mask >>= 1;
+                                }
+                            maxValues = _mm256_set1_epi8(max);
+                        }
+                    inputPtr += 32;
+                }
+
+            for(i = 0; i<(num_points % 32); ++i)
+                {
+                    if(src0[i] > max)
+                        {
+                            index = i;
+                            max = src0[i];
+                        }
+                }
+            target[0] = index;
+        }
+}
+
+#endif /*LV_HAVE_AVX2*/
+
+
 #ifdef LV_HAVE_AVX
 #include <immintrin.h>

@ -271,6 +336,70 @@ static inline void volk_gnsssdr_8i_index_max_16u_generic(unsigned int* target, c
 #endif /*LV_HAVE_GENERIC*/


+#ifdef LV_HAVE_AVX2
+#include<immintrin.h>
+
+static inline void volk_gnsssdr_8i_index_max_16u_a_avx2(unsigned int* target, const char* src0, unsigned int num_points)
+{
+    if(num_points > 0)
+        {
+            const unsigned int avx2_iters = num_points / 32;
+            unsigned int number;
+            unsigned int i;
+            char* basePtr = (char*)src0;
+            char* inputPtr = (char*)src0;
+            char max = src0[0];
+            unsigned int index = 0;
+            unsigned int mask;
+            __VOLK_ATTR_ALIGNED(32) char currentValuesBuffer[32];
+            __m256i maxValues, compareResults, currentValues;
+
+            maxValues = _mm256_set1_epi8(max);
+
+            for(number = 0; number < avx2_iters; number++)
+                {
+                    currentValues  = _mm256_load_si256((__m256i*)inputPtr);
+                    compareResults = _mm256_cmpgt_epi8(maxValues, currentValues);
+                    mask = _mm256_movemask_epi8(compareResults);
+
+                    if (mask != 0xFFFFFFFF)
+                        {
+                            _mm256_store_si256((__m256i*)&currentValuesBuffer, currentValues);
+                            mask = ~mask;
+                            i = 0;
+                            while (mask > 0)
+                                {
+                                    if ((mask & 1) == 1)
+                                        {
+                                            if(currentValuesBuffer[i] > max)
+                                                {
+                                                    index = inputPtr - basePtr + i;
+                                                    max = currentValuesBuffer[i];
+                                                }
+                                        }
+                                    i++;
+                                    mask >>= 1;
+                                }
+                            maxValues = _mm256_set1_epi8(max);
+                        }
+                    inputPtr += 32;
+                }
+
+            for(i = 0; i<(num_points % 32); ++i)
+                {
+                    if(src0[i] > max)
+                        {
+                            index = i;
+                            max = src0[i];
+                        }
+                }
+            target[0] = index;
+        }
+}
+
+#endif /*LV_HAVE_AVX2*/
+
+
 #ifdef LV_HAVE_AVX
 #include <immintrin.h>

--- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_max_s8i.h
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_max_s8i.h
@ -58,6 +58,55 @@

 #include <volk_gnsssdr/volk_gnsssdr_common.h>

+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void volk_gnsssdr_8i_max_s8i_u_avx2(char* target, const char* src0, unsigned int num_points)
+{
+    if(num_points > 0)
+        {
+            const unsigned int avx_iters = num_points / 32;
+            unsigned int number;
+            unsigned int i;
+            char* inputPtr = (char*)src0;
+            char max = src0[0];
+            __VOLK_ATTR_ALIGNED(32) char maxValuesBuffer[32];
+            __m256i maxValues, compareResults, currentValues;
+
+            maxValues = _mm256_set1_epi8(max);
+
+            for(number = 0; number < avx_iters; number++)
+                {
+                    currentValues  = _mm256_loadu_si256((__m256i*)inputPtr);
+                    compareResults = _mm256_max_epi8(maxValues, currentValues);
+                    maxValues = compareResults;
+                    inputPtr += 32;
+                }
+
+            _mm256_storeu_si256((__m256i*)maxValuesBuffer, maxValues);
+
+            for(i = 0; i < 32; ++i)
+                {
+                    if(maxValuesBuffer[i] > max)
+                        {
+                            max = maxValuesBuffer[i];
+                        }
+                }
+
+            for(i = avx_iters * 32; i < num_points; ++i)
+                {
+                    if(src0[i] > max)
+                        {
+                            max = src0[i];
+                        }
+                }
+            target[0] = max;
+        }
+}
+
+#endif /*LV_HAVE_SSE4_1*/
+
+
 #ifdef LV_HAVE_SSE4_1
 #include <smmintrin.h>

@ -238,6 +287,55 @@ static inline void volk_gnsssdr_8i_max_s8i_a_sse4_1(char* target, const char* sr
 #endif /*LV_HAVE_SSE4_1*/


+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void volk_gnsssdr_8i_max_s8i_a_avx2(char* target, const char* src0, unsigned int num_points)
+{
+    if(num_points > 0)
+        {
+            const unsigned int avx_iters = num_points / 32;
+            unsigned int number;
+            unsigned int i;
+            char* inputPtr = (char*)src0;
+            char max = src0[0];
+            __VOLK_ATTR_ALIGNED(32) char maxValuesBuffer[32];
+            __m256i maxValues, compareResults, currentValues;
+
+            maxValues = _mm256_set1_epi8(max);
+
+            for(number = 0; number < avx_iters; number++)
+                {
+                    currentValues  = _mm256_load_si256((__m256i*)inputPtr);
+                    compareResults = _mm256_max_epi8(maxValues, currentValues);
+                    maxValues = compareResults; //_mm256_blendv_epi8(currentValues, maxValues, compareResults);
+                    inputPtr += 32;
+                }
+
+            _mm256_store_si256((__m256i*)maxValuesBuffer, maxValues);
+
+            for(i = 0; i < 32; ++i)
+                {
+                    if(maxValuesBuffer[i] > max)
+                        {
+                            max = maxValuesBuffer[i];
+                        }
+                }
+
+            for(i = avx_iters * 32; i < num_points; ++i)
+                {
+                    if(src0[i] > max)
+                        {
+                            max = src0[i];
+                        }
+                }
+            target[0] = max;
+        }
+}
+
+#endif /*LV_HAVE_SSE4_1*/
+
+
 #ifdef LV_HAVE_SSE2
 #include <emmintrin.h>

--- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_x2_add_8i.h
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_x2_add_8i.h
@ -94,6 +94,42 @@ static inline void volk_gnsssdr_8i_x2_add_8i_u_sse2(char* cVector, const char* a
 #endif /* LV_HAVE_SSE2 */


+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void volk_gnsssdr_8i_x2_add_8i_u_avx2(char* cVector, const char* aVector, const char* bVector, unsigned int num_points)
+{
+    const unsigned int avx_iters = num_points / 32;
+    unsigned int number;
+    unsigned int i;
+    char* cPtr = cVector;
+    const char* aPtr = aVector;
+    const char* bPtr = bVector;
+
+    __m256i aVal, bVal, cVal;
+
+    for(number = 0; number < avx_iters; number++)
+        {
+            aVal = _mm256_loadu_si256((__m256i*)aPtr);
+            bVal = _mm256_loadu_si256((__m256i*)bPtr);
+
+            cVal = _mm256_add_epi8(aVal, bVal);
+
+            _mm256_storeu_si256((__m256i*)cPtr, cVal); // Store the results back into the C container
+
+            aPtr += 32;
+            bPtr += 32;
+            cPtr += 32;
+        }
+
+    for(i = avx_iters * 32; i < num_points; ++i)
+        {
+            *cPtr++ = (*aPtr++) + (*bPtr++);
+        }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
 #ifdef LV_HAVE_GENERIC

 static inline void volk_gnsssdr_8i_x2_add_8i_generic(char* cVector, const char* aVector, const char* bVector, unsigned int num_points)
@ -147,6 +183,42 @@ static inline void volk_gnsssdr_8i_x2_add_8i_a_sse2(char* cVector, const char* a
 #endif /* LV_HAVE_SSE2 */


+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void volk_gnsssdr_8i_x2_add_8i_a_avx2(char* cVector, const char* aVector, const char* bVector, unsigned int num_points)
+{
+    const unsigned int avx_iters = num_points / 32;
+    unsigned int number;
+    unsigned int i;
+    char* cPtr = cVector;
+    const char* aPtr = aVector;
+    const char* bPtr = bVector;
+
+    __m256i aVal, bVal, cVal;
+
+    for(number = 0; number < avx_iters; number++)
+        {
+            aVal = _mm256_load_si256((__m256i*)aPtr);
+            bVal = _mm256_load_si256((__m256i*)bPtr);
+
+            cVal = _mm256_add_epi8(aVal, bVal);
+
+            _mm256_store_si256((__m256i*)cPtr, cVal); // Store the results back into the C container
+
+            aPtr += 32;
+            bPtr += 32;
+            cPtr += 32;
+        }
+
+    for(i = avx_iters * 32; i < num_points; ++i)
+        {
+            *cPtr++ = (*aPtr++) + (*bPtr++);
+        }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
 #ifdef LV_HAVE_ORC

 extern void volk_gnsssdr_8i_x2_add_8i_a_orc_impl(char* cVector, const char* aVector, const char* bVector, unsigned int num_points);
--- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_conjugate_8ic.h
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_conjugate_8ic.h
@ -59,6 +59,37 @@

 #include <volk_gnsssdr/volk_gnsssdr_complex.h>

+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void volk_gnsssdr_8ic_conjugate_8ic_u_avx2(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points)
+{
+    const unsigned int avx2_iters = num_points / 16;
+    unsigned int i;
+    lv_8sc_t* c = cVector;
+    const lv_8sc_t* a = aVector;
+
+    __m256i tmp;
+    __m256i conjugator = _mm256_setr_epi8(1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1);
+
+    for (i = 0; i < avx2_iters; ++i)
+        {
+            tmp = _mm256_loadu_si256((__m256i*)a);
+            tmp = _mm256_sign_epi8(tmp, conjugator);
+            _mm256_storeu_si256((__m256i*)c, tmp);
+
+            a += 16;
+            c += 16;
+        }
+
+    for (i = avx2_iters * 16; i < num_points; ++i)
+        {
+            *c++ = lv_conj(*a++);
+        }
+}
+#endif /* LV_HAVE_AVX2 */
+
+
 #ifdef LV_HAVE_AVX
 #include <immintrin.h>

@ -217,6 +248,37 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_a_avx(lv_8sc_t* cVector, const
 #endif /* LV_HAVE_AVX */


+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void volk_gnsssdr_8ic_conjugate_8ic_a_avx2(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points)
+{
+    const unsigned int avx2_iters = num_points / 16;
+    unsigned int i;
+    lv_8sc_t* c = cVector;
+    const lv_8sc_t* a = aVector;
+
+    __m256i tmp;
+    __m256i conjugator = _mm256_setr_epi8(1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1);
+
+    for (i = 0; i < avx2_iters; ++i)
+        {
+            tmp = _mm256_load_si256((__m256i*)a);
+            tmp = _mm256_sign_epi8(tmp, conjugator);
+            _mm256_store_si256((__m256i*)c, tmp);
+
+            a += 16;
+            c += 16;
+        }
+
+    for (i = avx2_iters * 16; i < num_points; ++i)
+        {
+            *c++ = lv_conj(*a++);
+        }
+}
+#endif /* LV_HAVE_AVX2 */
+
+
 #ifdef LV_HAVE_SSSE3
 #include <tmmintrin.h>

--- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8u_x2_multiply_8u.h
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8u_x2_multiply_8u.h
@ -58,6 +58,56 @@
 #ifndef INCLUDED_volk_gnsssdr_8u_x2_multiply_8u_H
 #define INCLUDED_volk_gnsssdr_8u_x2_multiply_8u_H

+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void volk_gnsssdr_8u_x2_multiply_8u_u_avx2(unsigned char* cChar, const unsigned char* aChar, const unsigned char* bChar, unsigned int num_points)
+{
+    const unsigned int avx2_iters = num_points / 32;
+    unsigned int number;
+    unsigned int i;
+
+    __m256i x, y, x1, x2, y1, y2, mult1, x1_mult_y1, x2_mult_y2, tmp, tmp1, tmp2, totalc;
+    unsigned char* c = cChar;
+    const unsigned char* a = aChar;
+    const unsigned char* b = bChar;
+
+    for(number = 0; number < avx2_iters; number++)
+        {
+            x = _mm256_loadu_si256((__m256i*)a);
+            y = _mm256_loadu_si256((__m256i*)b);
+
+            mult1 = _mm256_set_epi8(0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF);
+            x1 = _mm256_srli_si256(x, 1);
+            x1 = _mm256_and_si256(x1, mult1);
+            x2 = _mm256_and_si256(x, mult1);
+
+            y1 = _mm256_srli_si256(y, 1);
+            y1 = _mm256_and_si256(y1, mult1);
+            y2 = _mm256_and_si256(y, mult1);
+
+            x1_mult_y1 = _mm256_mullo_epi16(x1, y1);
+            x2_mult_y2 = _mm256_mullo_epi16(x2, y2);
+
+            tmp = _mm256_and_si256(x1_mult_y1, mult1);
+            tmp1 = _mm256_slli_si256(tmp, 1);
+            tmp2 = _mm256_and_si256(x2_mult_y2, mult1);
+            totalc = _mm256_or_si256(tmp1, tmp2);
+
+            _mm256_storeu_si256((__m256i*)c, totalc);
+
+            a += 32;
+            b += 32;
+            c += 32;
+        }
+
+    for (i = avx2_iters * 32; i < num_points ; ++i)
+        {
+            *c++ = (*a++) * (*b++);
+        }
+}
+#endif /* LV_HAVE_SSE3 */
+

 #ifdef LV_HAVE_SSE3
 #include <pmmintrin.h>
@ -176,6 +226,57 @@ static inline void volk_gnsssdr_8u_x2_multiply_8u_a_sse3(unsigned char* cChar, c
 #endif /* LV_HAVE_SSE */


+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void volk_gnsssdr_8u_x2_multiply_8u_a_avx2(unsigned char* cChar, const unsigned char* aChar, const unsigned char* bChar, unsigned int num_points)
+{
+    const unsigned int avx2_iters = num_points / 32;
+    unsigned int number;
+    unsigned int i;
+
+    __m256i x, y, x1, x2, y1, y2, mult1, x1_mult_y1, x2_mult_y2, tmp, tmp1, tmp2, totalc;
+    unsigned char* c = cChar;
+    const unsigned char* a = aChar;
+    const unsigned char* b = bChar;
+
+    for(number = 0; number < avx2_iters; number++)
+        {
+            x = _mm256_load_si256((__m256i*)a);
+            y = _mm256_load_si256((__m256i*)b);
+
+            mult1 = _mm256_set_epi8(0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF);
+            x1 = _mm256_srli_si256(x, 1);
+            x1 = _mm256_and_si256(x1, mult1);
+            x2 = _mm256_and_si256(x, mult1);
+
+            y1 = _mm256_srli_si256(y, 1);
+            y1 = _mm256_and_si256(y1, mult1);
+            y2 = _mm256_and_si256(y, mult1);
+
+            x1_mult_y1 = _mm256_mullo_epi16(x1, y1);
+            x2_mult_y2 = _mm256_mullo_epi16(x2, y2);
+
+            tmp = _mm256_and_si256(x1_mult_y1, mult1);
+            tmp1 = _mm256_slli_si256(tmp, 1);
+            tmp2 = _mm256_and_si256(x2_mult_y2, mult1);
+            totalc = _mm256_or_si256(tmp1, tmp2);
+
+            _mm256_store_si256((__m256i*)c, totalc);
+
+            a += 32;
+            b += 32;
+            c += 32;
+        }
+
+    for (i = avx2_iters * 32; i < num_points ; ++i)
+        {
+            *c++ = (*a++) * (*b++);
+        }
+}
+#endif /* LV_HAVE_SSE3 */
+
+
 #ifdef LV_HAVE_ORC

 extern void volk_gnsssdr_8u_x2_multiply_8u_a_orc_impl(unsigned char* cVector, const unsigned char* aVector, const unsigned char* bVector, unsigned int num_points);