Merge branch 'next' of https://github.com/gnss-sdr/gnss-sdr into next

2025-08-08 23:13:54 +00:00 · 2017-08-21 12:06:03 +02:00 · 2017-08-21 12:06:03 +02:00 · 265caeda33
commit 265caeda33
parent 122ec19626 f4c221609c
10 changed files with 1001 additions and 5 deletions
--- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_16ic.h
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_16ic.h
@ -164,6 +164,58 @@ static inline void volk_gnsssdr_32fc_convert_16ic_u_sse(lv_16sc_t* outputVector,
 }
 #endif /* LV_HAVE_SSE */

+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void volk_gnsssdr_32fc_convert_16ic_u_avx2(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
+{
+    const unsigned int avx2_iters = num_points / 8;
+
+    float* inputVectorPtr = (float*)inputVector;
+    int16_t* outputVectorPtr = (int16_t*)outputVector;
+    float aux;
+    unsigned int i;
+    const float min_val = (float)SHRT_MIN; ///todo Something off here, compiler does not perform right cast
+    const float max_val = (float)SHRT_MAX;
+
+    __m256 inputVal1, inputVal2;
+    __m256i intInputVal1, intInputVal2;
+    __m256 ret1, ret2;
+    const __m256 vmin_val = _mm256_set1_ps(min_val);
+    const __m256 vmax_val = _mm256_set1_ps(max_val);
+
+    for(i = 0; i < avx2_iters; i++)
+        {
+            inputVal1 = _mm256_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 8;
+            inputVal2 = _mm256_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 8;
+            __VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 16);
+
+            // Clip
+            ret1 = _mm256_max_ps(_mm256_min_ps(inputVal1, vmax_val), vmin_val);
+            ret2 = _mm256_max_ps(_mm256_min_ps(inputVal2, vmax_val), vmin_val);
+
+            intInputVal1 = _mm256_cvtps_epi32(ret1);
+            intInputVal2 = _mm256_cvtps_epi32(ret2);
+
+            intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
+            intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
+
+            _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal1);
+            outputVectorPtr += 16;
+        }
+
+    for(i = avx2_iters * 16; i < num_points * 2; i++)
+        {
+            aux = *inputVectorPtr++;
+            if(aux > max_val)
+                aux = max_val;
+            else if(aux < min_val)
+                aux = min_val;
+            *outputVectorPtr++ = (int16_t)rintf(aux);
+        }
+}
+#endif /* LV_HAVE_AVX2 */
+

 #ifdef LV_HAVE_SSE2
 #include <emmintrin.h>
@ -269,6 +321,59 @@ static inline void volk_gnsssdr_32fc_convert_16ic_a_sse(lv_16sc_t* outputVector,
 #endif /* LV_HAVE_SSE */


+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void volk_gnsssdr_32fc_convert_16ic_a_avx2(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
+{
+    const unsigned int avx2_iters = num_points / 8;
+
+    float* inputVectorPtr = (float*)inputVector;
+    int16_t* outputVectorPtr = (int16_t*)outputVector;
+    float aux;
+    unsigned int i;
+    const float min_val = (float)SHRT_MIN; ///todo Something off here, compiler does not perform right cast
+    const float max_val = (float)SHRT_MAX;
+
+    __m256 inputVal1, inputVal2;
+    __m256i intInputVal1, intInputVal2;
+    __m256 ret1, ret2;
+    const __m256 vmin_val = _mm256_set1_ps(min_val);
+    const __m256 vmax_val = _mm256_set1_ps(max_val);
+
+    for(i = 0; i < avx2_iters; i++)
+        {
+            inputVal1 = _mm256_load_ps((float*)inputVectorPtr); inputVectorPtr += 8;
+            inputVal2 = _mm256_load_ps((float*)inputVectorPtr); inputVectorPtr += 8;
+            __VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 16);
+
+            // Clip
+            ret1 = _mm256_max_ps(_mm256_min_ps(inputVal1, vmax_val), vmin_val);
+            ret2 = _mm256_max_ps(_mm256_min_ps(inputVal2, vmax_val), vmin_val);
+
+            intInputVal1 = _mm256_cvtps_epi32(ret1);
+            intInputVal2 = _mm256_cvtps_epi32(ret2);
+
+            intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
+            intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
+
+            _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1);
+            outputVectorPtr += 16;
+        }
+
+    for(i = avx2_iters * 16; i < num_points * 2; i++)
+        {
+            aux = *inputVectorPtr++;
+            if(aux > max_val)
+                aux = max_val;
+            else if(aux < min_val)
+                aux = min_val;
+            *outputVectorPtr++ = (int16_t)rintf(aux);
+        }
+}
+#endif /* LV_HAVE_AVX2 */
+
+
 #ifdef LV_HAVE_NEON
 #include <arm_neon.h>

--- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_8ic.h
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_8ic.h
@ -85,6 +85,146 @@ static inline void volk_gnsssdr_32fc_convert_8ic_generic(lv_8sc_t* outputVector,
 #endif /* LV_HAVE_GENERIC */


+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void volk_gnsssdr_32fc_convert_8ic_u_avx2(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
+{
+    const unsigned int avx2_iters = num_points / 16;
+
+    float* inputVectorPtr = (float*)inputVector;
+    int8_t* outputVectorPtr = (int8_t*)outputVector;
+
+    const float min_val = (float)SCHAR_MIN;
+    const float max_val = (float)SCHAR_MAX;
+    float aux;
+    unsigned int i;
+
+    __m256 inputVal1, inputVal2, inputVal3, inputVal4;
+    __m256i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
+    __m256i int8InputVal;
+    __m256 ret1, ret2, ret3, ret4;
+    const __m256 vmin_val = _mm256_set1_ps(min_val);
+    const __m256 vmax_val = _mm256_set1_ps(max_val);
+
+    for(i = 0; i < avx2_iters; i++)
+        {
+            inputVal1 = _mm256_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 8;
+            inputVal2 = _mm256_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 8;
+            inputVal3 = _mm256_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 8;
+            inputVal4 = _mm256_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 8;
+            __VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 32);
+
+            inputVal1 = _mm256_mul_ps(inputVal1, vmax_val);
+            inputVal2 = _mm256_mul_ps(inputVal2, vmax_val);
+            inputVal3 = _mm256_mul_ps(inputVal3, vmax_val);
+            inputVal4 = _mm256_mul_ps(inputVal4, vmax_val);
+
+            // Clip
+            ret1 = _mm256_max_ps(_mm256_min_ps(inputVal1, vmax_val), vmin_val);
+            ret2 = _mm256_max_ps(_mm256_min_ps(inputVal2, vmax_val), vmin_val);
+            ret3 = _mm256_max_ps(_mm256_min_ps(inputVal3, vmax_val), vmin_val);
+            ret4 = _mm256_max_ps(_mm256_min_ps(inputVal4, vmax_val), vmin_val);
+
+            intInputVal1 = _mm256_cvtps_epi32(ret1);
+            intInputVal2 = _mm256_cvtps_epi32(ret2);
+            intInputVal3 = _mm256_cvtps_epi32(ret3);
+            intInputVal4 = _mm256_cvtps_epi32(ret4);
+
+            intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
+            intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
+            intInputVal2 = _mm256_packs_epi32(intInputVal3, intInputVal4);
+            intInputVal2 = _mm256_permute4x64_epi64(intInputVal2, 0b11011000);
+            int8InputVal = _mm256_packs_epi16(intInputVal1, intInputVal2);
+            int8InputVal = _mm256_permute4x64_epi64(int8InputVal, 0b11011000);
+
+            _mm256_storeu_si256((__m256i*)outputVectorPtr, int8InputVal);
+            outputVectorPtr += 32;
+        }
+
+    for(i = avx2_iters * 32; i < num_points * 2; i++)
+        {
+            aux = *inputVectorPtr++ * max_val;
+            if(aux > max_val)
+                aux = max_val;
+            else if(aux < min_val)
+                aux = min_val;
+            *outputVectorPtr++ = (int8_t)rintf(aux);
+        }
+}
+#endif /* LV_HAVE_AVX2 */
+
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void volk_gnsssdr_32fc_convert_8ic_a_avx2(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
+{
+    const unsigned int avx2_iters = num_points / 16;
+
+    float* inputVectorPtr = (float*)inputVector;
+    int8_t* outputVectorPtr = (int8_t*)outputVector;
+
+    const float min_val = (float)SCHAR_MIN;
+    const float max_val = (float)SCHAR_MAX;
+    float aux;
+    unsigned int i;
+
+    __m256 inputVal1, inputVal2, inputVal3, inputVal4;
+    __m256i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
+    __m256i int8InputVal;
+    __m256 ret1, ret2, ret3, ret4;
+    const __m256 vmin_val = _mm256_set1_ps(min_val);
+    const __m256 vmax_val = _mm256_set1_ps(max_val);
+
+    for(i = 0; i < avx2_iters; i++)
+        {
+            inputVal1 = _mm256_load_ps((float*)inputVectorPtr); inputVectorPtr += 8;
+            inputVal2 = _mm256_load_ps((float*)inputVectorPtr); inputVectorPtr += 8;
+            inputVal3 = _mm256_load_ps((float*)inputVectorPtr); inputVectorPtr += 8;
+            inputVal4 = _mm256_load_ps((float*)inputVectorPtr); inputVectorPtr += 8;
+            __VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 32);
+
+            inputVal1 = _mm256_mul_ps(inputVal1, vmax_val);
+            inputVal2 = _mm256_mul_ps(inputVal2, vmax_val);
+            inputVal3 = _mm256_mul_ps(inputVal3, vmax_val);
+            inputVal4 = _mm256_mul_ps(inputVal4, vmax_val);
+
+            // Clip
+            ret1 = _mm256_max_ps(_mm256_min_ps(inputVal1, vmax_val), vmin_val);
+            ret2 = _mm256_max_ps(_mm256_min_ps(inputVal2, vmax_val), vmin_val);
+            ret3 = _mm256_max_ps(_mm256_min_ps(inputVal3, vmax_val), vmin_val);
+            ret4 = _mm256_max_ps(_mm256_min_ps(inputVal4, vmax_val), vmin_val);
+
+            intInputVal1 = _mm256_cvtps_epi32(ret1);
+            intInputVal2 = _mm256_cvtps_epi32(ret2);
+            intInputVal3 = _mm256_cvtps_epi32(ret3);
+            intInputVal4 = _mm256_cvtps_epi32(ret4);
+
+            intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
+            intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
+            intInputVal2 = _mm256_packs_epi32(intInputVal3, intInputVal4);
+            intInputVal2 = _mm256_permute4x64_epi64(intInputVal2, 0b11011000);
+            int8InputVal = _mm256_packs_epi16(intInputVal1, intInputVal2);
+            int8InputVal = _mm256_permute4x64_epi64(int8InputVal, 0b11011000);
+
+            _mm256_store_si256((__m256i*)outputVectorPtr, int8InputVal);
+            outputVectorPtr += 32;
+        }
+
+    for(i = avx2_iters * 32; i < num_points * 2; i++)
+        {
+            aux = *inputVectorPtr++ * max_val;
+            if(aux > max_val)
+                aux = max_val;
+            else if(aux < min_val)
+                aux = min_val;
+            *outputVectorPtr++ = (int8_t)rintf(aux);
+        }
+}
+#endif /* LV_HAVE_AVX2 */
+
+
 #ifdef LV_HAVE_SSE2
 #include <emmintrin.h>

--- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_resamplerxnpuppet_32fc.h
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_resamplerxnpuppet_32fc.h
@ -71,7 +71,7 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_generic(lv_32fc_t* r
 }

 #endif /* LV_HAVE_GENERIC */
- 
+

 #ifdef LV_HAVE_SSE3
 static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_a_sse3(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points)
@ -248,6 +248,64 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_u_avx(lv_32fc_t* res
 #endif


+#ifdef LV_HAVE_AVX2
+static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_a_avx2(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points)
+{
+    float code_phase_step_chips = -0.6;
+    int code_length_chips = 1023;
+    int num_out_vectors = 3;
+    float rem_code_phase_chips = -0.234;
+    unsigned int n;
+    float shifts_chips[3] = { -0.1, 0.0, 0.1 };
+
+    lv_32fc_t** result_aux =  (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
+    for(n = 0; n < num_out_vectors; n++)
+    {
+       result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
+    }
+
+    volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx2(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
+
+    memcpy((lv_32fc_t*)result, (lv_32fc_t*)result_aux[0], sizeof(lv_32fc_t) * num_points);
+
+    for(n = 0; n < num_out_vectors; n++)
+    {
+        volk_gnsssdr_free(result_aux[n]);
+    }
+    volk_gnsssdr_free(result_aux);
+}
+#endif
+
+
+#ifdef LV_HAVE_AVX2
+static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_u_avx2(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points)
+{
+    float code_phase_step_chips = -0.6;
+    int code_length_chips = 1023;
+    int num_out_vectors = 3;
+    float rem_code_phase_chips = -0.234;
+    unsigned int n;
+    float shifts_chips[3] = { -0.1, 0.0, 0.1 };
+
+    lv_32fc_t** result_aux =  (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
+    for(n = 0; n < num_out_vectors; n++)
+    {
+       result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
+    }
+
+    volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx2(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
+
+    memcpy((lv_32fc_t*)result, (lv_32fc_t*)result_aux[0], sizeof(lv_32fc_t) * num_points);
+
+    for(n = 0; n < num_out_vectors; n++)
+    {
+        volk_gnsssdr_free(result_aux[n]);
+    }
+    volk_gnsssdr_free(result_aux);
+}
+#endif
+
+
 #ifdef LV_HAVE_NEON
 static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_neon(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points)
 {
--- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_xn_resampler_32fc_xn.h
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_xn_resampler_32fc_xn.h
@ -160,7 +160,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_sse3(lv_32fc_t** res
        }
 }

-#endif 
+#endif


 #ifdef LV_HAVE_SSE3
@ -295,7 +295,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_sse4_1(lv_32fc_t** r
        }
 }

-#endif 
+#endif


 #ifdef LV_HAVE_SSE4_1
@ -518,6 +518,162 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx(lv_32fc_t** resu
 #endif


+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx2(lv_32fc_t** result, const lv_32fc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, float* shifts_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_points)
+{
+    lv_32fc_t** _result = result;
+    const unsigned int avx_iters = num_points / 8;
+    int current_correlator_tap;
+    unsigned int n;
+    unsigned int k;
+    const __m256 eights = _mm256_set1_ps(8.0f);
+    const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips);
+    const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips);
+
+    __VOLK_ATTR_ALIGNED(32) int local_code_chip_index[8];
+    int local_code_chip_index_;
+
+    const __m256 zeros = _mm256_setzero_ps();
+    const __m256 code_length_chips_reg_f = _mm256_set1_ps((float)code_length_chips);
+    const __m256 n0 = _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f);
+
+    __m256i local_code_chip_index_reg, i;
+    __m256 aux, aux2, aux3, shifts_chips_reg, c, cTrunc, base, negatives, indexn;
+
+    for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
+        {
+            shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]);
+            aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
+            indexn = n0;
+            for(n = 0; n < avx_iters; n++)
+                {
+                    __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0);
+                    __VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3);
+                    aux = _mm256_fmadd_ps(code_phase_step_chips_reg, indexn, aux2);
+                    //aux = _mm256_add_ps(aux, aux2);
+                    // floor
+                    aux = _mm256_floor_ps(aux);
+
+                    // fmod
+                    c = _mm256_div_ps(aux, code_length_chips_reg_f);
+                    //_mm_fmsub_ps(c, code_length_chips_reg_f, aux)
+                    i = _mm256_cvttps_epi32(c);
+                    cTrunc = _mm256_cvtepi32_ps(i);
+                    base = _mm256_fnmadd_ps(cTrunc, code_length_chips_reg_f, aux);
+                    local_code_chip_index_reg = _mm256_cvttps_epi32(base);
+
+                    // no negatives
+                    c = _mm256_cvtepi32_ps(local_code_chip_index_reg);
+                    negatives = _mm256_cmp_ps(c, zeros, 0x01 );
+                    aux3 = _mm256_and_ps(code_length_chips_reg_f, negatives);
+                    aux = _mm256_add_ps(c, aux3);
+                    local_code_chip_index_reg = _mm256_cvttps_epi32(aux);
+
+                    _mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg);
+                    for(k = 0; k < 8; ++k)
+                        {
+                            _result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]];
+                        }
+                    indexn = _mm256_add_ps(indexn, eights);
+                }
+        }
+    _mm256_zeroupper();
+    for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
+        {
+            for(n = avx_iters * 8; n < num_points; n++)
+                {
+                    // resample code for current tap
+                    local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
+                    //Take into account that in multitap correlators, the shifts can be negative!
+                    if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1) ;
+                    local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
+                    _result[current_correlator_tap][n] = local_code[local_code_chip_index_];
+                }
+        }
+}
+
+#endif
+
+
+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx2(lv_32fc_t** result, const lv_32fc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, float* shifts_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_points)
+{
+    lv_32fc_t** _result = result;
+    const unsigned int avx_iters = num_points / 8;
+    int current_correlator_tap;
+    unsigned int n;
+    unsigned int k;
+    const __m256 eights = _mm256_set1_ps(8.0f);
+    const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips);
+    const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips);
+
+    __VOLK_ATTR_ALIGNED(32) int local_code_chip_index[8];
+    int local_code_chip_index_;
+
+    const __m256 zeros = _mm256_setzero_ps();
+    const __m256 code_length_chips_reg_f = _mm256_set1_ps((float)code_length_chips);
+    const __m256 n0 = _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f);
+
+    __m256i local_code_chip_index_reg, i;
+    __m256 aux, aux2, aux3, shifts_chips_reg, c, cTrunc, base, negatives, indexn;
+
+    for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
+        {
+            shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]);
+            aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
+            indexn = n0;
+            for(n = 0; n < avx_iters; n++)
+                {
+                    __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0);
+                    __VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3);
+                    aux = _mm256_fmadd_ps(code_phase_step_chips_reg, indexn, aux2);
+                    //aux = _mm256_mul_ps(code_phase_step_chips_reg, indexn);
+                    //aux = _mm256_add_ps(aux, aux2);
+                    // floor
+                    aux = _mm256_floor_ps(aux);
+
+                    // fmod
+                    c = _mm256_div_ps(aux, code_length_chips_reg_f);
+                    i = _mm256_cvttps_epi32(c);
+                    cTrunc = _mm256_cvtepi32_ps(i);
+                    base = _mm256_fnmadd_ps(cTrunc, code_length_chips_reg_f, aux);
+                    local_code_chip_index_reg = _mm256_cvttps_epi32(base);
+
+                    // no negatives
+                    c = _mm256_cvtepi32_ps(local_code_chip_index_reg);
+                    negatives = _mm256_cmp_ps(c, zeros, 0x01 );
+                    aux3 = _mm256_and_ps(code_length_chips_reg_f, negatives);
+                    aux = _mm256_add_ps(c, aux3);
+                    local_code_chip_index_reg = _mm256_cvttps_epi32(aux);
+
+                    _mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg);
+                    for(k = 0; k < 8; ++k)
+                        {
+                            _result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]];
+                        }
+                    indexn = _mm256_add_ps(indexn, eights);
+                }
+        }
+    _mm256_zeroupper();
+    for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
+        {
+            for(n = avx_iters * 8; n < num_points; n++)
+                {
+                    // resample code for current tap
+                    local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
+                    //Take into account that in multitap correlators, the shifts can be negative!
+                    if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1) ;
+                    local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
+                    _result[current_correlator_tap][n] = local_code[local_code_chip_index_];
+                }
+        }
+}
+
+#endif
+
+
 #ifdef LV_HAVE_NEON
 #include <arm_neon.h>

@ -604,4 +760,3 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_neon(lv_32fc_t** resul


 #endif /*INCLUDED_volk_gnsssdr_32fc_xn_resampler_32fc_xn_H*/
-
--- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_accumulator_s8i.h
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_accumulator_s8i.h
@ -152,6 +152,83 @@ static inline void volk_gnsssdr_8i_accumulator_s8i_a_sse3(char* result, const ch
 #endif /* LV_HAVE_SSE3 */


+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void volk_gnsssdr_8i_accumulator_s8i_a_avx2(char* result, const char* inputBuffer, unsigned int num_points)
+{
+    char returnValue = 0;
+    const unsigned int sse_iters = num_points / 32;
+    unsigned int number;
+    unsigned int i;
+
+    const char* aPtr = inputBuffer;
+
+    __VOLK_ATTR_ALIGNED(32) char tempBuffer[32];
+    __m256i accumulator = _mm256_setzero_si256();
+    __m256i aVal = _mm256_setzero_si256();
+
+    for(number = 0; number < sse_iters; number++)
+        {
+            aVal = _mm256_load_si256((__m256i*)aPtr);
+            accumulator = _mm256_add_epi8(accumulator, aVal);
+            aPtr += 32;
+        }
+    _mm256_store_si256((__m256i*)tempBuffer,accumulator);
+
+    for(i = 0; i < 32; ++i)
+        {
+            returnValue += tempBuffer[i];
+        }
+
+    for(i = 0; i < (num_points % 32); ++i)
+        {
+            returnValue += (*aPtr++);
+        }
+
+    *result = returnValue;
+}
+#endif /* LV_HAVE_SSE3 */
+
+
+#ifdef LV_HAVE_AVX2
+#include <pmmintrin.h>
+
+static inline void volk_gnsssdr_8i_accumulator_s8i_u_avx2(char* result, const char* inputBuffer, unsigned int num_points)
+{
+    char returnValue = 0;
+    const unsigned int sse_iters = num_points / 32;
+    unsigned int number;
+    unsigned int i;
+    const char* aPtr = inputBuffer;
+
+    __VOLK_ATTR_ALIGNED(32) char tempBuffer[32];
+    __m256i accumulator = _mm256_setzero_si256();
+    __m256i aVal = _mm256_setzero_si256();
+
+    for(number = 0; number < sse_iters; number++)
+        {
+            aVal = _mm256_lddqu_si256((__m256i*)aPtr);
+            accumulator = _mm256_add_epi8(accumulator, aVal);
+            aPtr += 32;
+        }
+    _mm256_storeu_si256((__m256i*)tempBuffer, accumulator);
+
+    for(i = 0; i < 32; ++i)
+        {
+            returnValue += tempBuffer[i];
+        }
+
+    for(i = 0; i < (num_points % 32); ++i)
+        {
+            returnValue += (*aPtr++);
+        }
+
+    *result = returnValue;
+}
+#endif /* LV_HAVE_SSE3 */
+
+
 #ifdef LV_HAVE_ORC

 extern void volk_gnsssdr_8i_accumulator_s8i_a_orc_impl(short* result, const char* inputBuffer, unsigned int num_points);
@ -169,4 +246,3 @@ static inline void volk_gnsssdr_8i_accumulator_s8i_u_orc(char* result, const cha
 #endif /* LV_HAVE_ORC */

 #endif /* INCLUDED_volk_gnsssdr_8i_accumulator_s8i_H */
-
--- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_index_max_16u.h
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_index_max_16u.h
@ -58,6 +58,71 @@

 #include <volk_gnsssdr/volk_gnsssdr_common.h>

+
+#ifdef LV_HAVE_AVX2
+#include<immintrin.h>
+
+static inline void volk_gnsssdr_8i_index_max_16u_u_avx2(unsigned int* target, const char* src0, unsigned int num_points)
+{
+    if(num_points > 0)
+        {
+            const unsigned int avx2_iters = num_points / 32;
+            unsigned int number;
+            unsigned int i;
+            char* basePtr = (char*)src0;
+            char* inputPtr = (char*)src0;
+            char max = src0[0];
+            unsigned int index = 0;
+            unsigned int mask;
+            __VOLK_ATTR_ALIGNED(32) char currentValuesBuffer[32];
+            __m256i maxValues, compareResults, currentValues;
+
+            maxValues = _mm256_set1_epi8(max);
+
+            for(number = 0; number < avx2_iters; number++)
+                {
+                    currentValues  = _mm256_loadu_si256((__m256i*)inputPtr);
+                    compareResults = _mm256_cmpgt_epi8(maxValues, currentValues);
+                    mask = _mm256_movemask_epi8(compareResults);
+
+                    if (mask != 0xFFFFFFFF)
+                        {
+                            _mm256_storeu_si256((__m256i*)&currentValuesBuffer, currentValues);
+                            mask = ~mask;
+                            i = 0;
+                            while (mask > 0)
+                                {
+                                    if ((mask & 1) == 1)
+                                        {
+                                            if(currentValuesBuffer[i] > max)
+                                                {
+                                                    index = inputPtr - basePtr + i;
+                                                    max = currentValuesBuffer[i];
+                                                }
+                                        }
+                                    i++;
+                                    mask >>= 1;
+                                }
+                            maxValues = _mm256_set1_epi8(max);
+                        }
+                    inputPtr += 32;
+                }
+
+            for(i = 0; i<(num_points % 32); ++i)
+                {
+                    if(src0[i] > max)
+                        {
+                            index = i;
+                            max = src0[i];
+                        }
+                }
+            target[0] = index;
+        }
+}
+
+#endif /*LV_HAVE_AVX2*/
+
+
 #ifdef LV_HAVE_AVX
 #include <immintrin.h>

@ -271,6 +336,70 @@ static inline void volk_gnsssdr_8i_index_max_16u_generic(unsigned int* target, c
 #endif /*LV_HAVE_GENERIC*/


+#ifdef LV_HAVE_AVX2
+#include<immintrin.h>
+
+static inline void volk_gnsssdr_8i_index_max_16u_a_avx2(unsigned int* target, const char* src0, unsigned int num_points)
+{
+    if(num_points > 0)
+        {
+            const unsigned int avx2_iters = num_points / 32;
+            unsigned int number;
+            unsigned int i;
+            char* basePtr = (char*)src0;
+            char* inputPtr = (char*)src0;
+            char max = src0[0];
+            unsigned int index = 0;
+            unsigned int mask;
+            __VOLK_ATTR_ALIGNED(32) char currentValuesBuffer[32];
+            __m256i maxValues, compareResults, currentValues;
+
+            maxValues = _mm256_set1_epi8(max);
+
+            for(number = 0; number < avx2_iters; number++)
+                {
+                    currentValues  = _mm256_load_si256((__m256i*)inputPtr);
+                    compareResults = _mm256_cmpgt_epi8(maxValues, currentValues);
+                    mask = _mm256_movemask_epi8(compareResults);
+
+                    if (mask != 0xFFFFFFFF)
+                        {
+                            _mm256_store_si256((__m256i*)&currentValuesBuffer, currentValues);
+                            mask = ~mask;
+                            i = 0;
+                            while (mask > 0)
+                                {
+                                    if ((mask & 1) == 1)
+                                        {
+                                            if(currentValuesBuffer[i] > max)
+                                                {
+                                                    index = inputPtr - basePtr + i;
+                                                    max = currentValuesBuffer[i];
+                                                }
+                                        }
+                                    i++;
+                                    mask >>= 1;
+                                }
+                            maxValues = _mm256_set1_epi8(max);
+                        }
+                    inputPtr += 32;
+                }
+
+            for(i = 0; i<(num_points % 32); ++i)
+                {
+                    if(src0[i] > max)
+                        {
+                            index = i;
+                            max = src0[i];
+                        }
+                }
+            target[0] = index;
+        }
+}
+
+#endif /*LV_HAVE_AVX2*/
+
+
 #ifdef LV_HAVE_AVX
 #include <immintrin.h>

--- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_max_s8i.h
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_max_s8i.h
@ -58,6 +58,55 @@

 #include <volk_gnsssdr/volk_gnsssdr_common.h>

+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void volk_gnsssdr_8i_max_s8i_u_avx2(char* target, const char* src0, unsigned int num_points)
+{
+    if(num_points > 0)
+        {
+            const unsigned int avx_iters = num_points / 32;
+            unsigned int number;
+            unsigned int i;
+            char* inputPtr = (char*)src0;
+            char max = src0[0];
+            __VOLK_ATTR_ALIGNED(32) char maxValuesBuffer[32];
+            __m256i maxValues, compareResults, currentValues;
+
+            maxValues = _mm256_set1_epi8(max);
+
+            for(number = 0; number < avx_iters; number++)
+                {
+                    currentValues  = _mm256_loadu_si256((__m256i*)inputPtr);
+                    compareResults = _mm256_max_epi8(maxValues, currentValues);
+                    maxValues = compareResults;
+                    inputPtr += 32;
+                }
+
+            _mm256_storeu_si256((__m256i*)maxValuesBuffer, maxValues);
+
+            for(i = 0; i < 32; ++i)
+                {
+                    if(maxValuesBuffer[i] > max)
+                        {
+                            max = maxValuesBuffer[i];
+                        }
+                }
+
+            for(i = avx_iters * 32; i < num_points; ++i)
+                {
+                    if(src0[i] > max)
+                        {
+                            max = src0[i];
+                        }
+                }
+            target[0] = max;
+        }
+}
+
+#endif /*LV_HAVE_SSE4_1*/
+
+
 #ifdef LV_HAVE_SSE4_1
 #include <smmintrin.h>

@ -238,6 +287,55 @@ static inline void volk_gnsssdr_8i_max_s8i_a_sse4_1(char* target, const char* sr
 #endif /*LV_HAVE_SSE4_1*/


+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void volk_gnsssdr_8i_max_s8i_a_avx2(char* target, const char* src0, unsigned int num_points)
+{
+    if(num_points > 0)
+        {
+            const unsigned int avx_iters = num_points / 32;
+            unsigned int number;
+            unsigned int i;
+            char* inputPtr = (char*)src0;
+            char max = src0[0];
+            __VOLK_ATTR_ALIGNED(32) char maxValuesBuffer[32];
+            __m256i maxValues, compareResults, currentValues;
+
+            maxValues = _mm256_set1_epi8(max);
+
+            for(number = 0; number < avx_iters; number++)
+                {
+                    currentValues  = _mm256_load_si256((__m256i*)inputPtr);
+                    compareResults = _mm256_max_epi8(maxValues, currentValues);
+                    maxValues = compareResults; //_mm256_blendv_epi8(currentValues, maxValues, compareResults);
+                    inputPtr += 32;
+                }
+
+            _mm256_store_si256((__m256i*)maxValuesBuffer, maxValues);
+
+            for(i = 0; i < 32; ++i)
+                {
+                    if(maxValuesBuffer[i] > max)
+                        {
+                            max = maxValuesBuffer[i];
+                        }
+                }
+
+            for(i = avx_iters * 32; i < num_points; ++i)
+                {
+                    if(src0[i] > max)
+                        {
+                            max = src0[i];
+                        }
+                }
+            target[0] = max;
+        }
+}
+
+#endif /*LV_HAVE_SSE4_1*/
+
+
 #ifdef LV_HAVE_SSE2
 #include <emmintrin.h>

--- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_x2_add_8i.h
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_x2_add_8i.h
@ -94,6 +94,42 @@ static inline void volk_gnsssdr_8i_x2_add_8i_u_sse2(char* cVector, const char* a
 #endif /* LV_HAVE_SSE2 */


+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void volk_gnsssdr_8i_x2_add_8i_u_avx2(char* cVector, const char* aVector, const char* bVector, unsigned int num_points)
+{
+    const unsigned int avx_iters = num_points / 32;
+    unsigned int number;
+    unsigned int i;
+    char* cPtr = cVector;
+    const char* aPtr = aVector;
+    const char* bPtr = bVector;
+
+    __m256i aVal, bVal, cVal;
+
+    for(number = 0; number < avx_iters; number++)
+        {
+            aVal = _mm256_loadu_si256((__m256i*)aPtr);
+            bVal = _mm256_loadu_si256((__m256i*)bPtr);
+
+            cVal = _mm256_add_epi8(aVal, bVal);
+
+            _mm256_storeu_si256((__m256i*)cPtr, cVal); // Store the results back into the C container
+
+            aPtr += 32;
+            bPtr += 32;
+            cPtr += 32;
+        }
+
+    for(i = avx_iters * 32; i < num_points; ++i)
+        {
+            *cPtr++ = (*aPtr++) + (*bPtr++);
+        }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
 #ifdef LV_HAVE_GENERIC

 static inline void volk_gnsssdr_8i_x2_add_8i_generic(char* cVector, const char* aVector, const char* bVector, unsigned int num_points)
@ -147,6 +183,42 @@ static inline void volk_gnsssdr_8i_x2_add_8i_a_sse2(char* cVector, const char* a
 #endif /* LV_HAVE_SSE2 */


+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void volk_gnsssdr_8i_x2_add_8i_a_avx2(char* cVector, const char* aVector, const char* bVector, unsigned int num_points)
+{
+    const unsigned int avx_iters = num_points / 32;
+    unsigned int number;
+    unsigned int i;
+    char* cPtr = cVector;
+    const char* aPtr = aVector;
+    const char* bPtr = bVector;
+
+    __m256i aVal, bVal, cVal;
+
+    for(number = 0; number < avx_iters; number++)
+        {
+            aVal = _mm256_load_si256((__m256i*)aPtr);
+            bVal = _mm256_load_si256((__m256i*)bPtr);
+
+            cVal = _mm256_add_epi8(aVal, bVal);
+
+            _mm256_store_si256((__m256i*)cPtr, cVal); // Store the results back into the C container
+
+            aPtr += 32;
+            bPtr += 32;
+            cPtr += 32;
+        }
+
+    for(i = avx_iters * 32; i < num_points; ++i)
+        {
+            *cPtr++ = (*aPtr++) + (*bPtr++);
+        }
+}
+#endif /* LV_HAVE_SSE2 */
+
+
 #ifdef LV_HAVE_ORC

 extern void volk_gnsssdr_8i_x2_add_8i_a_orc_impl(char* cVector, const char* aVector, const char* bVector, unsigned int num_points);
--- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_conjugate_8ic.h
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_conjugate_8ic.h
@ -59,6 +59,37 @@

 #include <volk_gnsssdr/volk_gnsssdr_complex.h>

+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void volk_gnsssdr_8ic_conjugate_8ic_u_avx2(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points)
+{
+    const unsigned int avx2_iters = num_points / 16;
+    unsigned int i;
+    lv_8sc_t* c = cVector;
+    const lv_8sc_t* a = aVector;
+
+    __m256i tmp;
+    __m256i conjugator = _mm256_setr_epi8(1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1);
+
+    for (i = 0; i < avx2_iters; ++i)
+        {
+            tmp = _mm256_loadu_si256((__m256i*)a);
+            tmp = _mm256_sign_epi8(tmp, conjugator);
+            _mm256_storeu_si256((__m256i*)c, tmp);
+
+            a += 16;
+            c += 16;
+        }
+
+    for (i = avx2_iters * 16; i < num_points; ++i)
+        {
+            *c++ = lv_conj(*a++);
+        }
+}
+#endif /* LV_HAVE_AVX2 */
+
+
 #ifdef LV_HAVE_AVX
 #include <immintrin.h>

@ -217,6 +248,37 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_a_avx(lv_8sc_t* cVector, const
 #endif /* LV_HAVE_AVX */


+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void volk_gnsssdr_8ic_conjugate_8ic_a_avx2(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points)
+{
+    const unsigned int avx2_iters = num_points / 16;
+    unsigned int i;
+    lv_8sc_t* c = cVector;
+    const lv_8sc_t* a = aVector;
+
+    __m256i tmp;
+    __m256i conjugator = _mm256_setr_epi8(1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1);
+
+    for (i = 0; i < avx2_iters; ++i)
+        {
+            tmp = _mm256_load_si256((__m256i*)a);
+            tmp = _mm256_sign_epi8(tmp, conjugator);
+            _mm256_store_si256((__m256i*)c, tmp);
+
+            a += 16;
+            c += 16;
+        }
+
+    for (i = avx2_iters * 16; i < num_points; ++i)
+        {
+            *c++ = lv_conj(*a++);
+        }
+}
+#endif /* LV_HAVE_AVX2 */
+
+
 #ifdef LV_HAVE_SSSE3
 #include <tmmintrin.h>

--- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8u_x2_multiply_8u.h
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8u_x2_multiply_8u.h
@ -58,6 +58,56 @@
 #ifndef INCLUDED_volk_gnsssdr_8u_x2_multiply_8u_H
 #define INCLUDED_volk_gnsssdr_8u_x2_multiply_8u_H

+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void volk_gnsssdr_8u_x2_multiply_8u_u_avx2(unsigned char* cChar, const unsigned char* aChar, const unsigned char* bChar, unsigned int num_points)
+{
+    const unsigned int avx2_iters = num_points / 32;
+    unsigned int number;
+    unsigned int i;
+
+    __m256i x, y, x1, x2, y1, y2, mult1, x1_mult_y1, x2_mult_y2, tmp, tmp1, tmp2, totalc;
+    unsigned char* c = cChar;
+    const unsigned char* a = aChar;
+    const unsigned char* b = bChar;
+
+    for(number = 0; number < avx2_iters; number++)
+        {
+            x = _mm256_loadu_si256((__m256i*)a);
+            y = _mm256_loadu_si256((__m256i*)b);
+
+            mult1 = _mm256_set_epi8(0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF);
+            x1 = _mm256_srli_si256(x, 1);
+            x1 = _mm256_and_si256(x1, mult1);
+            x2 = _mm256_and_si256(x, mult1);
+
+            y1 = _mm256_srli_si256(y, 1);
+            y1 = _mm256_and_si256(y1, mult1);
+            y2 = _mm256_and_si256(y, mult1);
+
+            x1_mult_y1 = _mm256_mullo_epi16(x1, y1);
+            x2_mult_y2 = _mm256_mullo_epi16(x2, y2);
+
+            tmp = _mm256_and_si256(x1_mult_y1, mult1);
+            tmp1 = _mm256_slli_si256(tmp, 1);
+            tmp2 = _mm256_and_si256(x2_mult_y2, mult1);
+            totalc = _mm256_or_si256(tmp1, tmp2);
+
+            _mm256_storeu_si256((__m256i*)c, totalc);
+
+            a += 32;
+            b += 32;
+            c += 32;
+        }
+
+    for (i = avx2_iters * 32; i < num_points ; ++i)
+        {
+            *c++ = (*a++) * (*b++);
+        }
+}
+#endif /* LV_HAVE_SSE3 */
+

 #ifdef LV_HAVE_SSE3
 #include <pmmintrin.h>
@ -176,6 +226,57 @@ static inline void volk_gnsssdr_8u_x2_multiply_8u_a_sse3(unsigned char* cChar, c
 #endif /* LV_HAVE_SSE */


+#ifdef LV_HAVE_AVX2
+#include <immintrin.h>
+
+static inline void volk_gnsssdr_8u_x2_multiply_8u_a_avx2(unsigned char* cChar, const unsigned char* aChar, const unsigned char* bChar, unsigned int num_points)
+{
+    const unsigned int avx2_iters = num_points / 32;
+    unsigned int number;
+    unsigned int i;
+
+    __m256i x, y, x1, x2, y1, y2, mult1, x1_mult_y1, x2_mult_y2, tmp, tmp1, tmp2, totalc;
+    unsigned char* c = cChar;
+    const unsigned char* a = aChar;
+    const unsigned char* b = bChar;
+
+    for(number = 0; number < avx2_iters; number++)
+        {
+            x = _mm256_load_si256((__m256i*)a);
+            y = _mm256_load_si256((__m256i*)b);
+
+            mult1 = _mm256_set_epi8(0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF);
+            x1 = _mm256_srli_si256(x, 1);
+            x1 = _mm256_and_si256(x1, mult1);
+            x2 = _mm256_and_si256(x, mult1);
+
+            y1 = _mm256_srli_si256(y, 1);
+            y1 = _mm256_and_si256(y1, mult1);
+            y2 = _mm256_and_si256(y, mult1);
+
+            x1_mult_y1 = _mm256_mullo_epi16(x1, y1);
+            x2_mult_y2 = _mm256_mullo_epi16(x2, y2);
+
+            tmp = _mm256_and_si256(x1_mult_y1, mult1);
+            tmp1 = _mm256_slli_si256(tmp, 1);
+            tmp2 = _mm256_and_si256(x2_mult_y2, mult1);
+            totalc = _mm256_or_si256(tmp1, tmp2);
+
+            _mm256_store_si256((__m256i*)c, totalc);
+
+            a += 32;
+            b += 32;
+            c += 32;
+        }
+
+    for (i = avx2_iters * 32; i < num_points ; ++i)
+        {
+            *c++ = (*a++) * (*b++);
+        }
+}
+#endif /* LV_HAVE_SSE3 */
+
+
 #ifdef LV_HAVE_ORC

 extern void volk_gnsssdr_8u_x2_multiply_8u_a_orc_impl(unsigned char* cVector, const unsigned char* aVector, const unsigned char* bVector, unsigned int num_points);