From 676c1506da92740a47a8dfc215e7cee892ede367 Mon Sep 17 00:00:00 2001 From: Cillian O'Driscoll Date: Mon, 11 Sep 2017 15:15:27 +0100 Subject: [PATCH 1/4] Updated volk_gnsssdr_module for real codes Added 16i and 32f resamplers and 32fc_32f and 16ic_16i rotator dot product to enable use of real (rather than complex) local code replicas --- .../volk_gnsssdr_avx_intrinsics.h | 8 + .../volk_gnsssdr_16i_resamplerxnpuppet_16i.h | 282 +++ .../volk_gnsssdr_16i_xn_resampler_16i_xn.h | 608 +++++++ ...nsssdr_16ic_16i_rotator_dot_prod_16ic_xn.h | 1600 +++++++++++++++++ ...dr_16ic_16i_rotator_dotprodxnpuppet_16ic.h | 384 ++++ ...volk_gnsssdr_16ic_resamplerxnpuppet_16ic.h | 16 +- .../volk_gnsssdr_32f_resamplerxnpuppet_32f.h | 279 +++ .../volk_gnsssdr_32f_xn_resampler_32f_xn.h | 610 +++++++ ...nsssdr_32fc_32f_rotator_dot_prod_32fc_xn.h | 320 ++++ ...dr_32fc_32f_rotator_dotprodxnpuppet_32fc.h | 132 ++ ...volk_gnsssdr_32fc_resamplerxnpuppet_32fc.h | 41 +- .../volk_gnsssdr/lib/kernel_tests.h | 4 + .../volk_gnsssdr/lib/qa_utils.cc | 2 +- 13 files changed, 4257 insertions(+), 29 deletions(-) create mode 100644 src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16i_resamplerxnpuppet_16i.h create mode 100644 src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16i_xn_resampler_16i_xn.h create mode 100644 src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn.h create mode 100644 src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic.h create mode 100644 src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_resamplerxnpuppet_32f.h create mode 100644 src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_xn_resampler_32f_xn.h create mode 100644 src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn.h create mode 100644 src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc.h diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_avx_intrinsics.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_avx_intrinsics.h index b737ea164..809aa98f9 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_avx_intrinsics.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_avx_intrinsics.h @@ -61,6 +61,14 @@ _mm256_magnitudesquared_ps(__m256 cplxValue1, __m256 cplxValue2){ return _mm256_hadd_ps(complex1, complex2); // Add the I2 and Q2 values } +static inline __m256 _mm256_complexnormalise_ps( __m256 z ){ + __m256 tmp1 = _mm256_mul_ps(z, z); + __m256 tmp2 = _mm256_hadd_ps(tmp1, tmp1); + tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8); + tmp2 = _mm256_sqrt_ps(tmp1); + return _mm256_div_ps(z, tmp2); +} + static inline __m256 _mm256_magnitude_ps(__m256 cplxValue1, __m256 cplxValue2){ return _mm256_sqrt_ps(_mm256_magnitudesquared_ps(cplxValue1, cplxValue2)); diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16i_resamplerxnpuppet_16i.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16i_resamplerxnpuppet_16i.h new file mode 100644 index 000000000..3c1c0f817 --- /dev/null 
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16i_resamplerxnpuppet_16i.h @@ -0,0 +1,282 @@ +/*! + * \file volk_gnsssdr_16i_resamplerxnpuppet_16i.h + * \brief VOLK_GNSSSDR puppet for the multiple 16-bit vector resampler kernel. + * \authors + * + * VOLK_GNSSSDR puppet for integrating the multiple resampler into the test system + * + * ------------------------------------------------------------------------- + * + * Copyright (C) 2010-2017 (see AUTHORS file for a list of contributors) + * + * GNSS-SDR is a software defined Global Navigation + * Satellite Systems receiver + * + * This file is part of GNSS-SDR. + * + * GNSS-SDR is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * GNSS-SDR is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNSS-SDR. If not, see . + * + * ------------------------------------------------------------------------- + */ + +#ifndef INCLUDED_volk_gnsssdr_16i_resamplerxnpuppet_16i_H +#define INCLUDED_volk_gnsssdr_16i_resamplerxnpuppet_16i_H + +#include "volk_gnsssdr/volk_gnsssdr_16i_xn_resampler_16i_xn.h" +#include +#include +#include +#include + +#ifdef LV_HAVE_GENERIC +static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_generic(int16_t* result, const int16_t* local_code, unsigned int num_points) +{ + int code_length_chips = 2046; + float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + int num_out_vectors = 3; + unsigned int n; + float rem_code_phase_chips = -0.234; + float shifts_chips[3] = { -0.1, 0.0, 0.1 }; + int16_t** result_aux = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); + + for(n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); + } + + volk_gnsssdr_16i_xn_resampler_16i_xn_generic(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); + + memcpy((int16_t*)result, (int16_t*)result_aux[0], sizeof(int16_t) * num_points); + + for(n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } + volk_gnsssdr_free(result_aux); +} + +#endif /* LV_HAVE_GENERIC */ + +#ifdef LV_HAVE_SSE3 +static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_a_sse3(int16_t* result, const int16_t* local_code, unsigned int num_points) +{ + int code_length_chips = 2046; + float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + int num_out_vectors = 3; + float rem_code_phase_chips = -0.234; + unsigned int n; + float shifts_chips[3] = { -0.1, 0.0, 0.1 }; + int16_t** result_aux = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); + + for(n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); + } + + volk_gnsssdr_16i_xn_resampler_16i_xn_a_sse3(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, 
code_length_chips, num_out_vectors, num_points); + + memcpy((int16_t*)result, (int16_t*)result_aux[0], sizeof(int16_t) * num_points); + + for(n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } + volk_gnsssdr_free(result_aux); +} + +#endif + +#ifdef LV_HAVE_SSE3 +static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_u_sse3(int16_t* result, const int16_t* local_code, unsigned int num_points) +{ + int code_length_chips = 2046; + float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + int num_out_vectors = 3; + float rem_code_phase_chips = -0.234; + unsigned int n; + float shifts_chips[3] = { -0.1, 0.0, 0.1 }; + int16_t** result_aux = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); + + for(n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); + } + + volk_gnsssdr_16i_xn_resampler_16i_xn_u_sse3(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); + + memcpy((int16_t*)result, (int16_t*)result_aux[0], sizeof(int16_t) * num_points); + + for(n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } + volk_gnsssdr_free(result_aux); +} + +#endif + + +#ifdef LV_HAVE_SSE4_1 +static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_u_sse4_1(int16_t* result, const int16_t* local_code, unsigned int num_points) +{ + int code_length_chips = 2046; + float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + int num_out_vectors = 3; + float rem_code_phase_chips = -0.234; + unsigned int n; + float shifts_chips[3] = { -0.1, 0.0, 0.1 }; + int16_t** result_aux = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); + + for(n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); + } + + volk_gnsssdr_16i_xn_resampler_16i_xn_u_sse4_1(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); + + memcpy((int16_t*)result, (int16_t*)result_aux[0], sizeof(int16_t) * num_points); + + for(n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } + volk_gnsssdr_free(result_aux); +} + +#endif + + +#ifdef LV_HAVE_SSE4_1 +static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_a_sse4_1(int16_t* result, const int16_t* local_code, unsigned int num_points) +{ + int code_length_chips = 2046; + float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + int num_out_vectors = 3; + float rem_code_phase_chips = -0.234; + unsigned int n; + float shifts_chips[3] = { -0.1, 0.0, 0.1 }; + int16_t** result_aux = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); + + for(n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); + } + + volk_gnsssdr_16i_xn_resampler_16i_xn_a_sse4_1(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); + + memcpy((int16_t*)result, (int16_t*)result_aux[0], sizeof(int16_t) * num_points); + + for(n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } + volk_gnsssdr_free(result_aux); +} + 
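For reference, the multi-output kernel wrapped by these puppets can be driven directly as in the following minimal sketch (illustrative only: names such as ca_code and num_samples are placeholders, and the 0.25-chip Early/Prompt/Late spacing is just an example value):

    /* three zero-hold resampled replicas of one real-valued local code */
    float shifts_chips[3] = { -0.25f, 0.0f, 0.25f };   /* Early, Prompt, Late offsets [chips] */
    int16_t* replicas[3];
    unsigned int tap;
    for (tap = 0; tap < 3; tap++)
        {
            replicas[tap] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_samples, volk_gnsssdr_get_alignment());
        }
    volk_gnsssdr_16i_xn_resampler_16i_xn_generic(replicas, ca_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, 3, num_samples);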
+#endif + + +#ifdef LV_HAVE_AVX +static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_u_avx(int16_t* result, const int16_t* local_code, unsigned int num_points) +{ + int code_length_chips = 2046; + float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + int num_out_vectors = 3; + float rem_code_phase_chips = -0.234; + unsigned int n; + float shifts_chips[3] = { -0.1, 0.0, 0.1 }; + int16_t** result_aux = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); + + for(n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); + } + + volk_gnsssdr_16i_xn_resampler_16i_xn_u_avx(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); + + memcpy((int16_t*)result, (int16_t*)result_aux[0], sizeof(int16_t) * num_points); + + for(n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } + volk_gnsssdr_free(result_aux); +} + +#endif + + +#ifdef LV_HAVE_AVX +static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_a_avx(int16_t* result, const int16_t* local_code, unsigned int num_points) +{ + int code_length_chips = 2046; + float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + int num_out_vectors = 3; + float rem_code_phase_chips = -0.234; + unsigned int n; + float shifts_chips[3] = { -0.1, 0.0, 0.1 }; + int16_t** result_aux = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); + + for(n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); + } + + volk_gnsssdr_16i_xn_resampler_16i_xn_a_avx(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); + + memcpy((int16_t*)result, (int16_t*)result_aux[0], sizeof(int16_t) * num_points); + + for(n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } + volk_gnsssdr_free(result_aux); +} + +#endif + + +#ifdef LV_HAVE_NEON +static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_neon(int16_t* result, const int16_t* local_code, unsigned int num_points) +{ + int code_length_chips = 2046; + float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + int num_out_vectors = 3; + float rem_code_phase_chips = -0.234; + unsigned int n; + float shifts_chips[3] = { -0.1, 0.0, 0.1 }; + int16_t** result_aux = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); + + for(n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); + } + + volk_gnsssdr_16i_xn_resampler_16i_xn_neon(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); + + memcpy((int16_t*)result, (int16_t*)result_aux[0], sizeof(int16_t) * num_points); + + for(n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } + volk_gnsssdr_free(result_aux); +} + +#endif + +#endif // INCLUDED_volk_gnsssdr_16i_resamplerpuppet_16i_H + diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16i_xn_resampler_16i_xn.h 
b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16i_xn_resampler_16i_xn.h new file mode 100644 index 000000000..0d09df273 --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16i_xn_resampler_16i_xn.h @@ -0,0 +1,608 @@ +/*! + * \file volk_gnsssdr_16i_xn_resampler_16i_xn.h + * \brief VOLK_GNSSSDR kernel: Resamples N 16 bits integer short vectors using zero hold resample algorithm. + * \authors
<ul>
+ *          <li> Cillian O'Driscoll, 2017. cillian.odriscoll(at)gmail.com
+ *          </ul>
+ *
+ * + * VOLK_GNSSSDR kernel that resamples N 16 bits integer short complex vectors using zero hold resample algorithm. + * It resamples a single GNSS local code signal replica into N vectors fractional-resampled and fractional-delayed + * (i.e. it creates the Early, Prompt, and Late code replicas) + * + * ------------------------------------------------------------------------- + * + * Copyright (C) 2010-2017 (see AUTHORS file for a list of contributors) + * + * GNSS-SDR is a software defined Global Navigation + * Satellite Systems receiver + * + * This file is part of GNSS-SDR. + * + * GNSS-SDR is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * GNSS-SDR is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNSS-SDR. If not, see . + * + * ------------------------------------------------------------------------- + */ + +/*! + * \page volk_gnsssdr_16i_xn_resampler_16i_xn + * + * \b Overview + * + * Resamples a complex vector (16-bit integer each component), providing \p num_out_vectors outputs. + * + * Dispatcher Prototype + * \code + * void volk_gnsssdr_16i_xn_resampler_16i_xn(int16_t** result, const int16_t* local_code, float* rem_code_phase_chips, float code_phase_step_chips, float* shifts_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_points) + * \endcode + * + * \b Inputs + * \li local_code: Vector to be resampled. + * \li rem_code_phase_chips: Remnant code phase [chips]. + * \li code_phase_step_chips: Phase increment per sample [chips/sample]. + * \li shifts_chips: Vector of floats that defines the spacing (in chips) between the replicas of \p local_code + * \li code_length_chips: Code length in chips. + * \li num_out_vectors: Number of output vectors. + * \li num_points: The number of data values to be in the resampled vector. + * + * \b Outputs + * \li result: Pointer to a vector of pointers where the results will be stored. + * + */ + +#ifndef INCLUDED_volk_gnsssdr_16i_xn_resampler_16i_xn_H +#define INCLUDED_volk_gnsssdr_16i_xn_resampler_16i_xn_H + +#include +#include +#include +#include +#include +#include + + +#ifdef LV_HAVE_GENERIC + +static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_generic(int16_t** result, const int16_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, float* shifts_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_points) +{ + int local_code_chip_index; + int current_correlator_tap; + int n; + for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++) + { + for (n = 0; n < num_points; n++) + { + // resample code for current tap + local_code_chip_index = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); + //Take into account that in multitap correlators, the shifts can be negative! 
+ if (local_code_chip_index < 0) local_code_chip_index += (int)code_length_chips * (abs(local_code_chip_index) / code_length_chips + 1); + local_code_chip_index = local_code_chip_index % code_length_chips; + result[current_correlator_tap][n] = local_code[local_code_chip_index]; + } + } +} + +#endif /*LV_HAVE_GENERIC*/ + +#ifdef LV_HAVE_SSE4_1 +#include +static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_a_sse4_1(int16_t** result, const int16_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, float* shifts_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_points) +{ + int16_t** _result = result; + const unsigned int quarterPoints = num_points / 4; + int current_correlator_tap; + unsigned int n; + unsigned int k; + const __m128 fours = _mm_set1_ps(4.0f); + const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips); + const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips); + + __VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4]; + int local_code_chip_index_; + + const __m128i zeros = _mm_setzero_si128(); + const __m128 code_length_chips_reg_f = _mm_set_ps1((float)code_length_chips); + const __m128i code_length_chips_reg_i = _mm_set1_epi32((int)code_length_chips); + __m128i local_code_chip_index_reg, aux_i, negatives, i; + __m128 aux, aux2, shifts_chips_reg, c, cTrunc, base; + + for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++) + { + shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]); + aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg); + __m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f); + for(n = 0; n < quarterPoints; n++) + { + aux = _mm_mul_ps(code_phase_step_chips_reg, indexn); + aux = _mm_add_ps(aux, aux2); + // floor + aux = _mm_floor_ps(aux); + + // fmod + c = _mm_div_ps(aux, code_length_chips_reg_f); + i = _mm_cvttps_epi32(c); + cTrunc = _mm_cvtepi32_ps(i); + base = _mm_mul_ps(cTrunc, code_length_chips_reg_f); + local_code_chip_index_reg = _mm_cvtps_epi32(_mm_sub_ps(aux, base)); + + negatives = _mm_cmplt_epi32(local_code_chip_index_reg, zeros); + aux_i = _mm_and_si128(code_length_chips_reg_i, negatives); + local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i); + _mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg); + for(k = 0; k < 4; ++k) + { + _result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]]; + } + indexn = _mm_add_ps(indexn, fours); + } + for(n = quarterPoints * 4; n < num_points; n++) + { + // resample code for current tap + local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); + //Take into account that in multitap correlators, the shifts can be negative! 
+ if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); + local_code_chip_index_ = local_code_chip_index_ % code_length_chips; + _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; + } + } +} + +#endif + + +#ifdef LV_HAVE_SSE4_1 +#include +static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_u_sse4_1(int16_t** result, const int16_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, float* shifts_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_points) +{ + int16_t** _result = result; + const unsigned int quarterPoints = num_points / 4; + int current_correlator_tap; + unsigned int n; + unsigned int k; + const __m128 fours = _mm_set1_ps(4.0f); + const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips); + const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips); + + __VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4]; + int local_code_chip_index_; + + const __m128i zeros = _mm_setzero_si128(); + const __m128 code_length_chips_reg_f = _mm_set_ps1((float)code_length_chips); + const __m128i code_length_chips_reg_i = _mm_set1_epi32((int)code_length_chips); + __m128i local_code_chip_index_reg, aux_i, negatives, i; + __m128 aux, aux2, shifts_chips_reg, c, cTrunc, base; + + for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++) + { + shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]); + aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg); + __m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f); + for(n = 0; n < quarterPoints; n++) + { + aux = _mm_mul_ps(code_phase_step_chips_reg, indexn); + aux = _mm_add_ps(aux, aux2); + // floor + aux = _mm_floor_ps(aux); + + // fmod + c = _mm_div_ps(aux, code_length_chips_reg_f); + i = _mm_cvttps_epi32(c); + cTrunc = _mm_cvtepi32_ps(i); + base = _mm_mul_ps(cTrunc, code_length_chips_reg_f); + local_code_chip_index_reg = _mm_cvtps_epi32(_mm_sub_ps(aux, base)); + + negatives = _mm_cmplt_epi32(local_code_chip_index_reg, zeros); + aux_i = _mm_and_si128(code_length_chips_reg_i, negatives); + local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i); + _mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg); + for(k = 0; k < 4; ++k) + { + _result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]]; + } + indexn = _mm_add_ps(indexn, fours); + } + for(n = quarterPoints * 4; n < num_points; n++) + { + // resample code for current tap + local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); + //Take into account that in multitap correlators, the shifts can be negative! 
+ if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); + local_code_chip_index_ = local_code_chip_index_ % code_length_chips; + _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; + } + } +} + +#endif + + +#ifdef LV_HAVE_SSE3 +#include +static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_a_sse3(int16_t** result, const int16_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, float* shifts_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_points) +{ + int16_t** _result = result; + const unsigned int quarterPoints = num_points / 4; + int current_correlator_tap; + unsigned int n; + unsigned int k; + const __m128 ones = _mm_set1_ps(1.0f); + const __m128 fours = _mm_set1_ps(4.0f); + const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips); + const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips); + + __VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4]; + int local_code_chip_index_; + + const __m128i zeros = _mm_setzero_si128(); + const __m128 code_length_chips_reg_f = _mm_set_ps1((float)code_length_chips); + const __m128i code_length_chips_reg_i = _mm_set1_epi32((int)code_length_chips); + __m128i local_code_chip_index_reg, aux_i, negatives, i; + __m128 aux, aux2, shifts_chips_reg, fi, igx, j, c, cTrunc, base; + + for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++) + { + shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]); + aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg); + __m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f); + for(n = 0; n < quarterPoints; n++) + { + aux = _mm_mul_ps(code_phase_step_chips_reg, indexn); + aux = _mm_add_ps(aux, aux2); + // floor + i = _mm_cvttps_epi32(aux); + fi = _mm_cvtepi32_ps(i); + igx = _mm_cmpgt_ps(fi, aux); + j = _mm_and_ps(igx, ones); + aux = _mm_sub_ps(fi, j); + // fmod + c = _mm_div_ps(aux, code_length_chips_reg_f); + i = _mm_cvttps_epi32(c); + cTrunc = _mm_cvtepi32_ps(i); + base = _mm_mul_ps(cTrunc, code_length_chips_reg_f); + local_code_chip_index_reg = _mm_cvtps_epi32(_mm_sub_ps(aux, base)); + + negatives = _mm_cmplt_epi32(local_code_chip_index_reg, zeros); + aux_i = _mm_and_si128(code_length_chips_reg_i, negatives); + local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i); + _mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg); + for(k = 0; k < 4; ++k) + { + _result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]]; + } + indexn = _mm_add_ps(indexn, fours); + } + for(n = quarterPoints * 4; n < num_points; n++) + { + // resample code for current tap + local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); + //Take into account that in multitap correlators, the shifts can be negative! 
+ if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); + local_code_chip_index_ = local_code_chip_index_ % code_length_chips; + _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; + } + } +} + +#endif + + +#ifdef LV_HAVE_SSE3 +#include +static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_u_sse3(int16_t** result, const int16_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, float* shifts_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_points) +{ + int16_t** _result = result; + const unsigned int quarterPoints = num_points / 4; + int current_correlator_tap; + unsigned int n; + unsigned int k; + const __m128 ones = _mm_set1_ps(1.0f); + const __m128 fours = _mm_set1_ps(4.0f); + const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips); + const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips); + + __VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4]; + int local_code_chip_index_; + + const __m128i zeros = _mm_setzero_si128(); + const __m128 code_length_chips_reg_f = _mm_set_ps1((float)code_length_chips); + const __m128i code_length_chips_reg_i = _mm_set1_epi32((int)code_length_chips); + __m128i local_code_chip_index_reg, aux_i, negatives, i; + __m128 aux, aux2, shifts_chips_reg, fi, igx, j, c, cTrunc, base; + + for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++) + { + shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]); + aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg); + __m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f); + for(n = 0; n < quarterPoints; n++) + { + aux = _mm_mul_ps(code_phase_step_chips_reg, indexn); + aux = _mm_add_ps(aux, aux2); + // floor + i = _mm_cvttps_epi32(aux); + fi = _mm_cvtepi32_ps(i); + igx = _mm_cmpgt_ps(fi, aux); + j = _mm_and_ps(igx, ones); + aux = _mm_sub_ps(fi, j); + // fmod + c = _mm_div_ps(aux, code_length_chips_reg_f); + i = _mm_cvttps_epi32(c); + cTrunc = _mm_cvtepi32_ps(i); + base = _mm_mul_ps(cTrunc, code_length_chips_reg_f); + local_code_chip_index_reg = _mm_cvtps_epi32(_mm_sub_ps(aux, base)); + + negatives = _mm_cmplt_epi32(local_code_chip_index_reg, zeros); + aux_i = _mm_and_si128(code_length_chips_reg_i, negatives); + local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i); + _mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg); + for(k = 0; k < 4; ++k) + { + _result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]]; + } + indexn = _mm_add_ps(indexn, fours); + } + for(n = quarterPoints * 4; n < num_points; n++) + { + // resample code for current tap + local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); + //Take into account that in multitap correlators, the shifts can be negative! 
+ if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); + local_code_chip_index_ = local_code_chip_index_ % code_length_chips; + _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; + } + } +} + +#endif + + +#ifdef LV_HAVE_AVX +#include +static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_a_avx(int16_t** result, const int16_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, float* shifts_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_points) +{ + int16_t** _result = result; + const unsigned int avx_iters = num_points / 8; + int current_correlator_tap; + unsigned int n; + unsigned int k; + const __m256 eights = _mm256_set1_ps(8.0f); + const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips); + const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips); + + __VOLK_ATTR_ALIGNED(32) int local_code_chip_index[8]; + int local_code_chip_index_; + + const __m256 zeros = _mm256_setzero_ps(); + const __m256 code_length_chips_reg_f = _mm256_set1_ps((float)code_length_chips); + const __m256 n0 = _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f); + + __m256i local_code_chip_index_reg, i; + __m256 aux, aux2, aux3, shifts_chips_reg, c, cTrunc, base, negatives, indexn; + + for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++) + { + shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]); + aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg); + indexn = n0; + for(n = 0; n < avx_iters; n++) + { + __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0); + __VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3); + aux = _mm256_mul_ps(code_phase_step_chips_reg, indexn); + aux = _mm256_add_ps(aux, aux2); + // floor + aux = _mm256_floor_ps(aux); + + // fmod + c = _mm256_div_ps(aux, code_length_chips_reg_f); + i = _mm256_cvttps_epi32(c); + cTrunc = _mm256_cvtepi32_ps(i); + base = _mm256_mul_ps(cTrunc, code_length_chips_reg_f); + local_code_chip_index_reg = _mm256_cvttps_epi32(_mm256_sub_ps(aux, base)); + + // no negatives + c = _mm256_cvtepi32_ps(local_code_chip_index_reg); + negatives = _mm256_cmp_ps(c, zeros, 0x01 ); + aux3 = _mm256_and_ps(code_length_chips_reg_f, negatives); + aux = _mm256_add_ps(c, aux3); + local_code_chip_index_reg = _mm256_cvttps_epi32(aux); + + _mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg); + for(k = 0; k < 8; ++k) + { + _result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]]; + } + indexn = _mm256_add_ps(indexn, eights); + } + } + _mm256_zeroupper(); + for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++) + { + for(n = avx_iters * 8; n < num_points; n++) + { + // resample code for current tap + local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); + //Take into account that in multitap correlators, the shifts can be negative! 
+ if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); + local_code_chip_index_ = local_code_chip_index_ % code_length_chips; + _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; + } + } +} + +#endif + + +#ifdef LV_HAVE_AVX +#include +static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_u_avx(int16_t** result, const int16_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, float* shifts_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_points) +{ + int16_t** _result = result; + const unsigned int avx_iters = num_points / 8; + int current_correlator_tap; + unsigned int n; + unsigned int k; + const __m256 eights = _mm256_set1_ps(8.0f); + const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips); + const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips); + + __VOLK_ATTR_ALIGNED(32) int local_code_chip_index[8]; + int local_code_chip_index_; + + const __m256 zeros = _mm256_setzero_ps(); + const __m256 code_length_chips_reg_f = _mm256_set1_ps((float)code_length_chips); + const __m256 n0 = _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f); + + __m256i local_code_chip_index_reg, i; + __m256 aux, aux2, aux3, shifts_chips_reg, c, cTrunc, base, negatives, indexn; + + for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++) + { + shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]); + aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg); + indexn = n0; + for(n = 0; n < avx_iters; n++) + { + __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0); + __VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3); + aux = _mm256_mul_ps(code_phase_step_chips_reg, indexn); + aux = _mm256_add_ps(aux, aux2); + // floor + aux = _mm256_floor_ps(aux); + + // fmod + c = _mm256_div_ps(aux, code_length_chips_reg_f); + i = _mm256_cvttps_epi32(c); + cTrunc = _mm256_cvtepi32_ps(i); + base = _mm256_mul_ps(cTrunc, code_length_chips_reg_f); + local_code_chip_index_reg = _mm256_cvttps_epi32(_mm256_sub_ps(aux, base)); + + // no negatives + c = _mm256_cvtepi32_ps(local_code_chip_index_reg); + negatives = _mm256_cmp_ps(c, zeros, 0x01 ); + aux3 = _mm256_and_ps(code_length_chips_reg_f, negatives); + aux = _mm256_add_ps(c, aux3); + local_code_chip_index_reg = _mm256_cvttps_epi32(aux); + + _mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg); + for(k = 0; k < 8; ++k) + { + _result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]]; + } + indexn = _mm256_add_ps(indexn, eights); + } + } + _mm256_zeroupper(); + for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++) + { + for(n = avx_iters * 8; n < num_points; n++) + { + // resample code for current tap + local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); + //Take into account that in multitap correlators, the shifts can be negative! 
+ if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); + local_code_chip_index_ = local_code_chip_index_ % code_length_chips; + _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; + } + } +} + +#endif + + +#ifdef LV_HAVE_NEON +#include +static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_neon(int16_t** result, const int16_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, float* shifts_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_points) +{ + int16_t** _result = result; + const unsigned int neon_iters = num_points / 4; + const int32x4_t ones = vdupq_n_s32(1); + const float32x4_t fours = vdupq_n_f32(4.0f); + const float32x4_t rem_code_phase_chips_reg = vdupq_n_f32(rem_code_phase_chips); + const float32x4_t code_phase_step_chips_reg = vdupq_n_f32(code_phase_step_chips); + + __VOLK_ATTR_ALIGNED(16) int32_t local_code_chip_index[4]; + int32_t local_code_chip_index_; + + const int32x4_t zeros = vdupq_n_s32(0); + const float32x4_t code_length_chips_reg_f = vdupq_n_f32((float)code_length_chips); + const int32x4_t code_length_chips_reg_i = vdupq_n_s32((int32_t)code_length_chips); + int32x4_t local_code_chip_index_reg, aux_i, negatives, i; + float32x4_t aux, aux2, shifts_chips_reg, fi, c, j, cTrunc, base, indexn, reciprocal; + __VOLK_ATTR_ALIGNED(16) const float vec[4] = { 0.0f, 1.0f, 2.0f, 3.0f }; + uint32x4_t igx; + reciprocal = vrecpeq_f32(code_length_chips_reg_f); + reciprocal = vmulq_f32(vrecpsq_f32(code_length_chips_reg_f, reciprocal), reciprocal); + reciprocal = vmulq_f32(vrecpsq_f32(code_length_chips_reg_f, reciprocal), reciprocal); // this refinement is required! + float32x4_t n0 = vld1q_f32((float*)vec); + int current_correlator_tap; + unsigned int n; + unsigned int k; + for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++) + { + shifts_chips_reg = vdupq_n_f32((float)shifts_chips[current_correlator_tap]); + aux2 = vsubq_f32(shifts_chips_reg, rem_code_phase_chips_reg); + indexn = n0; + for(n = 0; n < neon_iters; n++) + { + __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][4 * n + 3], 1, 0); + __VOLK_GNSSSDR_PREFETCH(&local_code_chip_index[4]); + aux = vmulq_f32(code_phase_step_chips_reg, indexn); + aux = vaddq_f32(aux, aux2); + + //floor + i = vcvtq_s32_f32(aux); + fi = vcvtq_f32_s32(i); + igx = vcgtq_f32(fi, aux); + j = vcvtq_f32_s32(vandq_s32(vreinterpretq_s32_u32(igx), ones)); + aux = vsubq_f32(fi, j); + + // fmod + c = vmulq_f32(aux, reciprocal); + i = vcvtq_s32_f32(c); + cTrunc = vcvtq_f32_s32(i); + base = vmulq_f32(cTrunc, code_length_chips_reg_f); + aux = vsubq_f32(aux, base); + local_code_chip_index_reg = vcvtq_s32_f32(aux); + + negatives = vreinterpretq_s32_u32(vcltq_s32(local_code_chip_index_reg, zeros)); + aux_i = vandq_s32(code_length_chips_reg_i, negatives); + local_code_chip_index_reg = vaddq_s32(local_code_chip_index_reg, aux_i); + + vst1q_s32((int32_t*)local_code_chip_index, local_code_chip_index_reg); + + for(k = 0; k < 4; ++k) + { + _result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]]; + } + indexn = vaddq_f32(indexn, fours); + } + for(n = neon_iters * 4; n < num_points; n++) + { + __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][n], 1, 0); + // resample code for current tap + local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - 
rem_code_phase_chips); + //Take into account that in multitap correlators, the shifts can be negative! + if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); + local_code_chip_index_ = local_code_chip_index_ % code_length_chips; + _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; + } + } +} + + +#endif + + +#endif /*INCLUDED_volk_gnsssdr_16i_xn_resampler_16i_xn_H*/ + diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn.h new file mode 100644 index 000000000..230401ccb --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn.h @@ -0,0 +1,1600 @@ +/*! + * \file volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn.h + * \brief VOLK_GNSSSDR kernel: multiplies N 16 bits vectors by a common vector + * phase rotated and accumulates the results in N 16 bits short complex outputs. + * \authors
<ul>
+ *          <li> Javier Arribas, 2015. jarribas(at)cttc.es
+ *          </ul>
+ *
+ * + * VOLK_GNSSSDR kernel that multiplies N 16 bits vectors by a common vector, which is + * phase-rotated by phase offset and phase increment, and accumulates the results + * in N 16 bits short complex outputs. + * It is optimized to perform the N tap correlation process in GNSS receivers. + * + * ------------------------------------------------------------------------- + * + * Copyright (C) 2010-2015 (see AUTHORS file for a list of contributors) + * + * GNSS-SDR is a software defined Global Navigation + * Satellite Systems receiver + * + * This file is part of GNSS-SDR. + * + * GNSS-SDR is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * GNSS-SDR is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNSS-SDR. If not, see . + * + * ------------------------------------------------------------------------- + */ + +/*! + * \page volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn + * + * \b Overview + * + * Rotates and multiplies the reference complex vector with an arbitrary number of other complex vectors, + * accumulates the results and stores them in the output vector. + * The rotation is done at a fixed rate per sample, from an initial \p phase offset. + * This function can be used for Doppler wipe-off and multiple correlator. + * + * Dispatcher Prototype + * \code + * void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn((lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const int16_t** in_a, int num_a_vectors, unsigned int num_points); + * \endcode + * + * \b Inputs + * \li in_common: Pointer to one of the vectors to be rotated, multiplied and accumulated (reference vector). + * \li phase_inc: Phase increment = lv_cmake(cos(phase_step_rad), sin(phase_step_rad)) + * \li phase: Initial phase = lv_cmake(cos(initial_phase_rad), sin(initial_phase_rad)) + * \li in_a: Pointer to an array of pointers to multiple vectors to be multiplied and accumulated. + * \li num_a_vectors: Number of vectors to be multiplied by the reference vector and accumulated. + * \li num_points: Number of complex values to be multiplied together, accumulated and stored into \p result. + * + * \b Outputs + * \li phase: Final phase. + * \li result: Vector of \p num_a_vectors components with the multiple vectors of \p in_a rotated, multiplied by \p in_common and accumulated. 
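+ *
+ * A minimal usage sketch for a three-tap (Early/Prompt/Late) correlation; names such as
+ * \p baseband_samples and \p early_code below are illustrative placeholders, not part of the API:
+ * \code
+ * lv_16sc_t corr_out[3];
+ * lv_32fc_t phase = lv_cmake(1.0f, 0.0f);
+ * const lv_32fc_t phase_inc = lv_cmake(cosf(phase_step_rad), sinf(phase_step_rad));
+ * const int16_t* replicas[3] = { early_code, prompt_code, late_code };
+ * volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn(corr_out, baseband_samples, phase_inc, &phase, replicas, 3, num_samples);
+ * \endcode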
+ * + */ + +#ifndef INCLUDED_volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_H +#define INCLUDED_volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_H + + +#include +#include +#include +#include +#include +//#include + +#ifdef LV_HAVE_GENERIC + +static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_generic(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const int16_t** in_a, int num_a_vectors, unsigned int num_points) +{ + lv_16sc_t tmp16; + lv_32fc_t tmp32; + int n_vec; + unsigned int n; + for (n_vec = 0; n_vec < num_a_vectors; n_vec++) + { + result[n_vec] = lv_cmake(0,0); + } + for (n = 0; n < num_points; n++) + { + tmp16 = *in_common++; //if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); + tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase); + tmp16 = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32))); + + // Regenerate phase + if (n % 256 == 0) + { + //printf("Phase before regeneration %i: %f,%f Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase)); +#ifdef __cplusplus + (*phase) /= std::abs((*phase)); +#else + (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); +#endif + //printf("Phase after regeneration %i: %f,%f Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase)); + } + + (*phase) *= phase_inc; + for (n_vec = 0; n_vec < num_a_vectors; n_vec++) + { + lv_16sc_t tmp = tmp16 * in_a[n_vec][n]; + //lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(tmp16), lv_cimag(in_a[n_vec][n]))) , sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(tmp16), lv_creal(in_a[n_vec][n])))); + result[n_vec] = lv_cmake(sat_adds16i(lv_creal(result[n_vec]), lv_creal(tmp)), sat_adds16i(lv_cimag(result[n_vec]), lv_cimag(tmp))); + } + } +} + +#endif /*LV_HAVE_GENERIC*/ + + +#ifdef LV_HAVE_GENERIC + +static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_generic_reload(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const int16_t** in_a, int num_a_vectors, unsigned int num_points) +{ + lv_16sc_t tmp16; + lv_32fc_t tmp32; + int n_vec; + unsigned int n; + unsigned int j; + const unsigned int ROTATOR_RELOAD = 256; + for (n_vec = 0; n_vec < num_a_vectors; n_vec++) + { + result[n_vec] = lv_cmake(0,0); + } + + for (n = 0; n < num_points / ROTATOR_RELOAD; n++) + { + for (j = 0; j < ROTATOR_RELOAD; j++) + { + tmp16 = *in_common++; //if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); + tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase); + tmp16 = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32))); + (*phase) *= phase_inc; + for (n_vec = 0; n_vec < num_a_vectors; n_vec++) + { + lv_16sc_t tmp = tmp16 * in_a[n_vec][n * ROTATOR_RELOAD + j]; + //lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(tmp16), lv_cimag(in_a[n_vec][n]))) , sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(tmp16), lv_creal(in_a[n_vec][n])))); + result[n_vec] = lv_cmake(sat_adds16i(lv_creal(result[n_vec]), lv_creal(tmp)), sat_adds16i(lv_cimag(result[n_vec]), lv_cimag(tmp))); + } + } + /* Regenerate phase */ +#ifdef __cplusplus + (*phase) /= std::abs((*phase)); +#else + //(*phase) /= cabsf((*phase)); + (*phase) /= 
hypotf(lv_creal(*phase), lv_cimag(*phase)); +#endif + } + + for (j = 0; j < num_points % ROTATOR_RELOAD; j++) + { + tmp16 = *in_common++; //if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); + tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase); + tmp16 = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32))); + (*phase) *= phase_inc; + for (n_vec = 0; n_vec < num_a_vectors; n_vec++) + { + lv_16sc_t tmp = tmp16 * in_a[n_vec][ (num_points / ROTATOR_RELOAD) * ROTATOR_RELOAD + j ]; + //lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(tmp16), lv_cimag(in_a[n_vec][n]))) , sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(tmp16), lv_creal(in_a[n_vec][n])))); + result[n_vec] = lv_cmake(sat_adds16i(lv_creal(result[n_vec]), lv_creal(tmp)), sat_adds16i(lv_cimag(result[n_vec]), lv_cimag(tmp))); + } + } +} + +#endif /*LV_HAVE_GENERIC*/ + + +#ifdef LV_HAVE_SSE3 +#include + +static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const int16_t** in_a, int num_a_vectors, unsigned int num_points) +{ + lv_16sc_t dotProduct = lv_cmake(0,0); + + const unsigned int sse_iters = num_points / 4; + int n_vec; + int i; + unsigned int number; + unsigned int n; + const int16_t** _in_a = in_a; + const lv_16sc_t* _in_common = in_common; + lv_16sc_t* _out = result; + + __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; + + __m128i* cacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment()); + + for (n_vec = 0; n_vec < num_a_vectors; n_vec++) + { + cacc[n_vec] = _mm_setzero_si128(); + } + + __m128i a, b, c; + + // phase rotation registers + __m128 pa, pb, two_phase_acc_reg, two_phase_inc_reg; + __m128i pc1, pc2; + __VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_inc[2]; + two_phase_inc[0] = phase_inc * phase_inc; + two_phase_inc[1] = phase_inc * phase_inc; + two_phase_inc_reg = _mm_load_ps((float*) two_phase_inc); + __VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_acc[2]; + two_phase_acc[0] = (*phase); + two_phase_acc[1] = (*phase) * phase_inc; + two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc); + __m128 yl, yh, tmp1, tmp2, tmp3; + lv_16sc_t tmp16; + lv_32fc_t tmp32; + + for(number = 0; number < sse_iters; number++) + { + // Phase rotation on operand in_common starts here: + //printf("generic phase %i: %f,%f\n", n*4,lv_creal(*phase),lv_cimag(*phase)); + pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + //complex 32fc multiplication b=a*two_phase_acc_reg + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + pc1 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic + + //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = 
_mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + //next two samples + _in_common += 2; + pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + __VOLK_GNSSSDR_PREFETCH(_in_common + 8); + //complex 32fc multiplication b=a*two_phase_acc_reg + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + pc2 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic + + //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + // store four rotated in_common samples in the register b + b = _mm_packs_epi32(pc1, pc2);// convert from 32ic to 16ic + + //next two samples + _in_common += 2; + + for (n_vec = 0; n_vec < num_a_vectors; n_vec++) + { + a = _mm_loadl_epi64((__m128i*)&(_in_a[n_vec][number*4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg + + a = _mm_unpacklo_epi16( a, a ); + + c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... 
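+ // Note: _mm_unpacklo_epi16(a, a) above duplicated each real-valued 16-bit code chip into
+ // adjacent lanes, so this single multiply scales both the real and the imaginary part of the
+ // four rotated samples held in b by the corresponding real code value (no complex product is needed).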
+ + cacc[n_vec] = _mm_adds_epi16(cacc[n_vec], c); + } + // Regenerate phase + if ((number % 128) == 0) + { + tmp1 = _mm_mul_ps(two_phase_acc_reg, two_phase_acc_reg); + tmp2 = _mm_hadd_ps(tmp1, tmp1); + tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8); + tmp2 = _mm_sqrt_ps(tmp1); + two_phase_acc_reg = _mm_div_ps(two_phase_acc_reg, tmp2); + } + } + + for (n_vec = 0; n_vec < num_a_vectors; n_vec++) + { + + a = cacc[n_vec]; + _mm_store_si128((__m128i*)dotProductVector, a); // Store the results back into the dot product vector + dotProduct = lv_cmake(0,0); + for (i = 0; i < 4; ++i) + { + dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[i])), + sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i]))); + } + _out[n_vec] = dotProduct; + } + volk_gnsssdr_free(cacc); + + tmp1 = _mm_mul_ps(two_phase_acc_reg, two_phase_acc_reg); + tmp2 = _mm_hadd_ps(tmp1, tmp1); + tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8); + tmp2 = _mm_sqrt_ps(tmp1); + two_phase_acc_reg = _mm_div_ps(two_phase_acc_reg, tmp2); + + _mm_store_ps((float*)two_phase_acc, two_phase_acc_reg); + //(*phase) = lv_cmake((float*)two_phase_acc[0], (float*)two_phase_acc[1]); + (*phase) = two_phase_acc[0]; + + for(n = sse_iters * 4; n < num_points; n++) + { + tmp16 = in_common[n]; //printf("a_sse phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); + tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase); + tmp16 = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32))); + (*phase) *= phase_inc; + + for (n_vec = 0; n_vec < num_a_vectors; n_vec++) + { + lv_16sc_t tmp = tmp16 * in_a[n_vec][n]; + //lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(tmp16), lv_cimag(in_a[n_vec][n]))) , sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(tmp16), lv_creal(in_a[n_vec][n])))); + _out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), + sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); + } + } +} +#endif /* LV_HAVE_SSE3 */ + + +//#ifdef LV_HAVE_SSE3 +//#include + +//static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_sse3_reload(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const int16_t** in_a, int num_a_vectors, unsigned int num_points) +//{ + //lv_16sc_t dotProduct = lv_cmake(0,0); + + //const unsigned int sse_iters = num_points / 4; + //const unsigned int ROTATOR_RELOAD = 128; + //int n_vec; + //int i; + //unsigned int number; + //unsigned int j; + //unsigned int n; + + //const int16_t** _in_a = in_a; + //const lv_16sc_t* _in_common = in_common; + //lv_16sc_t* _out = result; + + //__VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; + + //__m128i* realcacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment()); + //__m128i* imagcacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment()); + + //for (n_vec = 0; n_vec < num_a_vectors; n_vec++) + //{ + //realcacc[n_vec] = _mm_setzero_si128(); + //imagcacc[n_vec] = _mm_setzero_si128(); + //} + + //__m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl; + + //mask_imag = _mm_set_epi8(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0); + //mask_real = _mm_set_epi8(0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255); + + //// phase rotation registers + //__m128 pa, pb, two_phase_acc_reg, two_phase_inc_reg; + //__m128i pc1, pc2; 
+ //__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_inc[2]; + //two_phase_inc[0] = phase_inc * phase_inc; + //two_phase_inc[1] = phase_inc * phase_inc; + //two_phase_inc_reg = _mm_load_ps((float*) two_phase_inc); + //__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_acc[2]; + //two_phase_acc[0] = (*phase); + //two_phase_acc[1] = (*phase) * phase_inc; + //two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc); + //__m128 yl, yh, tmp1, tmp2, tmp3; + //lv_16sc_t tmp16; + //lv_32fc_t tmp32; + + //for (number = 0; number < sse_iters / ROTATOR_RELOAD; ++number) + //{ + //for (j = 0; j < ROTATOR_RELOAD; j++) + //{ + //// Phase rotation on operand in_common starts here: + ////printf("generic phase %i: %f,%f\n", n*4,lv_creal(*phase),lv_cimag(*phase)); + //pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + ////complex 32fc multiplication b=a*two_phase_acc_reg + //yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + //yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + //tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + //pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br + //tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + //pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + //pc1 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic + + ////complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg + //yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + //yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + //tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + //tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + //tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + //two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + ////next two samples + //_in_common += 2; + //pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + //__VOLK_GNSSSDR_PREFETCH(_in_common + 8); + ////complex 32fc multiplication b=a*two_phase_acc_reg + //yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + //yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + //tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + //pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br + //tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + //pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + //pc2 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic + + ////complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg + //yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + //yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + //tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + //tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + //tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + //two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, 
bi*dr+br*di + + //// store four rotated in_common samples in the register b + //b = _mm_packs_epi32(pc1, pc2);// convert from 32ic to 16ic + + ////next two samples + //_in_common += 2; + + //for (n_vec = 0; n_vec < num_a_vectors; n_vec++) + //{ + //a = _mm_load_si128((__m128i*)&(_in_a[n_vec][(number * ROTATOR_RELOAD + j) * 4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg + + //c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... + + //c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. + //real = _mm_subs_epi16(c, c_sr); + + //b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i .... + //a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i .... + + //imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... + //imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... + + //imag = _mm_adds_epi16(imag1, imag2); + + //realcacc[n_vec] = _mm_adds_epi16(realcacc[n_vec], real); + //imagcacc[n_vec] = _mm_adds_epi16(imagcacc[n_vec], imag); + //} + //} + //// regenerate phase + //tmp1 = _mm_mul_ps(two_phase_acc_reg, two_phase_acc_reg); + //tmp2 = _mm_hadd_ps(tmp1, tmp1); + //tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8); + //tmp2 = _mm_sqrt_ps(tmp1); + //two_phase_acc_reg = _mm_div_ps(two_phase_acc_reg, tmp2); + //} + + //for (j = 0; j < sse_iters % ROTATOR_RELOAD; j++) + //{ + //pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + ////complex 32fc multiplication b=a*two_phase_acc_reg + //yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + //yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + //tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + //pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br + //tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + //pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + //pc1 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic + + ////complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg + //yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + //yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + //tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + //tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + //tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + //two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + ////next two samples + //_in_common += 2; + //pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + //__VOLK_GNSSSDR_PREFETCH(_in_common + 8); + ////complex 32fc multiplication b=a*two_phase_acc_reg + //yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + //yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + //tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + //pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br + //tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + //pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + //pc2 = _mm_cvtps_epi32(pb); // convert from 32fc to 
32ic + + ////complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg + //yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + //yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + //tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + //tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + //tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + //two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + //// store four rotated in_common samples in the register b + //b = _mm_packs_epi32(pc1, pc2);// convert from 32ic to 16ic + + ////next two samples + //_in_common += 2; + + //for (n_vec = 0; n_vec < num_a_vectors; n_vec++) + //{ + //a = _mm_load_si128((__m128i*)&(_in_a[n_vec][((sse_iters / ROTATOR_RELOAD) * ROTATOR_RELOAD + j) * 4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg + + //c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... + + //c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. + //real = _mm_subs_epi16(c, c_sr); + + //b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i .... + //a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i .... + + //imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... + //imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... + + //imag = _mm_adds_epi16(imag1, imag2); + + //realcacc[n_vec] = _mm_adds_epi16(realcacc[n_vec], real); + //imagcacc[n_vec] = _mm_adds_epi16(imagcacc[n_vec], imag); + //} + //} + + //for (n_vec = 0; n_vec < num_a_vectors; n_vec++) + //{ + //realcacc[n_vec] = _mm_and_si128(realcacc[n_vec], mask_real); + //imagcacc[n_vec] = _mm_and_si128(imagcacc[n_vec], mask_imag); + + //a = _mm_or_si128(realcacc[n_vec], imagcacc[n_vec]); + + //_mm_store_si128((__m128i*)dotProductVector, a); // Store the results back into the dot product vector + //dotProduct = lv_cmake(0,0); + //for (i = 0; i < 4; ++i) + //{ + //dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[i])), + //sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i]))); + //} + //_out[n_vec] = dotProduct; + //} + + //volk_gnsssdr_free(realcacc); + //volk_gnsssdr_free(imagcacc); + + //tmp1 = _mm_mul_ps(two_phase_acc_reg, two_phase_acc_reg); + //tmp2 = _mm_hadd_ps(tmp1, tmp1); + //tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8); + //tmp2 = _mm_sqrt_ps(tmp1); + //two_phase_acc_reg = _mm_div_ps(two_phase_acc_reg, tmp2); + + //_mm_store_ps((float*)two_phase_acc, two_phase_acc_reg); + ////(*phase) = lv_cmake((float*)two_phase_acc[0], (float*)two_phase_acc[1]); + //(*phase) = two_phase_acc[0]; + + //for(n = sse_iters * 4; n < num_points; n++) + //{ + //tmp16 = in_common[n]; //printf("a_sse phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); + //tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase); + //tmp16 = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32))); + //(*phase) *= phase_inc; + + //for (n_vec = 0; n_vec < num_a_vectors; n_vec++) + //{ + //lv_16sc_t tmp = tmp16 * in_a[n_vec][n]; + ////lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(tmp16), lv_cimag(in_a[n_vec][n]))) , sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(tmp16), lv_creal(in_a[n_vec][n])))); + //_out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), + 
//sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); + //} + //} + +//} +//#endif [> LV_HAVE_SSE3 <] + + +#ifdef LV_HAVE_SSE3 +#include + +static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const int16_t** in_a, int num_a_vectors, unsigned int num_points) +{ + lv_16sc_t dotProduct = lv_cmake(0,0); + + const unsigned int sse_iters = num_points / 4; + int n_vec; + int i; + unsigned int number; + unsigned int n; + const int16_t** _in_a = in_a; + const lv_16sc_t* _in_common = in_common; + lv_16sc_t* _out = result; + + __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; + + __m128i* cacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment()); + + for (n_vec = 0; n_vec < num_a_vectors; n_vec++) + { + cacc[n_vec] = _mm_setzero_si128(); + } + + __m128i a, b, c; + + // phase rotation registers + __m128 pa, pb, two_phase_acc_reg, two_phase_inc_reg; + __m128i pc1, pc2; + __VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_inc[2]; + two_phase_inc[0] = phase_inc * phase_inc; + two_phase_inc[1] = phase_inc * phase_inc; + two_phase_inc_reg = _mm_load_ps((float*) two_phase_inc); + __VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_acc[2]; + two_phase_acc[0] = (*phase); + two_phase_acc[1] = (*phase) * phase_inc; + two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc); + __m128 yl, yh, tmp1, tmp2, tmp3; + lv_16sc_t tmp16; + lv_32fc_t tmp32; + + for(number = 0; number < sse_iters; number++) + { + // Phase rotation on operand in_common starts here: + //printf("generic phase %i: %f,%f\n", n*4,lv_creal(*phase),lv_cimag(*phase)); + pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + //complex 32fc multiplication b=a*two_phase_acc_reg + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + pc1 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic + + //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + //next two samples + _in_common += 2; + pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + __VOLK_GNSSSDR_PREFETCH(_in_common + 8); + //complex 32fc multiplication b=a*two_phase_acc_reg + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(pa, yl); // tmp1 = 
ar*cr,ai*cr,br*dr,bi*dr + pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + pc2 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic + + //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + // store four rotated in_common samples in the register b + b = _mm_packs_epi32(pc1, pc2);// convert from 32ic to 16ic + + //next two samples + _in_common += 2; + + for (n_vec = 0; n_vec < num_a_vectors; n_vec++) + { + a = _mm_loadl_epi64((__m128i*)&(_in_a[n_vec][number*4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg + + a = _mm_unpacklo_epi16( a, a ); + + c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... + + cacc[n_vec] = _mm_adds_epi16(cacc[n_vec], c); + } + // Regenerate phase + if ((number % 128) == 0) + { + tmp1 = _mm_mul_ps(two_phase_acc_reg, two_phase_acc_reg); + tmp2 = _mm_hadd_ps(tmp1, tmp1); + tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8); + tmp2 = _mm_sqrt_ps(tmp1); + two_phase_acc_reg = _mm_div_ps(two_phase_acc_reg, tmp2); + } + } + + for (n_vec = 0; n_vec < num_a_vectors; n_vec++) + { + + a = cacc[n_vec]; + _mm_store_si128((__m128i*)dotProductVector, a); // Store the results back into the dot product vector + dotProduct = lv_cmake(0,0); + for (i = 0; i < 4; ++i) + { + dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[i])), + sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i]))); + } + _out[n_vec] = dotProduct; + } + volk_gnsssdr_free(cacc); + + tmp1 = _mm_mul_ps(two_phase_acc_reg, two_phase_acc_reg); + tmp2 = _mm_hadd_ps(tmp1, tmp1); + tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8); + tmp2 = _mm_sqrt_ps(tmp1); + two_phase_acc_reg = _mm_div_ps(two_phase_acc_reg, tmp2); + + _mm_store_ps((float*)two_phase_acc, two_phase_acc_reg); + //(*phase) = lv_cmake((float*)two_phase_acc[0], (float*)two_phase_acc[1]); + (*phase) = two_phase_acc[0]; + + for(n = sse_iters * 4; n < num_points; n++) + { + tmp16 = in_common[n]; //printf("a_sse phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); + tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase); + tmp16 = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32))); + (*phase) *= phase_inc; + + for (n_vec = 0; n_vec < num_a_vectors; n_vec++) + { + lv_16sc_t tmp = tmp16 * in_a[n_vec][n]; + //lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(tmp16), lv_cimag(in_a[n_vec][n]))) , sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(tmp16), lv_creal(in_a[n_vec][n])))); + _out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), + sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); + } + } +} +#endif /* LV_HAVE_SSE3 */ + + +#ifdef LV_HAVE_AVX2 +#include +#include +#include + +static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc_t* result, 
const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const int16_t** in_a, int num_a_vectors, unsigned int num_points) +{ + const unsigned int avx2_iters = num_points / 8; + const int16_t** _in_a = in_a; + const lv_16sc_t* _in_common = in_common; + lv_16sc_t* _out = result; + int n_vec; + unsigned int number; + unsigned int n; + + lv_16sc_t tmp16; + lv_32fc_t tmp32; + + __VOLK_ATTR_ALIGNED(32) lv_16sc_t dotProductVector[8]; + lv_16sc_t dotProduct = lv_cmake(0,0); + + __m256i* cacc = (__m256i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256i), volk_gnsssdr_get_alignment()); + + for (n_vec = 0; n_vec < num_a_vectors; n_vec++) + { + cacc[n_vec] = _mm256_setzero_si256(); + } + + __m128i a128, ain_128, ain_128_lo, ain_128_hi; + __m256i ai; + __m256 a, b; + + __m256 four_phase_acc_reg, four_phase_inc_reg; + + lv_32fc_t _phase_inc = phase_inc*phase_inc*phase_inc*phase_inc; + + // Normalise the 4*phase increment +#ifdef __cplusplus + _phase_inc /= std::abs(_phase_inc); +#else + _phase_inc /= hypotf(lv_creal(_phase_inc), lv_cimag(_phase_inc)); +#endif + + __VOLK_ATTR_ALIGNED(32) lv_32fc_t four_phase_inc[4]; + __VOLK_ATTR_ALIGNED(32) lv_32fc_t four_phase_acc[4]; + for( n = 0; n < 4; ++n ) + { + four_phase_inc[n] = _phase_inc; + four_phase_acc[n] = *phase; + *phase *= phase_inc; + } + four_phase_acc_reg = _mm256_load_ps((float*) four_phase_acc); + four_phase_inc_reg = _mm256_load_ps((float*) four_phase_inc); + + __m256i a2, b2, c, c1, c2, perm_idx; + + perm_idx = _mm256_set_epi32( 7, 6, 3, 2, 5, 4, 1, 0); + //perm_idx = _mm256_set_epi32( 0, 1, 4, 5, 2, 3, 6, 7); + + for(number = 0; number < avx2_iters; number++) + { + a128 = _mm_load_si128( (__m128i *)_in_common ); + ai = _mm256_cvtepi16_epi32( a128 ); + a = _mm256_cvtepi32_ps( ai ); + + //complex 32fc multiplication b=a*two_phase_acc_reg + b = _mm256_complexmul_ps( a, four_phase_acc_reg ); + c1 = _mm256_cvtps_epi32(b); // convert from 32fc to 32ic + + //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg + four_phase_acc_reg = _mm256_complexmul_ps( four_phase_inc_reg, four_phase_acc_reg ); + + //next four samples + _in_common += 4; + a128 = _mm_load_si128( (__m128i *)_in_common ); + ai = _mm256_cvtepi16_epi32( a128 ); + a = _mm256_cvtepi32_ps( ai ); + + //complex 32fc multiplication b=a*two_phase_acc_reg + b = _mm256_complexmul_ps( a, four_phase_acc_reg ); + c2 = _mm256_cvtps_epi32(b); // convert from 32fc to 32ic + + //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg + four_phase_acc_reg = _mm256_complexmul_ps( four_phase_inc_reg, four_phase_acc_reg ); + + __VOLK_GNSSSDR_PREFETCH(_in_common + 16); + + + // Store and convert 32ic to 16ic: + b2 = _mm256_packs_epi32( c1, c2 ); + + b2 = _mm256_permutevar8x32_epi32( b2, perm_idx ); + + + _in_common += 4; + for (n_vec = 0; n_vec < num_a_vectors; n_vec++) + { + ain_128 = _mm_load_si128((__m128i*)&(_in_a[n_vec][number * 8])); + + ain_128_lo = _mm_unpacklo_epi16( ain_128, ain_128 ); + ain_128_hi = _mm_unpackhi_epi16( ain_128, ain_128 ); + + a2 = _mm256_insertf128_si256( _mm256_castsi128_si256(ain_128_lo), ain_128_hi, 1); + + c = _mm256_mullo_epi16(a2, b2); + + cacc[n_vec] = _mm256_adds_epi16(cacc[n_vec], c); + } + // Regenerate phase + if ((number % 128) == 0) + { + four_phase_acc_reg = _mm256_complexnormalise_ps(four_phase_acc_reg); + } + } + + for (n_vec = 0; n_vec < num_a_vectors; n_vec++) + { + a2 = cacc[n_vec]; + + _mm256_store_si256((__m256i*)dotProductVector, a2); // Store the results back into the dot product vector 
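+ // Reduce the eight 16-bit complex partial sums with saturating adds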
+ dotProduct = lv_cmake(0,0); + for (number = 0; number < 8; ++number) + { + dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])), + sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number]))); + } + _out[n_vec] = dotProduct; + } + + volk_gnsssdr_free(cacc); + _mm256_zeroupper(); + + _mm256_store_ps((float*)four_phase_acc, four_phase_acc_reg); + (*phase) = four_phase_acc[0]; + + for(n = avx2_iters * 8; n < num_points; n++) + { + tmp16 = in_common[n]; + tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase); + tmp16 = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32))); + (*phase) *= phase_inc; + for (n_vec = 0; n_vec < num_a_vectors; n_vec++) + { + lv_16sc_t tmp = tmp16 * in_a[n_vec][n]; + _out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), + sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); + } + } + +} +#endif /* LV_HAVE_AVX2 */ + +#ifdef LV_HAVE_AVX2 +#include +#include +#include + +static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_u_avx2(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const int16_t** in_a, int num_a_vectors, unsigned int num_points) +{ + const unsigned int avx2_iters = num_points / 8; + const int16_t** _in_a = in_a; + const lv_16sc_t* _in_common = in_common; + lv_16sc_t* _out = result; + int n_vec; + unsigned int number; + unsigned int n; + + lv_16sc_t tmp16; + lv_32fc_t tmp32; + + __VOLK_ATTR_ALIGNED(32) lv_16sc_t dotProductVector[8]; + lv_16sc_t dotProduct = lv_cmake(0,0); + + __m256i* cacc = (__m256i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256i), volk_gnsssdr_get_alignment()); + + for (n_vec = 0; n_vec < num_a_vectors; n_vec++) + { + cacc[n_vec] = _mm256_setzero_si256(); + } + + __m128i a128, ain_128, ain_128_lo, ain_128_hi; + __m256i ai; + __m256 a, b; + + __m256 four_phase_acc_reg, four_phase_inc_reg; + + lv_32fc_t _phase_inc = phase_inc*phase_inc*phase_inc*phase_inc; + + // Normalise the 4*phase increment +#ifdef __cplusplus + _phase_inc /= std::abs(_phase_inc); +#else + _phase_inc /= hypotf(lv_creal(_phase_inc), lv_cimag(_phase_inc)); +#endif + + __VOLK_ATTR_ALIGNED(32) lv_32fc_t four_phase_inc[4]; + __VOLK_ATTR_ALIGNED(32) lv_32fc_t four_phase_acc[4]; + for( n = 0; n < 4; ++n ) + { + four_phase_inc[n] = _phase_inc; + four_phase_acc[n] = *phase; + *phase *= phase_inc; + } + four_phase_acc_reg = _mm256_load_ps((float*) four_phase_acc); + four_phase_inc_reg = _mm256_load_ps((float*) four_phase_inc); + + __m256i a2, b2, c, c1, c2, perm_idx; + + perm_idx = _mm256_set_epi32( 7, 6, 3, 2, 5, 4, 1, 0); + //perm_idx = _mm256_set_epi32( 0, 1, 4, 5, 2, 3, 6, 7); + + for(number = 0; number < avx2_iters; number++) + { + a128 = _mm_loadu_si128( (__m128i *)_in_common ); + ai = _mm256_cvtepi16_epi32( a128 ); + a = _mm256_cvtepi32_ps( ai ); + + //complex 32fc multiplication b=a*two_phase_acc_reg + b = _mm256_complexmul_ps( a, four_phase_acc_reg ); + c1 = _mm256_cvtps_epi32(b); // convert from 32fc to 32ic + + //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg + four_phase_acc_reg = _mm256_complexmul_ps( four_phase_inc_reg, four_phase_acc_reg ); + + //next four samples + _in_common += 4; + a128 = _mm_loadu_si128( (__m128i *)_in_common ); + ai = _mm256_cvtepi16_epi32( a128 ); + a = _mm256_cvtepi32_ps( ai ); + + //complex 32fc multiplication b=a*two_phase_acc_reg + b = _mm256_complexmul_ps( a, four_phase_acc_reg ); + c2 = _mm256_cvtps_epi32(b); // convert from 32fc to 
32ic + + //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg + four_phase_acc_reg = _mm256_complexmul_ps( four_phase_inc_reg, four_phase_acc_reg ); + + __VOLK_GNSSSDR_PREFETCH(_in_common + 16); + + + // Store and convert 32ic to 16ic: + b2 = _mm256_packs_epi32( c1, c2 ); + + b2 = _mm256_permutevar8x32_epi32( b2, perm_idx ); + + + _in_common += 4; + for (n_vec = 0; n_vec < num_a_vectors; n_vec++) + { + ain_128 = _mm_loadu_si128((__m128i*)&(_in_a[n_vec][number * 8])); + + ain_128_lo = _mm_unpacklo_epi16( ain_128, ain_128 ); + ain_128_hi = _mm_unpackhi_epi16( ain_128, ain_128 ); + + a2 = _mm256_insertf128_si256( _mm256_castsi128_si256(ain_128_lo), ain_128_hi, 1); + + c = _mm256_mullo_epi16(a2, b2); + + cacc[n_vec] = _mm256_adds_epi16(cacc[n_vec], c); + } + // Regenerate phase + if ((number % 128) == 0) + { + four_phase_acc_reg = _mm256_complexnormalise_ps(four_phase_acc_reg); + } + } + + for (n_vec = 0; n_vec < num_a_vectors; n_vec++) + { + a2 = cacc[n_vec]; + + _mm256_store_si256((__m256i*)dotProductVector, a2); // Store the results back into the dot product vector + dotProduct = lv_cmake(0,0); + for (number = 0; number < 8; ++number) + { + dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])), + sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number]))); + } + _out[n_vec] = dotProduct; + } + + volk_gnsssdr_free(cacc); + _mm256_zeroupper(); + + _mm256_store_ps((float*)four_phase_acc, four_phase_acc_reg); + (*phase) = four_phase_acc[0]; + + for(n = avx2_iters * 8; n < num_points; n++) + { + tmp16 = in_common[n]; + tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase); + tmp16 = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32))); + (*phase) *= phase_inc; + for (n_vec = 0; n_vec < num_a_vectors; n_vec++) + { + lv_16sc_t tmp = tmp16 * in_a[n_vec][n]; + _out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), + sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); + } + } + +} +#endif /* LV_HAVE_AVX2 */ + +//#ifdef LV_HAVE_NEON +//#include + +//static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_neon(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const int16_t** in_a, int num_a_vectors, unsigned int num_points) +//{ + //const unsigned int neon_iters = num_points / 4; + + //const int16_t** _in_a = in_a; + //const lv_16sc_t* _in_common = in_common; + //lv_16sc_t* _out = result; + //int n_vec; + //int i; + //unsigned int number; + //unsigned int n; + //lv_16sc_t tmp16_, tmp; + //lv_32fc_t tmp32_; + + //if (neon_iters > 0) + //{ + //lv_16sc_t dotProduct = lv_cmake(0,0); + //float arg_phase0 = cargf(*phase); + //float arg_phase_inc = cargf(phase_inc); + //float phase_est; + + //lv_32fc_t ___phase4 = phase_inc * phase_inc * phase_inc * phase_inc; + //__VOLK_ATTR_ALIGNED(16) float32_t __phase4_real[4] = { lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4) }; + //__VOLK_ATTR_ALIGNED(16) float32_t __phase4_imag[4] = { lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4) }; + + //float32x4_t _phase4_real = vld1q_f32(__phase4_real); + //float32x4_t _phase4_imag = vld1q_f32(__phase4_imag); + + //lv_32fc_t phase2 = (lv_32fc_t)(*phase) * phase_inc; + //lv_32fc_t phase3 = phase2 * phase_inc; + //lv_32fc_t phase4 = phase3 * phase_inc; + + //__VOLK_ATTR_ALIGNED(16) float32_t __phase_real[4] = { lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), 
lv_creal(phase4) }; + //__VOLK_ATTR_ALIGNED(16) float32_t __phase_imag[4] = { lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) }; + + //float32x4_t _phase_real = vld1q_f32(__phase_real); + //float32x4_t _phase_imag = vld1q_f32(__phase_imag); + + //int16x4x2_t a_val, b_val, c_val; + //__VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; + //float32x4_t half = vdupq_n_f32(0.5f); + //int16x4x2_t tmp16; + //int32x4x2_t tmp32i; + + //float32x4x2_t tmp32f, tmp32_real, tmp32_imag; + //float32x4_t sign, PlusHalf, Round; + + //int16x4x2_t* accumulator = (int16x4x2_t*)volk_gnsssdr_malloc(num_a_vectors * sizeof(int16x4x2_t), volk_gnsssdr_get_alignment()); + + //for(n_vec = 0; n_vec < num_a_vectors; n_vec++) + //{ + //accumulator[n_vec].val[0] = vdup_n_s16(0); + //accumulator[n_vec].val[1] = vdup_n_s16(0); + //} + + //for(number = 0; number < neon_iters; number++) + //{ + //[> load 4 complex numbers (int 16 bits each component) <] + //tmp16 = vld2_s16((int16_t*)_in_common); + //__VOLK_GNSSSDR_PREFETCH(_in_common + 8); + //_in_common += 4; + + //[> promote them to int 32 bits <] + //tmp32i.val[0] = vmovl_s16(tmp16.val[0]); + //tmp32i.val[1] = vmovl_s16(tmp16.val[1]); + + //[> promote them to float 32 bits <] + //tmp32f.val[0] = vcvtq_f32_s32(tmp32i.val[0]); + //tmp32f.val[1] = vcvtq_f32_s32(tmp32i.val[1]); + + //[> complex multiplication of four complex samples (float 32 bits each component) <] + //tmp32_real.val[0] = vmulq_f32(tmp32f.val[0], _phase_real); + //tmp32_real.val[1] = vmulq_f32(tmp32f.val[1], _phase_imag); + //tmp32_imag.val[0] = vmulq_f32(tmp32f.val[0], _phase_imag); + //tmp32_imag.val[1] = vmulq_f32(tmp32f.val[1], _phase_real); + + //tmp32f.val[0] = vsubq_f32(tmp32_real.val[0], tmp32_real.val[1]); + //tmp32f.val[1] = vaddq_f32(tmp32_imag.val[0], tmp32_imag.val[1]); + + //[> downcast results to int32 <] + //[> in __aarch64__ we can do that with vcvtaq_s32_f32(ret1); vcvtaq_s32_f32(ret2); <] + //sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(tmp32f.val[0]), 31))); + //PlusHalf = vaddq_f32(tmp32f.val[0], half); + //Round = vsubq_f32(PlusHalf, sign); + //tmp32i.val[0] = vcvtq_s32_f32(Round); + + //sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(tmp32f.val[1]), 31))); + //PlusHalf = vaddq_f32(tmp32f.val[1], half); + //Round = vsubq_f32(PlusHalf, sign); + //tmp32i.val[1] = vcvtq_s32_f32(Round); + + //[> downcast results to int16 <] + //tmp16.val[0] = vqmovn_s32(tmp32i.val[0]); + //tmp16.val[1] = vqmovn_s32(tmp32i.val[1]); + + //[> compute next four phases <] + //tmp32_real.val[0] = vmulq_f32(_phase_real, _phase4_real); + //tmp32_real.val[1] = vmulq_f32(_phase_imag, _phase4_imag); + //tmp32_imag.val[0] = vmulq_f32(_phase_real, _phase4_imag); + //tmp32_imag.val[1] = vmulq_f32(_phase_imag, _phase4_real); + + //_phase_real = vsubq_f32(tmp32_real.val[0], tmp32_real.val[1]); + //_phase_imag = vaddq_f32(tmp32_imag.val[0], tmp32_imag.val[1]); + + //for (n_vec = 0; n_vec < num_a_vectors; n_vec++) + //{ + //a_val = vld2_s16((int16_t*)&(_in_a[n_vec][number*4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg + ////__VOLK_GNSSSDR_PREFETCH(&_in_a[n_vec][number*4] + 8); + + //// multiply the real*real and imag*imag to get real result + //// a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r + //b_val.val[0] = vmul_s16(a_val.val[0], tmp16.val[0]); + //// a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i + //b_val.val[1] = vmul_s16(a_val.val[1], tmp16.val[1]); + //c_val.val[0] = vqsub_s16(b_val.val[0], b_val.val[1]); + + //// Multiply cross terms to get the imaginary result + //// 
a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i + //b_val.val[0] = vmul_s16(a_val.val[0], tmp16.val[1]); + //// a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r + //b_val.val[1] = vmul_s16(a_val.val[1], tmp16.val[0]); + //c_val.val[1] = vqadd_s16(b_val.val[0], b_val.val[1]); + + //accumulator[n_vec].val[0] = vqadd_s16(accumulator[n_vec].val[0], c_val.val[0]); + //accumulator[n_vec].val[1] = vqadd_s16(accumulator[n_vec].val[1], c_val.val[1]); + //} + //// Regenerate phase + //if ((number % 256) == 0) + //{ + //phase_est = arg_phase0 + (number + 1) * 4 * arg_phase_inc; + + //*phase = lv_cmake(cos(phase_est), sin(phase_est)); + //phase2 = (lv_32fc_t)(*phase) * phase_inc; + //phase3 = phase2 * phase_inc; + //phase4 = phase3 * phase_inc; + + //__VOLK_ATTR_ALIGNED(16) float32_t ____phase_real[4] = { lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) }; + //__VOLK_ATTR_ALIGNED(16) float32_t ____phase_imag[4] = { lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) }; + + //_phase_real = vld1q_f32(____phase_real); + //_phase_imag = vld1q_f32(____phase_imag); + //} + //} + + //for (n_vec = 0; n_vec < num_a_vectors; n_vec++) + //{ + //vst2_s16((int16_t*)dotProductVector, accumulator[n_vec]); // Store the results back into the dot product vector + //dotProduct = lv_cmake(0,0); + //for (i = 0; i < 4; ++i) + //{ + //dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[i])), + //sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i]))); + //} + //_out[n_vec] = dotProduct; + //} + //volk_gnsssdr_free(accumulator); + //vst1q_f32((float32_t*)__phase_real, _phase_real); + //vst1q_f32((float32_t*)__phase_imag, _phase_imag); + + //(*phase) = lv_cmake((float32_t)__phase_real[0], (float32_t)__phase_imag[0]); + //} + + //for (n = neon_iters * 4; n < num_points; n++) + //{ + //tmp16_ = in_common[n]; //printf("neon phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); + //tmp32_ = lv_cmake((float32_t)lv_creal(tmp16_), (float32_t)lv_cimag(tmp16_)) * (*phase); + //tmp16_ = lv_cmake((int16_t)rintf(lv_creal(tmp32_)), (int16_t)rintf(lv_cimag(tmp32_))); + //(*phase) *= phase_inc; + //for (n_vec = 0; n_vec < num_a_vectors; n_vec++) + //{ + //tmp = tmp16_ * in_a[n_vec][n]; + //_out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); + //} + //} +//} + +//#endif [> LV_HAVE_NEON <] + + +//#ifdef LV_HAVE_NEON +//#include +//#include + +//static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_neon_vma(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const int16_t** in_a, int num_a_vectors, unsigned int num_points) +//{ + //const unsigned int neon_iters = num_points / 4; + + //const int16_t** _in_a = in_a; + //const lv_16sc_t* _in_common = in_common; + //lv_16sc_t* _out = result; + //int n_vec; + //int i; + //unsigned int number; + //unsigned int n; + //lv_16sc_t tmp16_, tmp; + //lv_32fc_t tmp32_; + + //if (neon_iters > 0) + //{ + //lv_16sc_t dotProduct = lv_cmake(0,0); + //float arg_phase0 = cargf(*phase); + //float arg_phase_inc = cargf(phase_inc); + //float phase_est; + ////printf("arg phase0: %f", arg_phase0); + //lv_32fc_t ___phase4 = phase_inc * phase_inc * phase_inc * phase_inc; + //__VOLK_ATTR_ALIGNED(16) float32_t __phase4_real[4] = { lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4) }; + //__VOLK_ATTR_ALIGNED(16) float32_t __phase4_imag[4] = { lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4), 
lv_cimag(___phase4) }; + + //float32x4_t _phase4_real = vld1q_f32(__phase4_real); + //float32x4_t _phase4_imag = vld1q_f32(__phase4_imag); + + //lv_32fc_t phase2 = (lv_32fc_t)(*phase) * phase_inc; + //lv_32fc_t phase3 = phase2 * phase_inc; + //lv_32fc_t phase4 = phase3 * phase_inc; + + //__VOLK_ATTR_ALIGNED(16) float32_t __phase_real[4] = { lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) }; + //__VOLK_ATTR_ALIGNED(16) float32_t __phase_imag[4] = { lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) }; + + //float32x4_t _phase_real = vld1q_f32(__phase_real); + //float32x4_t _phase_imag = vld1q_f32(__phase_imag); + + //int16x4x2_t a_val, b_val; + //__VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; + //float32x4_t half = vdupq_n_f32(0.5f); + //int16x4x2_t tmp16; + //int32x4x2_t tmp32i; + + //float32x4x2_t tmp32f, tmp32_real, tmp32_imag; + //float32x4_t sign, PlusHalf, Round; + + //int16x4x2_t* accumulator = (int16x4x2_t*)volk_gnsssdr_malloc(num_a_vectors * sizeof(int16x4x2_t), volk_gnsssdr_get_alignment()); + + //for(n_vec = 0; n_vec < num_a_vectors; n_vec++) + //{ + //accumulator[n_vec].val[0] = vdup_n_s16(0); + //accumulator[n_vec].val[1] = vdup_n_s16(0); + //} + + //for(number = 0; number < neon_iters; number++) + //{ + //[> load 4 complex numbers (int 16 bits each component) <] + //tmp16 = vld2_s16((int16_t*)_in_common); + //__VOLK_GNSSSDR_PREFETCH(_in_common + 8); + //_in_common += 4; + + //[> promote them to int 32 bits <] + //tmp32i.val[0] = vmovl_s16(tmp16.val[0]); + //tmp32i.val[1] = vmovl_s16(tmp16.val[1]); + + //[> promote them to float 32 bits <] + //tmp32f.val[0] = vcvtq_f32_s32(tmp32i.val[0]); + //tmp32f.val[1] = vcvtq_f32_s32(tmp32i.val[1]); + + //[> complex multiplication of four complex samples (float 32 bits each component) <] + //tmp32_real.val[0] = vmulq_f32(tmp32f.val[0], _phase_real); + //tmp32_real.val[1] = vmulq_f32(tmp32f.val[1], _phase_imag); + //tmp32_imag.val[0] = vmulq_f32(tmp32f.val[0], _phase_imag); + //tmp32_imag.val[1] = vmulq_f32(tmp32f.val[1], _phase_real); + + //tmp32f.val[0] = vsubq_f32(tmp32_real.val[0], tmp32_real.val[1]); + //tmp32f.val[1] = vaddq_f32(tmp32_imag.val[0], tmp32_imag.val[1]); + + //[> downcast results to int32 <] + //[> in __aarch64__ we can do that with vcvtaq_s32_f32(ret1); vcvtaq_s32_f32(ret2); <] + //sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(tmp32f.val[0]), 31))); + //PlusHalf = vaddq_f32(tmp32f.val[0], half); + //Round = vsubq_f32(PlusHalf, sign); + //tmp32i.val[0] = vcvtq_s32_f32(Round); + + //sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(tmp32f.val[1]), 31))); + //PlusHalf = vaddq_f32(tmp32f.val[1], half); + //Round = vsubq_f32(PlusHalf, sign); + //tmp32i.val[1] = vcvtq_s32_f32(Round); + + //[> downcast results to int16 <] + //tmp16.val[0] = vqmovn_s32(tmp32i.val[0]); + //tmp16.val[1] = vqmovn_s32(tmp32i.val[1]); + + //[> compute next four phases <] + //tmp32_real.val[0] = vmulq_f32(_phase_real, _phase4_real); + //tmp32_real.val[1] = vmulq_f32(_phase_imag, _phase4_imag); + //tmp32_imag.val[0] = vmulq_f32(_phase_real, _phase4_imag); + //tmp32_imag.val[1] = vmulq_f32(_phase_imag, _phase4_real); + + //_phase_real = vsubq_f32(tmp32_real.val[0], tmp32_real.val[1]); + //_phase_imag = vaddq_f32(tmp32_imag.val[0], tmp32_imag.val[1]); + + //// Regenerate phase + //if ((number % 256) == 0) + //{ + ////printf("computed phase: %f\n", cos(cargf(lv_cmake(_phase_real[0],_phase_imag[0])))); + //phase_est = arg_phase0 + (number + 1) * 4 * arg_phase_inc; + 
////printf("Estimated phase: %f\n\n", cos(phase_est)); + + //*phase = lv_cmake(cos(phase_est), sin(phase_est)); + //phase2 = (lv_32fc_t)(*phase) * phase_inc; + //phase3 = phase2 * phase_inc; + //phase4 = phase3 * phase_inc; + + //__VOLK_ATTR_ALIGNED(16) float32_t ____phase_real[4] = { lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) }; + //__VOLK_ATTR_ALIGNED(16) float32_t ____phase_imag[4] = { lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) }; + + //_phase_real = vld1q_f32(____phase_real); + //_phase_imag = vld1q_f32(____phase_imag); + + //// Round = vmulq_f32(_phase_real, _phase_real); + //// Round = vmlaq_f32(Round, _phase_imag, _phase_imag); + //// Round = vsqrtq_f32(Round);//printf("sqrt: %f \n", Round[0]); + ////Round = vrsqrteq_f32(Round);printf("1/sqtr: %f \n",Round[0]); + ////Round = vrecpeq_f32((Round); + //// _phase_real = vdivq_f32(_phase_real, Round); + //// _phase_imag = vdivq_f32(_phase_imag, Round); + ////_phase_real = vmulq_f32(_phase_real, Round); + ////_phase_imag = vmulq_f32(_phase_imag, Round); + ////printf("After %i: %f,%f, %f\n\n", number, _phase_real[0], _phase_imag[0], sqrt(_phase_real[0]*_phase_real[0]+_phase_imag[0]*_phase_imag[0])); + + //} + + //for (n_vec = 0; n_vec < num_a_vectors; n_vec++) + //{ + //a_val = vld2_s16((int16_t*)&(_in_a[n_vec][number*4])); + + //b_val.val[0] = vmul_s16(a_val.val[0], tmp16.val[0]); + //b_val.val[1] = vmul_s16(a_val.val[1], tmp16.val[0]); + + //// use multiply accumulate/subtract to get result + //b_val.val[0] = vmls_s16(b_val.val[0], a_val.val[1], tmp16.val[1]); + //b_val.val[1] = vmla_s16(b_val.val[1], a_val.val[0], tmp16.val[1]); + + //accumulator[n_vec].val[0] = vqadd_s16(accumulator[n_vec].val[0], b_val.val[0]); + //accumulator[n_vec].val[1] = vqadd_s16(accumulator[n_vec].val[1], b_val.val[1]); + //} + //} + + //for (n_vec = 0; n_vec < num_a_vectors; n_vec++) + //{ + //vst2_s16((int16_t*)dotProductVector, accumulator[n_vec]); // Store the results back into the dot product vector + //dotProduct = lv_cmake(0,0); + //for (i = 0; i < 4; ++i) + //{ + //dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[i])), + //sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i]))); + //} + //_out[n_vec] = dotProduct; + //} + //volk_gnsssdr_free(accumulator); + + //vst1q_f32((float32_t*)__phase_real, _phase_real); + //vst1q_f32((float32_t*)__phase_imag, _phase_imag); + + //(*phase) = lv_cmake((float32_t)__phase_real[0], (float32_t)__phase_imag[0]); + //} + + //for (n = neon_iters * 4; n < num_points; n++) + //{ + //tmp16_ = in_common[n]; //printf("neon phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); + //tmp32_ = lv_cmake((float32_t)lv_creal(tmp16_), (float32_t)lv_cimag(tmp16_)) * (*phase); + //tmp16_ = lv_cmake((int16_t)rintf(lv_creal(tmp32_)), (int16_t)rintf(lv_cimag(tmp32_))); + //(*phase) *= phase_inc; + //for (n_vec = 0; n_vec < num_a_vectors; n_vec++) + //{ + //tmp = tmp16_ * in_a[n_vec][n]; + //_out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); + //} + //} +//} + +//#endif [> LV_HAVE_NEON <] + + +//#ifdef LV_HAVE_NEON +//#include +//#include + +//static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_neon_optvma(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const int16_t** in_a, int num_a_vectors, unsigned int num_points) +//{ + //const unsigned int neon_iters = num_points / 4; + + //const int16_t** _in_a = 
in_a; + //const lv_16sc_t* _in_common = in_common; + //lv_16sc_t* _out = result; + //int n_vec; + //int i; + //unsigned int number; + //unsigned int n; + //lv_16sc_t tmp16_, tmp; + //lv_32fc_t tmp32_; + + //if (neon_iters > 0) + //{ + //lv_16sc_t dotProduct = lv_cmake(0,0); + //float arg_phase0 = cargf(*phase); + //float arg_phase_inc = cargf(phase_inc); + //float phase_est; + + //lv_32fc_t ___phase4 = phase_inc * phase_inc * phase_inc * phase_inc; + //__VOLK_ATTR_ALIGNED(16) float32_t __phase4_real[4] = { lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4) }; + //__VOLK_ATTR_ALIGNED(16) float32_t __phase4_imag[4] = { lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4) }; + + //float32x4_t _phase4_real = vld1q_f32(__phase4_real); + //float32x4_t _phase4_imag = vld1q_f32(__phase4_imag); + + //lv_32fc_t phase2 = (lv_32fc_t)(*phase) * phase_inc; + //lv_32fc_t phase3 = phase2 * phase_inc; + //lv_32fc_t phase4 = phase3 * phase_inc; + + //__VOLK_ATTR_ALIGNED(16) float32_t __phase_real[4] = { lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) }; + //__VOLK_ATTR_ALIGNED(16) float32_t __phase_imag[4] = { lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) }; + + //float32x4_t _phase_real = vld1q_f32(__phase_real); + //float32x4_t _phase_imag = vld1q_f32(__phase_imag); + + //int16x4x2_t a_val, b_val; + //__VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; + //float32x4_t half = vdupq_n_f32(0.5f); + //int32x4x2_t tmp32i; + + //float32x4x2_t tmp32f, tmp32_real, tmp32_imag; + //float32x4_t sign, PlusHalf, Round; + + //int16x4x2_t* accumulator1 = (int16x4x2_t*)volk_gnsssdr_malloc(num_a_vectors * sizeof(int16x4x2_t), volk_gnsssdr_get_alignment()); + //int16x4x2_t* accumulator2 = (int16x4x2_t*)volk_gnsssdr_malloc(num_a_vectors * sizeof(int16x4x2_t), volk_gnsssdr_get_alignment()); + + //for(n_vec = 0; n_vec < num_a_vectors; n_vec++) + //{ + //accumulator1[n_vec].val[0] = vdup_n_s16(0); + //accumulator1[n_vec].val[1] = vdup_n_s16(0); + //accumulator2[n_vec].val[0] = vdup_n_s16(0); + //accumulator2[n_vec].val[1] = vdup_n_s16(0); + //} + + //for(number = 0; number < neon_iters; number++) + //{ + //[> load 4 complex numbers (int 16 bits each component) <] + //b_val = vld2_s16((int16_t*)_in_common); + //__VOLK_GNSSSDR_PREFETCH(_in_common + 8); + //_in_common += 4; + + //[> promote them to int 32 bits <] + //tmp32i.val[0] = vmovl_s16(b_val.val[0]); + //tmp32i.val[1] = vmovl_s16(b_val.val[1]); + + //[> promote them to float 32 bits <] + //tmp32f.val[0] = vcvtq_f32_s32(tmp32i.val[0]); + //tmp32f.val[1] = vcvtq_f32_s32(tmp32i.val[1]); + + //[> complex multiplication of four complex samples (float 32 bits each component) <] + //tmp32_real.val[0] = vmulq_f32(tmp32f.val[0], _phase_real); + //tmp32_real.val[1] = vmulq_f32(tmp32f.val[1], _phase_imag); + //tmp32_imag.val[0] = vmulq_f32(tmp32f.val[0], _phase_imag); + //tmp32_imag.val[1] = vmulq_f32(tmp32f.val[1], _phase_real); + + //tmp32f.val[0] = vsubq_f32(tmp32_real.val[0], tmp32_real.val[1]); + //tmp32f.val[1] = vaddq_f32(tmp32_imag.val[0], tmp32_imag.val[1]); + + //[> downcast results to int32 <] + //[> in __aarch64__ we can do that with vcvtaq_s32_f32(ret1); vcvtaq_s32_f32(ret2); <] + //sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(tmp32f.val[0]), 31))); + //PlusHalf = vaddq_f32(tmp32f.val[0], half); + //Round = vsubq_f32(PlusHalf, sign); + //tmp32i.val[0] = vcvtq_s32_f32(Round); + + //sign = 
vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(tmp32f.val[1]), 31))); + //PlusHalf = vaddq_f32(tmp32f.val[1], half); + //Round = vsubq_f32(PlusHalf, sign); + //tmp32i.val[1] = vcvtq_s32_f32(Round); + + //[> downcast results to int16 <] + //b_val.val[0] = vqmovn_s32(tmp32i.val[0]); + //b_val.val[1] = vqmovn_s32(tmp32i.val[1]); + + //[> compute next four phases <] + //tmp32_real.val[0] = vmulq_f32(_phase_real, _phase4_real); + //tmp32_real.val[1] = vmulq_f32(_phase_imag, _phase4_imag); + //tmp32_imag.val[0] = vmulq_f32(_phase_real, _phase4_imag); + //tmp32_imag.val[1] = vmulq_f32(_phase_imag, _phase4_real); + + //_phase_real = vsubq_f32(tmp32_real.val[0], tmp32_real.val[1]); + //_phase_imag = vaddq_f32(tmp32_imag.val[0], tmp32_imag.val[1]); + + //// Regenerate phase + //if ((number % 256) == 0) + //{ + ////printf("computed phase: %f\n", cos(cargf(lv_cmake(_phase_real[0],_phase_imag[0])))); + //phase_est = arg_phase0 + (number + 1) * 4 * arg_phase_inc; + ////printf("Estimated phase: %f\n\n", cos(phase_est)); + + //*phase = lv_cmake(cos(phase_est), sin(phase_est)); + //phase2 = (lv_32fc_t)(*phase) * phase_inc; + //phase3 = phase2 * phase_inc; + //phase4 = phase3 * phase_inc; + + //__VOLK_ATTR_ALIGNED(16) float32_t ____phase_real[4] = { lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) }; + //__VOLK_ATTR_ALIGNED(16) float32_t ____phase_imag[4] = { lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) }; + + //_phase_real = vld1q_f32(____phase_real); + //_phase_imag = vld1q_f32(____phase_imag); + //} + + //for (n_vec = 0; n_vec < num_a_vectors; n_vec++) + //{ + //a_val = vld2_s16((int16_t*)&(_in_a[n_vec][number*4])); + + //// use 2 accumulators to remove inter-instruction data dependencies + //accumulator1[n_vec].val[0] = vmla_s16(accumulator1[n_vec].val[0], a_val.val[0], b_val.val[0]); + //accumulator1[n_vec].val[1] = vmla_s16(accumulator1[n_vec].val[1], a_val.val[0], b_val.val[1]); + //accumulator2[n_vec].val[0] = vmls_s16(accumulator2[n_vec].val[0], a_val.val[1], b_val.val[1]); + //accumulator2[n_vec].val[1] = vmla_s16(accumulator2[n_vec].val[1], a_val.val[1], b_val.val[0]); + //} + //} + //for (n_vec = 0; n_vec < num_a_vectors; n_vec++) + //{ + //accumulator1[n_vec].val[0] = vqadd_s16(accumulator1[n_vec].val[0], accumulator2[n_vec].val[0]); + //accumulator1[n_vec].val[1] = vqadd_s16(accumulator1[n_vec].val[1], accumulator2[n_vec].val[1]); + //} + //for (n_vec = 0; n_vec < num_a_vectors; n_vec++) + //{ + //vst2_s16((int16_t*)dotProductVector, accumulator1[n_vec]); // Store the results back into the dot product vector + //dotProduct = lv_cmake(0,0); + //for (i = 0; i < 4; ++i) + //{ + //dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[i])), + //sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i]))); + //} + //_out[n_vec] = dotProduct; + //} + //volk_gnsssdr_free(accumulator1); + //volk_gnsssdr_free(accumulator2); + + //vst1q_f32((float32_t*)__phase_real, _phase_real); + //vst1q_f32((float32_t*)__phase_imag, _phase_imag); + + //(*phase) = lv_cmake((float32_t)__phase_real[0], (float32_t)__phase_imag[0]); + //} + + //for (n = neon_iters * 4; n < num_points; n++) + //{ + //tmp16_ = in_common[n]; //printf("neon phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); + //tmp32_ = lv_cmake((float32_t)lv_creal(tmp16_), (float32_t)lv_cimag(tmp16_)) * (*phase); + //tmp16_ = lv_cmake((int16_t)rintf(lv_creal(tmp32_)), (int16_t)rintf(lv_cimag(tmp32_))); + //(*phase) *= phase_inc; + //for (n_vec = 0; n_vec < 
num_a_vectors; n_vec++) + //{ + //tmp = tmp16_ * in_a[n_vec][n]; + //_out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); + //} + //} +//} + +//#endif [> LV_HAVE_NEON <] + +#endif /*INCLUDED_volk_gnsssdr_16ic_16i_dot_prod_16ic_xn_H*/ + + diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic.h new file mode 100644 index 000000000..a666c0270 --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic.h @@ -0,0 +1,384 @@ +/*! + * \file volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic.h + * \brief Volk puppet for the multiple 16-bit complex dot product kernel. + * \authors
+ *          Carles Fernandez Prades 2016 cfernandez at cttc dot cat
+ *
+ * + * Volk puppet for integrating the resampler into volk's test system + * + * ------------------------------------------------------------------------- + * + * Copyright (C) 2010-2015 (see AUTHORS file for a list of contributors) + * + * GNSS-SDR is a software defined Global Navigation + * Satellite Systems receiver + * + * This file is part of GNSS-SDR. + * + * GNSS-SDR is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * GNSS-SDR is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNSS-SDR. If not, see . + * + * ------------------------------------------------------------------------- + */ + +#ifndef INCLUDED_volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_H +#define INCLUDED_volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_H + +#include "volk_gnsssdr/volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn.h" +#include +#include +#include + +#ifdef LV_HAVE_GENERIC +static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_generic(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points) +{ + // phases must be normalized. Phase rotator expects a complex exponential input! + float rem_carrier_phase_in_rad = 0.345; + float phase_step_rad = 0.1; + lv_32fc_t phase[1]; + phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad)); + lv_32fc_t phase_inc[1]; + phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad)); + unsigned int n; + int num_a_vectors = 3; + int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); + for(n = 0; n < num_a_vectors; n++) + { + in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); + memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points); + } + volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_generic(result, local_code, phase_inc[0], phase,(const int16_t**) in_a, num_a_vectors, num_points); + + for(n = 0; n < num_a_vectors; n++) + { + volk_gnsssdr_free(in_a[n]); + } + volk_gnsssdr_free(in_a); +} + +#endif // Generic + + +#ifdef LV_HAVE_GENERIC +static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_generic_reload(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points) +{ + // phases must be normalized. Phase rotator expects a complex exponential input! 
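+ // Fixed test values; the puppet replicates the test input across num_a_vectors int16_t code vectors and passes a unit-magnitude phase and increment to the kernel under test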
+ float rem_carrier_phase_in_rad = 0.345; + float phase_step_rad = 0.1; + lv_32fc_t phase[1]; + phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad)); + lv_32fc_t phase_inc[1]; + phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad)); + unsigned int n; + int num_a_vectors = 3; + int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); + for(n = 0; n < num_a_vectors; n++) + { + in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); + memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points); + } + volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_generic_reload(result, local_code, phase_inc[0], phase,(const int16_t**) in_a, num_a_vectors, num_points); + + for(n = 0; n < num_a_vectors; n++) + { + volk_gnsssdr_free(in_a[n]); + } + volk_gnsssdr_free(in_a); +} + +#endif // Generic + + +#ifdef LV_HAVE_SSE3 +static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_a_sse3(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points) +{ + // phases must be normalized. Phase rotator expects a complex exponential input! + float rem_carrier_phase_in_rad = 0.345; + float phase_step_rad = 0.1; + lv_32fc_t phase[1]; + phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad)); + lv_32fc_t phase_inc[1]; + phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad)); + unsigned int n; + int num_a_vectors = 3; + int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); + for(n = 0; n < num_a_vectors; n++) + { + in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); + memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points); + } + + volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_sse3(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, num_a_vectors, num_points); + + for(n = 0; n < num_a_vectors; n++) + { + volk_gnsssdr_free(in_a[n]); + } + volk_gnsssdr_free(in_a); +} + +#endif // SSE3 + + +//#ifdef LV_HAVE_SSE3 +//static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_a_sse3_reload(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points) +//{ + //// phases must be normalized. Phase rotator expects a complex exponential input! 
+ //float rem_carrier_phase_in_rad = 0.345; + //float phase_step_rad = 0.1; + //lv_32fc_t phase[1]; + //phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad)); + //lv_32fc_t phase_inc[1]; + //phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad)); + //unsigned int n; + //int num_a_vectors = 3; + //int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); + //for(n = 0; n < num_a_vectors; n++) + //{ + //in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); + //memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points); + //} + + //volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_sse3_reload(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, num_a_vectors, num_points); + + //for(n = 0; n < num_a_vectors; n++) + //{ + //volk_gnsssdr_free(in_a[n]); + //} + //volk_gnsssdr_free(in_a); +//} + +//#endif // SSE3 + + +#ifdef LV_HAVE_SSE3 +static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_u_sse3(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points) +{ + // phases must be normalized. Phase rotator expects a complex exponential input! + float rem_carrier_phase_in_rad = 0.345; + float phase_step_rad = 0.1; + lv_32fc_t phase[1]; + phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad)); + lv_32fc_t phase_inc[1]; + phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad)); + unsigned int n; + int num_a_vectors = 3; + int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); + for(n = 0; n < num_a_vectors; n++) + { + in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); + memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points); + } + + volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_u_sse3(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, num_a_vectors, num_points); + + for(n = 0; n < num_a_vectors; n++) + { + volk_gnsssdr_free(in_a[n]); + } + volk_gnsssdr_free(in_a); +} + +#endif // SSE3 + + +#ifdef LV_HAVE_AVX2 +static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_a_avx2(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points) +{ + // phases must be normalized. Phase rotator expects a complex exponential input! 
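+ // Buffers from volk_gnsssdr_malloc are aligned, as required by the _a (aligned) AVX2 kernel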
+ float rem_carrier_phase_in_rad = 0.345; + float phase_step_rad = 0.1; + lv_32fc_t phase[1]; + phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad)); + lv_32fc_t phase_inc[1]; + phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad)); + unsigned int n; + int num_a_vectors = 3; + int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); + for(n = 0; n < num_a_vectors; n++) + { + in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); + memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points); + } + + volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_avx2(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, num_a_vectors, num_points); + + for(n = 0; n < num_a_vectors; n++) + { + volk_gnsssdr_free(in_a[n]); + } + volk_gnsssdr_free(in_a); +} + +#endif // AVX2 + + +//#ifdef LV_HAVE_AVX2 +//static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_a_avx2_reload(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points) +//{ + //// phases must be normalized. Phase rotator expects a complex exponential input! + //float rem_carrier_phase_in_rad = 0.345; + //float phase_step_rad = 0.1; + //lv_32fc_t phase[1]; + //phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad)); + //lv_32fc_t phase_inc[1]; + //phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad)); + //unsigned int n; + //int num_a_vectors = 3; + //int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); + //for(n = 0; n < num_a_vectors; n++) + //{ + //in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); + //memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points); + //} + + //volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_avx2_reload(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, num_a_vectors, num_points); + + //for(n = 0; n < num_a_vectors; n++) + //{ + //volk_gnsssdr_free(in_a[n]); + //} + //volk_gnsssdr_free(in_a); +//} + +//#endif // AVX2 + + +#ifdef LV_HAVE_AVX2 +static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_u_avx2(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points) +{ + // phases must be normalized. Phase rotator expects a complex exponential input! 
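+ // Same setup as the aligned puppet; this one drives the unaligned (_u) AVX2 kernel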
+ float rem_carrier_phase_in_rad = 0.345; + float phase_step_rad = 0.1; + lv_32fc_t phase[1]; + phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad)); + lv_32fc_t phase_inc[1]; + phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad)); + unsigned int n; + int num_a_vectors = 3; + int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); + for(n = 0; n < num_a_vectors; n++) + { + in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); + memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points); + } + + volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_u_avx2(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, num_a_vectors, num_points); + + for(n = 0; n < num_a_vectors; n++) + { + volk_gnsssdr_free(in_a[n]); + } + volk_gnsssdr_free(in_a); +} + +#endif // AVX2 + + +//#ifdef LV_HAVE_AVX2 +//static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_u_avx2_reload(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points) +//{ + //// phases must be normalized. Phase rotator expects a complex exponential input! + //float rem_carrier_phase_in_rad = 0.345; + //float phase_step_rad = 0.1; + //lv_32fc_t phase[1]; + //phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad)); + //lv_32fc_t phase_inc[1]; + //phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad)); + //unsigned int n; + //int num_a_vectors = 3; + //int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); + //for(n = 0; n < num_a_vectors; n++) + //{ + //in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); + //memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points); + //} + + //volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_avx2_reload(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, num_a_vectors, num_points); + + //for(n = 0; n < num_a_vectors; n++) + //{ + //volk_gnsssdr_free(in_a[n]); + //} + //volk_gnsssdr_free(in_a); +//} + +//#endif // AVX2 + + +//#ifdef LV_HAVE_NEON +//static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_neon(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points) +//{ + //// phases must be normalized. Phase rotator expects a complex exponential input! 
+ //float rem_carrier_phase_in_rad = 0.345; + //float phase_step_rad = 0.1; + //lv_32fc_t phase[1]; + //phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad)); + //lv_32fc_t phase_inc[1]; + //phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad)); + //unsigned int n; + //int num_a_vectors = 3; + //int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); + //for(n = 0; n < num_a_vectors; n++) + //{ + //in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); + //memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points); + //} + + //volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_neon(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, num_a_vectors, num_points); + + //for(n = 0; n < num_a_vectors; n++) + //{ + //volk_gnsssdr_free(in_a[n]); + //} + //volk_gnsssdr_free(in_a); +//} + +//#endif // NEON + + +//#ifdef LV_HAVE_NEON +//static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_neon_vma(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points) +//{ + //// phases must be normalized. Phase rotator expects a complex exponential input! + //float rem_carrier_phase_in_rad = 0.345; + //float phase_step_rad = 0.1; + //lv_32fc_t phase[1]; + //phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad)); + //lv_32fc_t phase_inc[1]; + //phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad)); + //unsigned int n; + //int num_a_vectors = 3; + //int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); + //for(n = 0; n < num_a_vectors; n++) + //{ + //in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); + //memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points); + //} + + //volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_neon_vma(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, num_a_vectors, num_points); + + //for(n = 0; n < num_a_vectors; n++) + //{ + //volk_gnsssdr_free(in_a[n]); + //} + //volk_gnsssdr_free(in_a); +//} + +//#endif // NEON + +#endif // INCLUDED_volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_H + + + diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_resamplerxnpuppet_16ic.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_resamplerxnpuppet_16ic.h index 106ad85b7..85e6fcb08 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_resamplerxnpuppet_16ic.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_resamplerxnpuppet_16ic.h @@ -44,8 +44,8 @@ #ifdef LV_HAVE_GENERIC static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_generic(lv_16sc_t* result, const lv_16sc_t* local_code, unsigned int num_points) { - float code_phase_step_chips = -0.6; int code_length_chips = 2046; + float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); int num_out_vectors = 3; unsigned int n; float rem_code_phase_chips = -0.234; @@ -74,8 +74,8 @@ static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_generic(lv_16sc_t* r #ifdef LV_HAVE_SSE3 static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_a_sse3(lv_16sc_t* result, const lv_16sc_t* local_code, unsigned int num_points) { - float 
code_phase_step_chips = -0.6; int code_length_chips = 2046; + float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; @@ -103,8 +103,8 @@ static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_a_sse3(lv_16sc_t* re #ifdef LV_HAVE_SSE3 static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_u_sse3(lv_16sc_t* result, const lv_16sc_t* local_code, unsigned int num_points) { - float code_phase_step_chips = -0.6; int code_length_chips = 2046; + float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; @@ -133,8 +133,8 @@ static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_u_sse3(lv_16sc_t* re #ifdef LV_HAVE_SSE4_1 static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_u_sse4_1(lv_16sc_t* result, const lv_16sc_t* local_code, unsigned int num_points) { - float code_phase_step_chips = -0.6; int code_length_chips = 2046; + float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; @@ -163,8 +163,8 @@ static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_u_sse4_1(lv_16sc_t* #ifdef LV_HAVE_SSE4_1 static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_a_sse4_1(lv_16sc_t* result, const lv_16sc_t* local_code, unsigned int num_points) { - float code_phase_step_chips = -0.6; int code_length_chips = 2046; + float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; @@ -193,8 +193,8 @@ static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_a_sse4_1(lv_16sc_t* #ifdef LV_HAVE_AVX static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_u_avx(lv_16sc_t* result, const lv_16sc_t* local_code, unsigned int num_points) { - float code_phase_step_chips = -0.6; int code_length_chips = 2046; + float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; @@ -223,8 +223,8 @@ static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_u_avx(lv_16sc_t* res #ifdef LV_HAVE_AVX static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_a_avx(lv_16sc_t* result, const lv_16sc_t* local_code, unsigned int num_points) { - float code_phase_step_chips = -0.6; int code_length_chips = 2046; + float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; @@ -253,8 +253,8 @@ static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_a_avx(lv_16sc_t* res #ifdef LV_HAVE_NEON static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_neon(lv_16sc_t* result, const lv_16sc_t* local_code, unsigned int num_points) { - float code_phase_step_chips = -0.6; int code_length_chips = 2046; + float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_resamplerxnpuppet_32f.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_resamplerxnpuppet_32f.h new file mode 100644 index 000000000..cf2a80f52 --- /dev/null +++ 
b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_resamplerxnpuppet_32f.h @@ -0,0 +1,279 @@ +/*! + * \file volk_gnsssdr_32f_resamplerxnpuppet_32f.h + * \brief VOLK_GNSSSDR puppet for the multiple 32-bit float vector resampler kernel. + * \authors
<ul>
+ *          <li> Cillian O'Driscoll, 2017. cillian.odriscoll at gmail dot com
+ *          </ul>
+ * + * VOLK_GNSSSDR puppet for integrating the multiple resampler into the test system + * + * ------------------------------------------------------------------------- + * + * Copyright (C) 2010-2017 (see AUTHORS file for a list of contributors) + * + * GNSS-SDR is a software defined Global Navigation + * Satellite Systems receiver + * + * This file is part of GNSS-SDR. + * + * GNSS-SDR is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * GNSS-SDR is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNSS-SDR. If not, see . + * + * ------------------------------------------------------------------------- + */ + +#ifndef INCLUDED_volk_gnsssdr_32f_resamplerxnpuppet_32f_H +#define INCLUDED_volk_gnsssdr_32f_resamplerxnpuppet_32f_H + +#include "volk_gnsssdr/volk_gnsssdr_32f_xn_resampler_32f_xn.h" +#include +#include +#include +#include + + + +#ifdef LV_HAVE_GENERIC +static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_generic(float* result, const float* local_code, unsigned int num_points) +{ + int code_length_chips = 2046; + float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + int num_out_vectors = 3; + float rem_code_phase_chips = -0.234; + unsigned int n; + float shifts_chips[3] = { -0.1, 0.0, 0.1 }; + + float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment()); + for(n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment()); + } + + volk_gnsssdr_32f_xn_resampler_32f_xn_generic(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); + + memcpy((float*)result, (float*)result_aux[0], sizeof(float) * num_points); + + for(n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } + volk_gnsssdr_free(result_aux); +} + + +#endif /* LV_HAVE_GENERIC */ + +#ifdef LV_HAVE_SSE3 +static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_a_sse3(float* result, const float* local_code, unsigned int num_points) +{ + int code_length_chips = 2046; + float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + int num_out_vectors = 3; + float rem_code_phase_chips = -0.234; + unsigned int n; + float shifts_chips[3] = { -0.1, 0.0, 0.1 }; + + float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment()); + for(n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment()); + } + + volk_gnsssdr_32f_xn_resampler_32f_xn_a_sse3(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); + + memcpy((float*)result, (float*)result_aux[0], sizeof(float) * num_points); + + for(n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } + volk_gnsssdr_free(result_aux); +} + +#endif + +#ifdef LV_HAVE_SSE3 +static inline void 
volk_gnsssdr_32f_resamplerxnpuppet_32f_u_sse3(float* result, const float* local_code, unsigned int num_points) +{ + int code_length_chips = 2046; + float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + int num_out_vectors = 3; + float rem_code_phase_chips = -0.234; + unsigned int n; + float shifts_chips[3] = { -0.1, 0.0, 0.1 }; + + float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment()); + for(n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment()); + } + + volk_gnsssdr_32f_xn_resampler_32f_xn_u_sse3(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); + + memcpy((float*)result, (float*)result_aux[0], sizeof(float) * num_points); + + for(n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } + volk_gnsssdr_free(result_aux); +} + +#endif + + +#ifdef LV_HAVE_SSE4_1 +static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_u_sse4_1(float* result, const float* local_code, unsigned int num_points) +{ + int code_length_chips = 2046; + float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + int num_out_vectors = 3; + float rem_code_phase_chips = -0.234; + unsigned int n; + float shifts_chips[3] = { -0.1, 0.0, 0.1 }; + + float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment()); + for(n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment()); + } + + volk_gnsssdr_32f_xn_resampler_32f_xn_u_sse4_1(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); + + memcpy((float*)result, (float*)result_aux[0], sizeof(float) * num_points); + + for(n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } + volk_gnsssdr_free(result_aux); +} + +#endif + +#ifdef LV_HAVE_SSE4_1 +static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_a_sse4_1(float* result, const float* local_code, unsigned int num_points) +{ + int code_length_chips = 2046; + float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + int num_out_vectors = 3; + float rem_code_phase_chips = -0.234; + unsigned int n; + float shifts_chips[3] = { -0.1, 0.0, 0.1 }; + + float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment()); + for(n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment()); + } + + volk_gnsssdr_32f_xn_resampler_32f_xn_a_sse4_1(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); + + memcpy((float*)result, (float*)result_aux[0], sizeof(float) * num_points); + + for(n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } + volk_gnsssdr_free(result_aux); +} + +#endif + +#ifdef LV_HAVE_AVX +static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_a_avx(float* result, const float* local_code, unsigned int num_points) +{ + int code_length_chips = 2046; + float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + int num_out_vectors = 3; + float rem_code_phase_chips = -0.234; + unsigned int n; + 
float shifts_chips[3] = { -0.1, 0.0, 0.1 }; + + float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment()); + for(n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment()); + } + + volk_gnsssdr_32f_xn_resampler_32f_xn_a_avx(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); + + memcpy((float*)result, (float*)result_aux[0], sizeof(float) * num_points); + + for(n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } + volk_gnsssdr_free(result_aux); +} +#endif + + +#ifdef LV_HAVE_AVX +static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_u_avx(float* result, const float* local_code, unsigned int num_points) +{ + int code_length_chips = 2046; + float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + int num_out_vectors = 3; + float rem_code_phase_chips = -0.234; + unsigned int n; + float shifts_chips[3] = { -0.1, 0.0, 0.1 }; + + float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment()); + for(n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment()); + } + + volk_gnsssdr_32f_xn_resampler_32f_xn_u_avx(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); + + memcpy((float*)result, (float*)result_aux[0], sizeof(float) * num_points); + + for(n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } + volk_gnsssdr_free(result_aux); +} +#endif + +#ifdef LV_HAVE_NEON +static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_neon(float* result, const float* local_code, unsigned int num_points) +{ + int code_length_chips = 2046; + float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + int num_out_vectors = 3; + float rem_code_phase_chips = -0.234; + unsigned int n; + float shifts_chips[3] = { -0.1, 0.0, 0.1 }; + + float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment()); + for(n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment()); + } + + volk_gnsssdr_32f_xn_resampler_32f_xn_neon(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); + + memcpy((float*)result, (float*)result_aux[0], sizeof(float) * num_points); + + for(n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } + volk_gnsssdr_free(result_aux); +} +#endif + +#endif // INCLUDED_volk_gnsssdr_32f_resamplerpuppet_32f_H + diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_xn_resampler_32f_xn.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_xn_resampler_32f_xn.h new file mode 100644 index 000000000..1fa95e0e6 --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_xn_resampler_32f_xn.h @@ -0,0 +1,610 @@ +/*! + * \file volk_gnsssdr_32f_xn_resampler_32f_xn.h + * \brief VOLK_GNSSSDR kernel: Resamples N complex 32-bit float vectors using zero hold resample algorithm. + * \authors
<ul>
+ *          <li> Cillian O'Driscoll, 2017. cillian.odriscoll(at)gmail.com
+ *          </ul>
+ * + * VOLK_GNSSSDR kernel that resamples N 32-bit float vectors using zero hold resample algorithm. + * It is optimized to resample a single GNSS local code signal replica into N vectors fractional-resampled and fractional-delayed + * (i.e. it creates the Early, Prompt, and Late code replicas) + * + * ------------------------------------------------------------------------- + * + * Copyright (C) 2010-2017 (see AUTHORS file for a list of contributors) + * + * GNSS-SDR is a software defined Global Navigation + * Satellite Systems receiver + * + * This file is part of GNSS-SDR. + * + * GNSS-SDR is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * GNSS-SDR is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNSS-SDR. If not, see . + * + * ------------------------------------------------------------------------- + */ + +/*! + * \page volk_gnsssdr_32f_xn_resampler_32f_xn + * + * \b Overview + * + * Resamples a 32-bit floating point vector , providing \p num_out_vectors outputs. + * + * Dispatcher Prototype + * \code + * void volk_gnsssdr_32f_xn_resampler_32f_xn(float** result, const float* local_code, float rem_code_phase_chips, float code_phase_step_chips, float* shifts_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_points) + * \endcode + * + * \b Inputs + * \li local_code: Vector to be resampled. + * \li rem_code_phase_chips: Remnant code phase [chips]. + * \li code_phase_step_chips: Phase increment per sample [chips/sample]. + * \li shifts_chips: Vector of floats that defines the spacing (in chips) between the replicas of \p local_code + * \li code_length_chips: Code length in chips. + * \li num_out_vectors Number of output vectors. + * \li num_points: The number of data values to be in the resampled vector. + * + * \b Outputs + * \li result: Pointer to a vector of pointers where the results will be stored. + * + */ + +#ifndef INCLUDED_volk_gnsssdr_32f_xn_resampler_32f_xn_H +#define INCLUDED_volk_gnsssdr_32f_xn_resampler_32f_xn_H + +#include +#include +#include /* abs */ +#include /* int64_t */ +#include +#include +#include + + +#ifdef LV_HAVE_GENERIC + +static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_generic(float** result, const float* local_code, float rem_code_phase_chips, float code_phase_step_chips, float* shifts_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_points) +{ + int local_code_chip_index; + int current_correlator_tap; + int n; + for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++) + { + for (n = 0; n < num_points; n++) + { + // resample code for current tap + local_code_chip_index = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); + //Take into account that in multitap correlators, the shifts can be negative! 
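+                    // The next two lines form a non-negative modulo: add enough whole code periods
+                    // to make the index >= 0, then wrap it into [0, code_length_chips).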
+ if (local_code_chip_index < 0) local_code_chip_index += (int)code_length_chips * (abs(local_code_chip_index) / code_length_chips + 1); + local_code_chip_index = local_code_chip_index % code_length_chips; + result[current_correlator_tap][n] = local_code[local_code_chip_index]; + } + } +} + +#endif /*LV_HAVE_GENERIC*/ + + +#ifdef LV_HAVE_SSE3 +#include +static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_a_sse3(float** result, const float* local_code, float rem_code_phase_chips, float code_phase_step_chips, float* shifts_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_points) +{ + float** _result = result; + const unsigned int quarterPoints = num_points / 4; + int current_correlator_tap; + unsigned int n; + unsigned int k; + const __m128 ones = _mm_set1_ps(1.0f); + const __m128 fours = _mm_set1_ps(4.0f); + const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips); + const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips); + + __VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4]; + int local_code_chip_index_; + + const __m128i zeros = _mm_setzero_si128(); + const __m128 code_length_chips_reg_f = _mm_set_ps1((float)code_length_chips); + const __m128i code_length_chips_reg_i = _mm_set1_epi32((int)code_length_chips); + __m128i local_code_chip_index_reg, aux_i, negatives, i; + __m128 aux, aux2, shifts_chips_reg, fi, igx, j, c, cTrunc, base; + + for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++) + { + shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]); + aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg); + __m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f); + for(n = 0; n < quarterPoints; n++) + { + aux = _mm_mul_ps(code_phase_step_chips_reg, indexn); + aux = _mm_add_ps(aux, aux2); + // floor + i = _mm_cvttps_epi32(aux); + fi = _mm_cvtepi32_ps(i); + igx = _mm_cmpgt_ps(fi, aux); + j = _mm_and_ps(igx, ones); + aux = _mm_sub_ps(fi, j); + // fmod + c = _mm_div_ps(aux, code_length_chips_reg_f); + i = _mm_cvttps_epi32(c); + cTrunc = _mm_cvtepi32_ps(i); + base = _mm_mul_ps(cTrunc, code_length_chips_reg_f); + local_code_chip_index_reg = _mm_cvtps_epi32(_mm_sub_ps(aux, base)); + + negatives = _mm_cmplt_epi32(local_code_chip_index_reg, zeros); + aux_i = _mm_and_si128(code_length_chips_reg_i, negatives); + local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i); + _mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg); + for(k = 0; k < 4; ++k) + { + _result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]]; + } + indexn = _mm_add_ps(indexn, fours); + } + for(n = quarterPoints * 4; n < num_points; n++) + { + // resample code for current tap + local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); + //Take into account that in multitap correlators, the shifts can be negative! 
+ if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1) ; + local_code_chip_index_ = local_code_chip_index_ % code_length_chips; + _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; + } + } +} + +#endif + + +#ifdef LV_HAVE_SSE3 +#include +static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_u_sse3(float** result, const float* local_code, float rem_code_phase_chips, float code_phase_step_chips, float* shifts_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_points) +{ + float** _result = result; + const unsigned int quarterPoints = num_points / 4; + int current_correlator_tap; + unsigned int n; + unsigned int k; + const __m128 ones = _mm_set1_ps(1.0f); + const __m128 fours = _mm_set1_ps(4.0f); + const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips); + const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips); + + __VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4]; + int local_code_chip_index_; + + const __m128i zeros = _mm_setzero_si128(); + const __m128 code_length_chips_reg_f = _mm_set_ps1((float)code_length_chips); + const __m128i code_length_chips_reg_i = _mm_set1_epi32((int)code_length_chips); + __m128i local_code_chip_index_reg, aux_i, negatives, i; + __m128 aux, aux2, shifts_chips_reg, fi, igx, j, c, cTrunc, base; + + for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++) + { + shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]); + aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg); + __m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f); + for(n = 0; n < quarterPoints; n++) + { + aux = _mm_mul_ps(code_phase_step_chips_reg, indexn); + aux = _mm_add_ps(aux, aux2); + // floor + i = _mm_cvttps_epi32(aux); + fi = _mm_cvtepi32_ps(i); + igx = _mm_cmpgt_ps(fi, aux); + j = _mm_and_ps(igx, ones); + aux = _mm_sub_ps(fi, j); + // fmod + c = _mm_div_ps(aux, code_length_chips_reg_f); + i = _mm_cvttps_epi32(c); + cTrunc = _mm_cvtepi32_ps(i); + base = _mm_mul_ps(cTrunc, code_length_chips_reg_f); + local_code_chip_index_reg = _mm_cvtps_epi32(_mm_sub_ps(aux, base)); + + negatives = _mm_cmplt_epi32(local_code_chip_index_reg, zeros); + aux_i = _mm_and_si128(code_length_chips_reg_i, negatives); + local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i); + _mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg); + for(k = 0; k < 4; ++k) + { + _result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]]; + } + indexn = _mm_add_ps(indexn, fours); + } + for(n = quarterPoints * 4; n < num_points; n++) + { + // resample code for current tap + local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); + //Take into account that in multitap correlators, the shifts can be negative! 
+ if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1) ; + local_code_chip_index_ = local_code_chip_index_ % code_length_chips; + _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; + } + } +} +#endif + + +#ifdef LV_HAVE_SSE4_1 +#include +static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_a_sse4_1(float** result, const float* local_code, float rem_code_phase_chips, float code_phase_step_chips, float* shifts_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_points) +{ + float** _result = result; + const unsigned int quarterPoints = num_points / 4; + int current_correlator_tap; + unsigned int n; + unsigned int k; + const __m128 fours = _mm_set1_ps(4.0f); + const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips); + const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips); + + __VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4]; + int local_code_chip_index_; + + const __m128i zeros = _mm_setzero_si128(); + const __m128 code_length_chips_reg_f = _mm_set_ps1((float)code_length_chips); + const __m128i code_length_chips_reg_i = _mm_set1_epi32((int)code_length_chips); + __m128i local_code_chip_index_reg, aux_i, negatives, i; + __m128 aux, aux2, shifts_chips_reg, c, cTrunc, base; + + for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++) + { + shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]); + aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg); + __m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f); + for(n = 0; n < quarterPoints; n++) + { + aux = _mm_mul_ps(code_phase_step_chips_reg, indexn); + aux = _mm_add_ps(aux, aux2); + // floor + aux = _mm_floor_ps(aux); + + // fmod + c = _mm_div_ps(aux, code_length_chips_reg_f); + i = _mm_cvttps_epi32(c); + cTrunc = _mm_cvtepi32_ps(i); + base = _mm_mul_ps(cTrunc, code_length_chips_reg_f); + local_code_chip_index_reg = _mm_cvtps_epi32(_mm_sub_ps(aux, base)); + + negatives = _mm_cmplt_epi32(local_code_chip_index_reg, zeros); + aux_i = _mm_and_si128(code_length_chips_reg_i, negatives); + local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i); + _mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg); + for(k = 0; k < 4; ++k) + { + _result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]]; + } + indexn = _mm_add_ps(indexn, fours); + } + for(n = quarterPoints * 4; n < num_points; n++) + { + // resample code for current tap + local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); + //Take into account that in multitap correlators, the shifts can be negative! 
+ if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1) ; + local_code_chip_index_ = local_code_chip_index_ % code_length_chips; + _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; + } + } +} + +#endif + + +#ifdef LV_HAVE_SSE4_1 +#include +static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_u_sse4_1(float** result, const float* local_code, float rem_code_phase_chips, float code_phase_step_chips, float* shifts_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_points) +{ + float** _result = result; + const unsigned int quarterPoints = num_points / 4; + int current_correlator_tap; + unsigned int n; + unsigned int k; + const __m128 fours = _mm_set1_ps(4.0f); + const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips); + const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips); + + __VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4]; + int local_code_chip_index_; + + const __m128i zeros = _mm_setzero_si128(); + const __m128 code_length_chips_reg_f = _mm_set_ps1((float)code_length_chips); + const __m128i code_length_chips_reg_i = _mm_set1_epi32((int)code_length_chips); + __m128i local_code_chip_index_reg, aux_i, negatives, i; + __m128 aux, aux2, shifts_chips_reg, c, cTrunc, base; + + for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++) + { + shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]); + aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg); + __m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f); + for(n = 0; n < quarterPoints; n++) + { + aux = _mm_mul_ps(code_phase_step_chips_reg, indexn); + aux = _mm_add_ps(aux, aux2); + // floor + aux = _mm_floor_ps(aux); + + // fmod + c = _mm_div_ps(aux, code_length_chips_reg_f); + i = _mm_cvttps_epi32(c); + cTrunc = _mm_cvtepi32_ps(i); + base = _mm_mul_ps(cTrunc, code_length_chips_reg_f); + local_code_chip_index_reg = _mm_cvtps_epi32(_mm_sub_ps(aux, base)); + + negatives = _mm_cmplt_epi32(local_code_chip_index_reg, zeros); + aux_i = _mm_and_si128(code_length_chips_reg_i, negatives); + local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i); + _mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg); + for(k = 0; k < 4; ++k) + { + _result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]]; + } + indexn = _mm_add_ps(indexn, fours); + } + for(n = quarterPoints * 4; n < num_points; n++) + { + // resample code for current tap + local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); + //Take into account that in multitap correlators, the shifts can be negative! 
+ if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1) ; + local_code_chip_index_ = local_code_chip_index_ % code_length_chips; + _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; + } + } +} + +#endif + + +#ifdef LV_HAVE_AVX +#include +static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_a_avx(float** result, const float* local_code, float rem_code_phase_chips, float code_phase_step_chips, float* shifts_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_points) +{ + float** _result = result; + const unsigned int avx_iters = num_points / 8; + int current_correlator_tap; + unsigned int n; + unsigned int k; + const __m256 eights = _mm256_set1_ps(8.0f); + const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips); + const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips); + + __VOLK_ATTR_ALIGNED(32) int local_code_chip_index[8]; + int local_code_chip_index_; + + const __m256 zeros = _mm256_setzero_ps(); + const __m256 code_length_chips_reg_f = _mm256_set1_ps((float)code_length_chips); + const __m256 n0 = _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f); + + __m256i local_code_chip_index_reg, i; + __m256 aux, aux2, aux3, shifts_chips_reg, c, cTrunc, base, negatives, indexn; + + for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++) + { + shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]); + aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg); + indexn = n0; + for(n = 0; n < avx_iters; n++) + { + __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0); + __VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3); + aux = _mm256_mul_ps(code_phase_step_chips_reg, indexn); + aux = _mm256_add_ps(aux, aux2); + // floor + aux = _mm256_floor_ps(aux); + + // fmod + c = _mm256_div_ps(aux, code_length_chips_reg_f); + i = _mm256_cvttps_epi32(c); + cTrunc = _mm256_cvtepi32_ps(i); + base = _mm256_mul_ps(cTrunc, code_length_chips_reg_f); + local_code_chip_index_reg = _mm256_cvttps_epi32(_mm256_sub_ps(aux, base)); + + // no negatives + c = _mm256_cvtepi32_ps(local_code_chip_index_reg); + negatives = _mm256_cmp_ps(c, zeros, 0x01 ); + aux3 = _mm256_and_ps(code_length_chips_reg_f, negatives); + aux = _mm256_add_ps(c, aux3); + local_code_chip_index_reg = _mm256_cvttps_epi32(aux); + + _mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg); + for(k = 0; k < 8; ++k) + { + _result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]]; + } + indexn = _mm256_add_ps(indexn, eights); + } + } + _mm256_zeroupper(); + for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++) + { + for(n = avx_iters * 8; n < num_points; n++) + { + // resample code for current tap + local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); + //Take into account that in multitap correlators, the shifts can be negative! 
+ if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1) ; + local_code_chip_index_ = local_code_chip_index_ % code_length_chips; + _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; + } + } +} + +#endif + + +#ifdef LV_HAVE_AVX +#include +static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_u_avx(float** result, const float* local_code, float rem_code_phase_chips, float code_phase_step_chips, float* shifts_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_points) +{ + float** _result = result; + const unsigned int avx_iters = num_points / 8; + int current_correlator_tap; + unsigned int n; + unsigned int k; + const __m256 eights = _mm256_set1_ps(8.0f); + const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips); + const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips); + + __VOLK_ATTR_ALIGNED(32) int local_code_chip_index[8]; + int local_code_chip_index_; + + const __m256 zeros = _mm256_setzero_ps(); + const __m256 code_length_chips_reg_f = _mm256_set1_ps((float)code_length_chips); + const __m256 n0 = _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f); + + __m256i local_code_chip_index_reg, i; + __m256 aux, aux2, aux3, shifts_chips_reg, c, cTrunc, base, negatives, indexn; + + for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++) + { + shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]); + aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg); + indexn = n0; + for(n = 0; n < avx_iters; n++) + { + __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0); + __VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3); + aux = _mm256_mul_ps(code_phase_step_chips_reg, indexn); + aux = _mm256_add_ps(aux, aux2); + // floor + aux = _mm256_floor_ps(aux); + + // fmod + c = _mm256_div_ps(aux, code_length_chips_reg_f); + i = _mm256_cvttps_epi32(c); + cTrunc = _mm256_cvtepi32_ps(i); + base = _mm256_mul_ps(cTrunc, code_length_chips_reg_f); + local_code_chip_index_reg = _mm256_cvttps_epi32(_mm256_sub_ps(aux, base)); + + // no negatives + c = _mm256_cvtepi32_ps(local_code_chip_index_reg); + negatives = _mm256_cmp_ps(c, zeros, 0x01 ); + aux3 = _mm256_and_ps(code_length_chips_reg_f, negatives); + aux = _mm256_add_ps(c, aux3); + local_code_chip_index_reg = _mm256_cvttps_epi32(aux); + + _mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg); + for(k = 0; k < 8; ++k) + { + _result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]]; + } + indexn = _mm256_add_ps(indexn, eights); + } + } + _mm256_zeroupper(); + for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++) + { + for(n = avx_iters * 8; n < num_points; n++) + { + // resample code for current tap + local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); + //Take into account that in multitap correlators, the shifts can be negative! 
+ if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1) ; + local_code_chip_index_ = local_code_chip_index_ % code_length_chips; + _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; + } + } +} + +#endif + + +#ifdef LV_HAVE_NEON +#include + +static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_neon(float** result, const float* local_code, float rem_code_phase_chips, float code_phase_step_chips, float* shifts_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_points) +{ + float** _result = result; + const unsigned int neon_iters = num_points / 4; + int current_correlator_tap; + unsigned int n; + unsigned int k; + const int32x4_t ones = vdupq_n_s32(1); + const float32x4_t fours = vdupq_n_f32(4.0f); + const float32x4_t rem_code_phase_chips_reg = vdupq_n_f32(rem_code_phase_chips); + const float32x4_t code_phase_step_chips_reg = vdupq_n_f32(code_phase_step_chips); + + __VOLK_ATTR_ALIGNED(16) int32_t local_code_chip_index[4]; + int32_t local_code_chip_index_; + + const int32x4_t zeros = vdupq_n_s32(0); + const float32x4_t code_length_chips_reg_f = vdupq_n_f32((float)code_length_chips); + const int32x4_t code_length_chips_reg_i = vdupq_n_s32((int32_t)code_length_chips); + int32x4_t local_code_chip_index_reg, aux_i, negatives, i; + float32x4_t aux, aux2, shifts_chips_reg, fi, c, j, cTrunc, base, indexn, reciprocal; + __VOLK_ATTR_ALIGNED(16) const float vec[4] = { 0.0f, 1.0f, 2.0f, 3.0f }; + uint32x4_t igx; + reciprocal = vrecpeq_f32(code_length_chips_reg_f); + reciprocal = vmulq_f32(vrecpsq_f32(code_length_chips_reg_f, reciprocal), reciprocal); + reciprocal = vmulq_f32(vrecpsq_f32(code_length_chips_reg_f, reciprocal), reciprocal); // this refinement is required! 
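+    // Each vrecpsq_f32/vmulq_f32 pair above performs one Newton-Raphson step on the initial
+    // vrecpeq_f32 estimate (only ~8 bits accurate); two steps are needed so that the
+    // multiply-based fmod emulation below selects the correct chip index.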
+ float32x4_t n0 = vld1q_f32((float*)vec); + + for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++) + { + shifts_chips_reg = vdupq_n_f32((float)shifts_chips[current_correlator_tap]); + aux2 = vsubq_f32(shifts_chips_reg, rem_code_phase_chips_reg); + indexn = n0; + for(n = 0; n < neon_iters; n++) + { + __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][4 * n + 3], 1, 0); + __VOLK_GNSSSDR_PREFETCH(&local_code_chip_index[4]); + aux = vmulq_f32(code_phase_step_chips_reg, indexn); + aux = vaddq_f32(aux, aux2); + + //floor + i = vcvtq_s32_f32(aux); + fi = vcvtq_f32_s32(i); + igx = vcgtq_f32(fi, aux); + j = vcvtq_f32_s32(vandq_s32(vreinterpretq_s32_u32(igx), ones)); + aux = vsubq_f32(fi, j); + + // fmod + c = vmulq_f32(aux, reciprocal); + i = vcvtq_s32_f32(c); + cTrunc = vcvtq_f32_s32(i); + base = vmulq_f32(cTrunc, code_length_chips_reg_f); + aux = vsubq_f32(aux, base); + local_code_chip_index_reg = vcvtq_s32_f32(aux); + + negatives = vreinterpretq_s32_u32(vcltq_s32(local_code_chip_index_reg, zeros)); + aux_i = vandq_s32(code_length_chips_reg_i, negatives); + local_code_chip_index_reg = vaddq_s32(local_code_chip_index_reg, aux_i); + + vst1q_s32((int32_t*)local_code_chip_index, local_code_chip_index_reg); + + for(k = 0; k < 4; ++k) + { + _result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]]; + } + indexn = vaddq_f32(indexn, fours); + } + for(n = neon_iters * 4; n < num_points; n++) + { + __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][n], 1, 0); + // resample code for current tap + local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); + //Take into account that in multitap correlators, the shifts can be negative! + if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); + local_code_chip_index_ = local_code_chip_index_ % code_length_chips; + _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; + } + } +} + +#endif + +#endif /*INCLUDED_volk_gnsssdr_32f_xn_resampler_32f_xn_H*/ + + diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn.h new file mode 100644 index 000000000..544a83990 --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn.h @@ -0,0 +1,320 @@ +/*! + * \file volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn.h + * \brief VOLK_GNSSSDR kernel: multiplies N complex (32-bit float per component) vectors + * by a common vector, phase rotated and accumulates the results in N float complex outputs. + * \authors
<ul>
+ *          <li> Cillian O'Driscoll, 2016. cillian.odriscoll(at)gmail.com
+ *          </ul>
+ * + * VOLK_GNSSSDR kernel that multiplies N 32 bits complex vectors by a common vector, which is + * phase-rotated by phase offset and phase increment, and accumulates the results + * in N 32 bits float complex outputs. + * It is optimized to perform the N tap correlation process in GNSS receivers. + * + * ------------------------------------------------------------------------- + * + * Copyright (C) 2010-2016 (see AUTHORS file for a list of contributors) + * + * GNSS-SDR is a software defined Global Navigation + * Satellite Systems receiver + * + * This file is part of GNSS-SDR. + * + * GNSS-SDR is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * GNSS-SDR is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNSS-SDR. If not, see . + * + * ------------------------------------------------------------------------- + */ + +/*! + * \page volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn + * + * \b Overview + * + * Rotates and multiplies the reference complex vector with an arbitrary number of other real vectors, + * accumulates the results and stores them in the output vector. + * The rotation is done at a fixed rate per sample, from an initial \p phase offset. + * This function can be used for Doppler wipe-off and multiple correlator. + * + * Dispatcher Prototype + * \code + * void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn(lv_32fc_t* result, const lv_32fc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const float** in_a, int num_a_vectors, unsigned int num_points); + * \endcode + * + * \b Inputs + * \li in_common: Pointer to one of the vectors to be rotated, multiplied and accumulated (reference vector). + * \li phase_inc: Phase increment = lv_cmake(cos(phase_step_rad), sin(phase_step_rad)) + * \li phase: Initial phase = lv_cmake(cos(initial_phase_rad), sin(initial_phase_rad)) + * \li in_a: Pointer to an array of pointers to multiple vectors to be multiplied and accumulated. + * \li num_a_vectors: Number of vectors to be multiplied by the reference vector and accumulated. + * \li num_points: Number of complex values to be multiplied together, accumulated and stored into \p result. + * + * \b Outputs + * \li phase: Final phase. + * \li result: Vector of \p num_a_vectors components with the multiple vectors of \p in_a rotated, multiplied by \p in_common and accumulated. 
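+ *
+ * \b Example
+ * A minimal usage sketch; the buffer names (baseband, early, prompt, late) and the phase
+ * values are placeholders, not part of the API:
+ * \code
+ * lv_32fc_t out[3];
+ * lv_32fc_t phase = lv_cmake(cosf(initial_phase_rad), sinf(initial_phase_rad));
+ * const lv_32fc_t phase_inc = lv_cmake(cosf(phase_step_rad), sinf(phase_step_rad));
+ * const float* codes[3] = { early, prompt, late };
+ * volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn(out, baseband, phase_inc, &phase, codes, 3, num_points);
+ * // out[k] accumulates baseband[n] * rotator(n) * codes[k][n] over n; phase returns the final rotator value.
+ * \endcode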
+ * + */ + +#ifndef INCLUDED_volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_H +#define INCLUDED_volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_H + + +#include +#include +#include +#include +#include +#include + +#ifdef LV_HAVE_GENERIC + +static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic(lv_32fc_t* result, const lv_32fc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const float** in_a, int num_a_vectors, unsigned int num_points) +{ + lv_32fc_t tmp32_1, tmp32_2; + int n_vec; + unsigned int n; + for (n_vec = 0; n_vec < num_a_vectors; n_vec++) + { + result[n_vec] = lv_cmake(0,0); + } + for (n = 0; n < num_points; n++) + { + tmp32_1 = *in_common++ * (*phase);//if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); + + // Regenerate phase + if (n % 256 == 0) + { + //printf("Phase before regeneration %i: %f,%f Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase)); +#ifdef __cplusplus + (*phase) /= std::abs((*phase)); +#else + (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); +#endif + //printf("Phase after regeneration %i: %f,%f Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase)); + } + + (*phase) *= phase_inc; + for (n_vec = 0; n_vec < num_a_vectors; n_vec++) + { + tmp32_2 = tmp32_1 * in_a[n_vec][n]; + result[n_vec] += tmp32_2; + } + } +} + +#endif /*LV_HAVE_GENERIC*/ + + +#ifdef LV_HAVE_GENERIC + +static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic_reload(lv_32fc_t* result, const lv_32fc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const float** in_a, int num_a_vectors, unsigned int num_points) +{ + lv_32fc_t tmp32_1, tmp32_2; + const unsigned int ROTATOR_RELOAD = 256; + int n_vec; + unsigned int n; + unsigned int j; + for (n_vec = 0; n_vec < num_a_vectors; n_vec++) + { + result[n_vec] = lv_cmake(0,0); + } + + for (n = 0; n < num_points / ROTATOR_RELOAD; n++) + { + for (j = 0; j < ROTATOR_RELOAD; j++) + { + tmp32_1 = *in_common++ * (*phase); + (*phase) *= phase_inc; + for (n_vec = 0; n_vec < num_a_vectors; n_vec++) + { + tmp32_2 = tmp32_1 * in_a[n_vec][n * ROTATOR_RELOAD + j]; + result[n_vec] += tmp32_2; + } + } + /* Regenerate phase */ +#ifdef __cplusplus + (*phase) /= std::abs((*phase)); +#else + //(*phase) /= cabsf((*phase)); + (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); +#endif + } + + for (j = 0; j < num_points % ROTATOR_RELOAD; j++) + { + tmp32_1 = *in_common++ * (*phase); + (*phase) *= phase_inc; + for (n_vec = 0; n_vec < num_a_vectors; n_vec++) + { + tmp32_2 = tmp32_1 * in_a[n_vec][(num_points / ROTATOR_RELOAD) * ROTATOR_RELOAD + j]; + result[n_vec] += tmp32_2; + } + } +} + +#endif /*LV_HAVE_GENERIC*/ + +#ifdef LV_HAVE_AVX +#include +#include +static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_t* result, const lv_32fc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const float** in_a, int num_a_vectors, unsigned int num_points) +{ + unsigned int number = 0; + unsigned int vec_ind = 0; + unsigned int i = 0; + const unsigned int sixteenthPoints = num_points / 16; + + const float* aPtr = (float*)in_common; + const float* bPtr[ num_a_vectors]; + for( vec_ind = 0; vec_ind < num_a_vectors; ++vec_ind ){ + bPtr[vec_ind] = in_a[vec_ind]; + } + + lv_32fc_t _phase = (*phase); + lv_32fc_t wo; + + __m256 a0Val, a1Val, a2Val, a3Val; + __m256 b0Val[num_a_vectors], b1Val[num_a_vectors], b2Val[num_a_vectors], b3Val[num_a_vectors]; + __m256 x0Val[num_a_vectors], x1Val[num_a_vectors], x0loVal[num_a_vectors], 
x0hiVal[num_a_vectors], x1loVal[num_a_vectors], x1hiVal[num_a_vectors]; + __m256 c0Val[num_a_vectors], c1Val[num_a_vectors], c2Val[num_a_vectors], c3Val[num_a_vectors]; + + __m256 dotProdVal0[num_a_vectors]; + __m256 dotProdVal1[num_a_vectors]; + __m256 dotProdVal2[num_a_vectors]; + __m256 dotProdVal3[num_a_vectors]; + + for( vec_ind = 0; vec_ind < num_a_vectors; vec_ind++ ){ + dotProdVal0[vec_ind] = _mm256_setzero_ps(); + dotProdVal1[vec_ind] = _mm256_setzero_ps(); + dotProdVal2[vec_ind] = _mm256_setzero_ps(); + dotProdVal3[vec_ind] = _mm256_setzero_ps(); + } + + // Set up the complex rotator + __m256 z0, z1, z2, z3; + __attribute__((aligned(32))) lv_32fc_t phase_vec[16]; + for( vec_ind = 0; vec_ind < 16; ++vec_ind ){ + phase_vec[vec_ind] = _phase; + _phase *= phase_inc; + } + + z0 = _mm256_load_ps( (float *)phase_vec ); + z1 = _mm256_load_ps( (float *)(phase_vec + 4) ); + z2 = _mm256_load_ps( (float *)(phase_vec + 8) ); + z3 = _mm256_load_ps( (float *)(phase_vec + 12) ); + + lv_32fc_t dz = phase_inc; dz *= dz; dz *= dz; dz *= dz; dz *= dz; // dz = phase_inc^16; + + for( vec_ind = 0; vec_ind < 4; ++vec_ind ){ + phase_vec[vec_ind] = dz; + } + + __m256 dz_reg = _mm256_load_ps( (float *)phase_vec ); + dz_reg = _mm256_complexnormalise_ps( dz_reg ); + + for(;number < sixteenthPoints; number++){ + + a0Val = _mm256_loadu_ps(aPtr); + a1Val = _mm256_loadu_ps(aPtr+8); + a2Val = _mm256_loadu_ps(aPtr+16); + a3Val = _mm256_loadu_ps(aPtr+24); + + a0Val = _mm256_complexmul_ps( a0Val, z0 ); + a1Val = _mm256_complexmul_ps( a1Val, z1 ); + a2Val = _mm256_complexmul_ps( a2Val, z2 ); + a3Val = _mm256_complexmul_ps( a3Val, z3 ); + + z0 = _mm256_complexmul_ps( z0, dz_reg ); + z1 = _mm256_complexmul_ps( z1, dz_reg ); + z2 = _mm256_complexmul_ps( z2, dz_reg ); + z3 = _mm256_complexmul_ps( z3, dz_reg ); + + + for( vec_ind = 0; vec_ind < num_a_vectors; ++vec_ind ){ + x0Val[vec_ind] = _mm256_loadu_ps(bPtr[vec_ind]); // t0|t1|t2|t3|t4|t5|t6|t7 + x1Val[vec_ind] = _mm256_loadu_ps(bPtr[vec_ind]+8); + x0loVal[vec_ind] = _mm256_unpacklo_ps(x0Val[vec_ind], x0Val[vec_ind]); // t0|t0|t1|t1|t4|t4|t5|t5 + x0hiVal[vec_ind] = _mm256_unpackhi_ps(x0Val[vec_ind], x0Val[vec_ind]); // t2|t2|t3|t3|t6|t6|t7|t7 + x1loVal[vec_ind] = _mm256_unpacklo_ps(x1Val[vec_ind], x1Val[vec_ind]); + x1hiVal[vec_ind] = _mm256_unpackhi_ps(x1Val[vec_ind], x1Val[vec_ind]); + + // TODO: it may be possible to rearrange swizzling to better pipeline data + b0Val[vec_ind] = _mm256_permute2f128_ps(x0loVal[vec_ind], x0hiVal[vec_ind], 0x20); // t0|t0|t1|t1|t2|t2|t3|t3 + b1Val[vec_ind] = _mm256_permute2f128_ps(x0loVal[vec_ind], x0hiVal[vec_ind], 0x31); // t4|t4|t5|t5|t6|t6|t7|t7 + b2Val[vec_ind] = _mm256_permute2f128_ps(x1loVal[vec_ind], x1hiVal[vec_ind], 0x20); + b3Val[vec_ind] = _mm256_permute2f128_ps(x1loVal[vec_ind], x1hiVal[vec_ind], 0x31); + + c0Val[vec_ind] = _mm256_mul_ps(a0Val, b0Val[vec_ind]); + c1Val[vec_ind] = _mm256_mul_ps(a1Val, b1Val[vec_ind]); + c2Val[vec_ind] = _mm256_mul_ps(a2Val, b2Val[vec_ind]); + c3Val[vec_ind] = _mm256_mul_ps(a3Val, b3Val[vec_ind]); + + dotProdVal0[vec_ind] = _mm256_add_ps(c0Val[vec_ind], dotProdVal0[vec_ind]); + dotProdVal1[vec_ind] = _mm256_add_ps(c1Val[vec_ind], dotProdVal1[vec_ind]); + dotProdVal2[vec_ind] = _mm256_add_ps(c2Val[vec_ind], dotProdVal2[vec_ind]); + dotProdVal3[vec_ind] = _mm256_add_ps(c3Val[vec_ind], dotProdVal3[vec_ind]); + + bPtr[vec_ind] += 16; + } + + // Force the rotators back onto the unit circle + if ((number % 64) == 0) + { + z0 = _mm256_complexnormalise_ps( z0 ); + z1 = _mm256_complexnormalise_ps( 
z1 ); + z2 = _mm256_complexnormalise_ps( z2 ); + z3 = _mm256_complexnormalise_ps( z3 ); + } + + aPtr += 32; + } + __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4]; + + for( vec_ind = 0; vec_ind < num_a_vectors; ++vec_ind ){ + dotProdVal0[vec_ind] = _mm256_add_ps(dotProdVal0[vec_ind], dotProdVal1[vec_ind]); + dotProdVal0[vec_ind] = _mm256_add_ps(dotProdVal0[vec_ind], dotProdVal2[vec_ind]); + dotProdVal0[vec_ind] = _mm256_add_ps(dotProdVal0[vec_ind], dotProdVal3[vec_ind]); + + _mm256_store_ps((float *)dotProductVector,dotProdVal0[vec_ind]); // Store the results back into the dot product vector + + result[ vec_ind ] = lv_cmake( 0, 0 ); + for( i = 0; i < 4; ++i ){ + result[vec_ind] += dotProductVector[i]; + } + } + + z0 = _mm256_complexnormalise_ps( z0 ); + _mm256_store_ps((float*)phase_vec, z0); + _phase = phase_vec[0]; + _mm256_zeroupper(); + + + number = sixteenthPoints*16; + for(;number < num_points; number++){ + wo = (*aPtr++)*_phase; + _phase *= phase_inc; + + for( vec_ind = 0; vec_ind < num_a_vectors; ++vec_ind ){ + result[vec_ind] += wo * in_a[vec_ind][number]; + } + } + + *phase = _phase; + +} + +#endif /* LV_HAVE_AVX */ + +#endif /* INCLUDED_volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_H */ + + diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc.h new file mode 100644 index 000000000..a0284f65d --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc.h @@ -0,0 +1,132 @@ +/*! + * \file volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc.h + * \brief Volk puppet for the multiple 16-bit complex dot product kernel. + * \authors
+ *          Carles Fernandez Prades, 2016. cfernandez(at)cttc.cat
+ *
+ * + * Volk puppet for integrating the resampler into volk's test system + * + * ------------------------------------------------------------------------- + * + * Copyright (C) 2010-2015 (see AUTHORS file for a list of contributors) + * + * GNSS-SDR is a software defined Global Navigation + * Satellite Systems receiver + * + * This file is part of GNSS-SDR. + * + * GNSS-SDR is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * GNSS-SDR is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNSS-SDR. If not, see . + * + * ------------------------------------------------------------------------- + */ + +#ifndef INCLUDED_volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_H +#define INCLUDED_volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_H + +#include "volk_gnsssdr/volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn.h" +#include +#include +#include + +#ifdef LV_HAVE_GENERIC + +static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_generic(lv_32fc_t* result, const lv_32fc_t* local_code, const float* in, unsigned int num_points) +{ + // phases must be normalized. Phase rotator expects a complex exponential input! + float rem_carrier_phase_in_rad = 0.25; + float phase_step_rad = 0.1; + lv_32fc_t phase[1]; + phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad)); + lv_32fc_t phase_inc[1]; + phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad)); + unsigned int n; + int num_a_vectors = 3; + float ** in_a = (float **)volk_gnsssdr_malloc(sizeof(float *) * num_a_vectors, volk_gnsssdr_get_alignment()); + for(n = 0; n < num_a_vectors; n++) + { + in_a[n] = (float *)volk_gnsssdr_malloc(sizeof(float ) * num_points, volk_gnsssdr_get_alignment()); + memcpy((float*)in_a[n], (float*)in, sizeof(float) * num_points); + } + volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic(result, local_code, phase_inc[0], phase, (const float**) in_a, num_a_vectors, num_points); + + for(n = 0; n < num_a_vectors; n++) + { + volk_gnsssdr_free(in_a[n]); + } + volk_gnsssdr_free(in_a); +} +#endif // Generic + + +#ifdef LV_HAVE_GENERIC +static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_generic_reload(lv_32fc_t* result, const lv_32fc_t* local_code, const float* in, unsigned int num_points) +{ + // phases must be normalized. Phase rotator expects a complex exponential input! 
+ float rem_carrier_phase_in_rad = 0.25; + float phase_step_rad = 0.1; + lv_32fc_t phase[1]; + phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad)); + lv_32fc_t phase_inc[1]; + phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad)); + unsigned int n; + int num_a_vectors = 3; + float ** in_a = (float **)volk_gnsssdr_malloc(sizeof(float *) * num_a_vectors, volk_gnsssdr_get_alignment()); + for(n = 0; n < num_a_vectors; n++) + { + in_a[n] = (float *)volk_gnsssdr_malloc(sizeof(float ) * num_points, volk_gnsssdr_get_alignment()); + memcpy((float*)in_a[n], (float*)in, sizeof(float) * num_points); + } + volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic_reload(result, local_code, phase_inc[0], phase, (const float**) in_a, num_a_vectors, num_points); + + for(n = 0; n < num_a_vectors; n++) + { + volk_gnsssdr_free(in_a[n]); + } + volk_gnsssdr_free(in_a); +} + +#endif // Generic + +#ifdef LV_HAVE_AVX +static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_u_avx(lv_32fc_t* result, const lv_32fc_t* local_code, const float* in, unsigned int num_points) +{ + // phases must be normalized. Phase rotator expects a complex exponential input! + float rem_carrier_phase_in_rad = 0.25; + float phase_step_rad = 0.1; + lv_32fc_t phase[1]; + phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad)); + lv_32fc_t phase_inc[1]; + phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad)); + unsigned int n; + int num_a_vectors = 3; + float ** in_a = (float **)volk_gnsssdr_malloc(sizeof(float *) * num_a_vectors, volk_gnsssdr_get_alignment()); + for(n = 0; n < num_a_vectors; n++) + { + in_a[n] = (float *)volk_gnsssdr_malloc(sizeof(float ) * num_points, volk_gnsssdr_get_alignment()); + memcpy((float*)in_a[n], (float*)in, sizeof(float) * num_points); + } + volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_u_avx(result, local_code, phase_inc[0], phase, (const float**) in_a, num_a_vectors, num_points); + + for(n = 0; n < num_a_vectors; n++) + { + volk_gnsssdr_free(in_a[n]); + } + volk_gnsssdr_free(in_a); +} + +#endif // AVX + +#endif // INCLUDED_volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_H + diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_resamplerxnpuppet_32fc.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_resamplerxnpuppet_32fc.h index f89a1461b..9348c09fc 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_resamplerxnpuppet_32fc.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_resamplerxnpuppet_32fc.h @@ -46,8 +46,8 @@ #ifdef LV_HAVE_GENERIC static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_generic(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points) { - float code_phase_step_chips = -0.6; - int code_length_chips = 1023; + int code_length_chips = 2046; + float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; @@ -70,14 +70,15 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_generic(lv_32fc_t* r volk_gnsssdr_free(result_aux); } + #endif /* LV_HAVE_GENERIC */ #ifdef LV_HAVE_SSE3 static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_a_sse3(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points) { - float code_phase_step_chips = -0.6; - int 
code_length_chips = 1023; + int code_length_chips = 2046; + float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; @@ -105,8 +106,8 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_a_sse3(lv_32fc_t* re #ifdef LV_HAVE_SSE3 static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_u_sse3(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points) { - float code_phase_step_chips = -0.6; - int code_length_chips = 1023; + int code_length_chips = 2046; + float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; @@ -135,8 +136,8 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_u_sse3(lv_32fc_t* re #ifdef LV_HAVE_SSE4_1 static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_u_sse4_1(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points) { - float code_phase_step_chips = -0.6; - int code_length_chips = 1023; + int code_length_chips = 2046; + float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; @@ -164,8 +165,8 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_u_sse4_1(lv_32fc_t* #ifdef LV_HAVE_SSE4_1 static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_a_sse4_1(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points) { - float code_phase_step_chips = -0.6; - int code_length_chips = 1023; + int code_length_chips = 2046; + float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; @@ -193,8 +194,8 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_a_sse4_1(lv_32fc_t* #ifdef LV_HAVE_AVX static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_a_avx(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points) { - float code_phase_step_chips = -0.6; - int code_length_chips = 1023; + int code_length_chips = 2046; + float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; @@ -222,8 +223,8 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_a_avx(lv_32fc_t* res #ifdef LV_HAVE_AVX static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_u_avx(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points) { - float code_phase_step_chips = -0.6; - int code_length_chips = 1023; + int code_length_chips = 2046; + float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; @@ -251,8 +252,8 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_u_avx(lv_32fc_t* res #ifdef LV_HAVE_AVX2 static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_a_avx2(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points) { - float code_phase_step_chips = -0.6; - int code_length_chips = 1023; + int code_length_chips = 2046; + float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; @@ -280,8 +281,8 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_a_avx2(lv_32fc_t* re #ifdef 
LV_HAVE_AVX2 static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_u_avx2(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points) { - float code_phase_step_chips = -0.6; - int code_length_chips = 1023; + int code_length_chips = 2046; + float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; @@ -309,8 +310,8 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_u_avx2(lv_32fc_t* re #ifdef LV_HAVE_NEON static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_neon(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points) { - float code_phase_step_chips = -0.6; - int code_length_chips = 1023; + int code_length_chips = 2046; + float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/kernel_tests.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/kernel_tests.h index de3284a06..da0b2f0f8 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/kernel_tests.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/kernel_tests.h @@ -89,10 +89,14 @@ std::vector init_test_list(volk_gnsssdr_test_params_t (VOLK_INIT_PUPP(volk_gnsssdr_16ic_resamplerfastpuppet_16ic, volk_gnsssdr_16ic_resampler_fast_16ic, test_params)) (VOLK_INIT_PUPP(volk_gnsssdr_16ic_resamplerfastxnpuppet_16ic, volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn, test_params)) (VOLK_INIT_PUPP(volk_gnsssdr_16ic_resamplerxnpuppet_16ic, volk_gnsssdr_16ic_xn_resampler_16ic_xn, test_params)) + (VOLK_INIT_PUPP(volk_gnsssdr_16i_resamplerxnpuppet_16i, volk_gnsssdr_16i_xn_resampler_16i_xn, test_params)) (VOLK_INIT_PUPP(volk_gnsssdr_32fc_resamplerxnpuppet_32fc, volk_gnsssdr_32fc_xn_resampler_32fc_xn, test_params)) + (VOLK_INIT_PUPP(volk_gnsssdr_32f_resamplerxnpuppet_32f, volk_gnsssdr_32f_xn_resampler_32f_xn, test_params)) (VOLK_INIT_PUPP(volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic, volk_gnsssdr_16ic_x2_dot_prod_16ic_xn, test_params)) (VOLK_INIT_PUPP(volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic, volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn, test_params_int16)) + (VOLK_INIT_PUPP(volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic, volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn, test_params_int16)) (VOLK_INIT_PUPP(volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc, volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn, test_params_int1)) + (VOLK_INIT_PUPP(volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc, volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn, test_params_int1)) ; return test_cases; diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/qa_utils.cc b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/qa_utils.cc index fc0518a65..8019b3a8a 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/qa_utils.cc +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/qa_utils.cc @@ -717,7 +717,7 @@ bool run_volk_gnsssdr_tests(volk_gnsssdr_func_desc_t desc, { if(both_sigs[j].is_signed) { - fail = icompare((int8_t *) test_data[generic_offset][j], (int8_t *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol_i); + fail = icompare((int16_t *) test_data[generic_offset][j], (int16_t *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 
2 : 1), tol_i); } else { From e87522880e290652c3a53e29a1add2670a9eab92 Mon Sep 17 00:00:00 2001 From: Cillian O'Driscoll Date: Mon, 11 Sep 2017 15:21:05 +0100 Subject: [PATCH 2/4] Added ability to generate real valued codes Only done for GPS L1 C/A and Galileo E1 OS for now. Also added a cpu_multicorrelator_real_codes class that performs code correlation using real-valued local codes --- .../libs/galileo_e1_signal_processing.cc | 83 ++++++--- .../libs/galileo_e1_signal_processing.h | 28 +-- src/algorithms/libs/gnss_signal_processing.cc | 22 +++ src/algorithms/libs/gnss_signal_processing.h | 7 + .../libs/gps_sdr_signal_processing.cc | 33 +++- .../libs/gps_sdr_signal_processing.h | 6 + src/algorithms/tracking/libs/CMakeLists.txt | 1 + .../libs/cpu_multicorrelator_real_codes.cc | 147 +++++++++++++++ .../libs/cpu_multicorrelator_real_codes.h | 70 +++++++ .../cpu_multicorrelator_real_codes_test.cc | 174 ++++++++++++++++++ .../tracking/cpu_multicorrelator_test.cc | 3 + 11 files changed, 530 insertions(+), 44 deletions(-) create mode 100644 src/algorithms/tracking/libs/cpu_multicorrelator_real_codes.cc create mode 100644 src/algorithms/tracking/libs/cpu_multicorrelator_real_codes.h create mode 100644 src/tests/unit-tests/signal-processing-blocks/tracking/cpu_multicorrelator_real_codes_test.cc diff --git a/src/algorithms/libs/galileo_e1_signal_processing.cc b/src/algorithms/libs/galileo_e1_signal_processing.cc index 93fef2f50..46738faad 100644 --- a/src/algorithms/libs/galileo_e1_signal_processing.cc +++ b/src/algorithms/libs/galileo_e1_signal_processing.cc @@ -73,7 +73,7 @@ void galileo_e1_code_gen_int(int* _dest, char _Signal[3], signed int _prn) -void galileo_e1_sinboc_11_gen(std::complex* _dest, int* _prn, unsigned int _length_out) +void galileo_e1_sinboc_11_gen_int(int* _dest, int* _prn, unsigned int _length_out) { const unsigned int _length_in = Galileo_E1_B_CODE_LENGTH_CHIPS; unsigned int _period = static_cast( _length_out / _length_in ); @@ -81,18 +81,18 @@ void galileo_e1_sinboc_11_gen(std::complex* _dest, int* _prn, unsigned in { for (unsigned int j = 0; j < (_period / 2); j++) { - _dest[i * _period + j] = std::complex(static_cast(_prn[i]), 0.0); + _dest[i * _period + j] = _prn[i]; } for (unsigned int j = (_period / 2); j < _period; j++) { - _dest[i * _period + j] = std::complex(static_cast(- _prn[i]), 0.0); + _dest[i * _period + j] = - _prn[i]; } } } -void galileo_e1_sinboc_61_gen(std::complex* _dest, int* _prn, unsigned int _length_out) +void galileo_e1_sinboc_61_gen_int(int* _dest, int* _prn, unsigned int _length_out) { const unsigned int _length_in = Galileo_E1_B_CODE_LENGTH_CHIPS; unsigned int _period = static_cast(_length_out / _length_in); @@ -101,42 +101,44 @@ void galileo_e1_sinboc_61_gen(std::complex* _dest, int* _prn, unsigned in { for (unsigned int j = 0; j < _period; j += 2) { - _dest[i * _period + j] = std::complex(static_cast(_prn[i]), 0.0); + _dest[i * _period + j] = _prn[i]; } for (unsigned int j = 1; j < _period; j += 2) { - _dest[i * _period + j] = std::complex(static_cast(- _prn[i]), 0.0); + _dest[i * _period + j] = - _prn[i]; } } } -void galileo_e1_gen(std::complex* _dest, int* _prn, char _Signal[3]) +void galileo_e1_gen_float(float* _dest, int* _prn, char _Signal[3]) { std::string _galileo_signal = _Signal; const unsigned int _codeLength = 12 * Galileo_E1_B_CODE_LENGTH_CHIPS; const float alpha = sqrt(10.0 / 11.0); const float beta = sqrt(1.0 / 11.0); - std::complex sinboc_11[12 * 4092]; // _codeLength not accepted by Clang - std::complex sinboc_61[12 * 4092]; + int 
sinboc_11[12 * 4092]; // _codeLength not accepted by Clang + int sinboc_61[12 * 4092]; - galileo_e1_sinboc_11_gen(sinboc_11, _prn, _codeLength); //generate sinboc(1,1) 12 samples per chip - galileo_e1_sinboc_61_gen(sinboc_61, _prn, _codeLength); //generate sinboc(6,1) 12 samples per chip + galileo_e1_sinboc_11_gen_int(sinboc_11, _prn, _codeLength); //generate sinboc(1,1) 12 samples per chip + galileo_e1_sinboc_61_gen_int(sinboc_61, _prn, _codeLength); //generate sinboc(6,1) 12 samples per chip if (_galileo_signal.rfind("1B") != std::string::npos && _galileo_signal.length() >= 2) { for (unsigned int i = 0; i < _codeLength; i++) { - _dest[i] = alpha * sinboc_11[i] + beta * sinboc_61[i]; + _dest[i] = alpha * static_cast(sinboc_11[i]) + + beta * static_cast(sinboc_61[i]); } } else if (_galileo_signal.rfind("1C") != std::string::npos && _galileo_signal.length() >= 2) { for (unsigned int i = 0; i < _codeLength; i++) { - _dest[i] = alpha * sinboc_11[i] - beta * sinboc_61[i]; + _dest[i] = alpha * static_cast(sinboc_11[i]) - + beta * static_cast(sinboc_61[i]); } } else @@ -144,8 +146,7 @@ void galileo_e1_gen(std::complex* _dest, int* _prn, char _Signal[3]) } - -void galileo_e1_code_gen_complex_sampled(std::complex* _dest, char _Signal[3], +void galileo_e1_code_gen_float_sampled(float* _dest, char _Signal[3], bool _cboc, unsigned int _prn, signed int _fs, unsigned int _chip_shift, bool _secondary_flag) { @@ -164,23 +165,29 @@ void galileo_e1_code_gen_complex_sampled(std::complex* _dest, char _Signa galileo_e1_code_gen_int(primary_code_E1_chips, _Signal, _prn); //generate Galileo E1 code, 1 sample per chip - std::complex* _signal_E1; + float* _signal_E1; _codeLength = _samplesPerChip * Galileo_E1_B_CODE_LENGTH_CHIPS; - _signal_E1 = new std::complex[_codeLength]; + _signal_E1 = new float[_codeLength]; if (_cboc == true) { - galileo_e1_gen(_signal_E1, primary_code_E1_chips, _Signal); //generate cboc 12 samples per chip + galileo_e1_gen_float(_signal_E1, primary_code_E1_chips, _Signal); //generate cboc 12 samples per chip } else { - galileo_e1_sinboc_11_gen(_signal_E1, primary_code_E1_chips, _codeLength); //generate sinboc(1,1) 2 samples per chip + int _signal_E1_int[_codeLength]; + galileo_e1_sinboc_11_gen_int(_signal_E1_int, primary_code_E1_chips, _codeLength); //generate sinboc(1,1) 2 samples per chip + + for( unsigned int ii = 0; ii < _codeLength; ++ii ) + { + _signal_E1[ii] = static_cast< float >( _signal_E1_int[ii] ); + } } if (_fs != _samplesPerChip * _codeFreqBasis) { - std::complex* _resampled_signal = new std::complex[_samplesPerCode]; + float* _resampled_signal = new float[_samplesPerCode]; resampler(_signal_E1, _resampled_signal, _samplesPerChip * _codeFreqBasis, _fs, _codeLength, _samplesPerCode); //resamples code to fs @@ -192,8 +199,8 @@ void galileo_e1_code_gen_complex_sampled(std::complex* _dest, char _Signa if (_galileo_signal.rfind("1C") != std::string::npos && _galileo_signal.length() >= 2 && _secondary_flag) { - std::complex* _signal_E1C_secondary = new std::complex - [static_cast(Galileo_E1_C_SECONDARY_CODE_LENGTH) + float* _signal_E1C_secondary = new float[ + static_cast(Galileo_E1_C_SECONDARY_CODE_LENGTH) * _samplesPerCode]; for (unsigned int i = 0; i < static_cast(Galileo_E1_C_SECONDARY_CODE_LENGTH); i++) @@ -202,7 +209,7 @@ void galileo_e1_code_gen_complex_sampled(std::complex* _dest, char _Signa { _signal_E1C_secondary[i*_samplesPerCode + k] = _signal_E1[k] * (Galileo_E1_C_SECONDARY_CODE.at(i) == '0' - ? std::complex(1,0) : std::complex(-1,0)); + ? 
1.0f : -1.0f); } } @@ -220,6 +227,36 @@ void galileo_e1_code_gen_complex_sampled(std::complex* _dest, char _Signa delete[] _signal_E1; } +void galileo_e1_code_gen_complex_sampled(std::complex* _dest, char _Signal[3], + bool _cboc, unsigned int _prn, signed int _fs, unsigned int _chip_shift, + bool _secondary_flag) +{ + std::string _galileo_signal = _Signal; + const int _codeFreqBasis = Galileo_E1_CODE_CHIP_RATE_HZ; //Hz + unsigned int _samplesPerCode = static_cast( + static_cast(_fs) / (static_cast(_codeFreqBasis ) + / static_cast(Galileo_E1_B_CODE_LENGTH_CHIPS))); + + if (_galileo_signal.rfind("1C") != std::string::npos && _galileo_signal.length() >= 2 && _secondary_flag) + { + _samplesPerCode *= static_cast(Galileo_E1_C_SECONDARY_CODE_LENGTH); + } + + float real_code[_samplesPerCode]; + + galileo_e1_code_gen_float_sampled( real_code, _Signal, _cboc, _prn, _fs, _chip_shift, _secondary_flag ); + + for( unsigned int ii = 0; ii < _samplesPerCode; ++ii ) + { + _dest[ii] = std::complex< float >( real_code[ii], 0.0f ); + } +} + +void galileo_e1_code_gen_float_sampled(float* _dest, char _Signal[3], + bool _cboc, unsigned int _prn, signed int _fs, unsigned int _chip_shift) +{ + galileo_e1_code_gen_float_sampled(_dest, _Signal, _cboc, _prn, _fs, _chip_shift, false); +} void galileo_e1_code_gen_complex_sampled(std::complex* _dest, char _Signal[3], bool _cboc, unsigned int _prn, signed int _fs, unsigned int _chip_shift) diff --git a/src/algorithms/libs/galileo_e1_signal_processing.h b/src/algorithms/libs/galileo_e1_signal_processing.h index d7518ab04..f7c670129 100644 --- a/src/algorithms/libs/galileo_e1_signal_processing.h +++ b/src/algorithms/libs/galileo_e1_signal_processing.h @@ -36,30 +36,22 @@ /*! - * \brief This function generates Galileo E1 code (one sample per chip). + * \brief This function generates Galileo E1 code (can select E1B or E1C, cboc or sinboc + * and the sample frequency _fs). * */ -void galileo_e1_code_gen_int(int* _dest, char _Signal[3], signed int _prn); +void galileo_e1_code_gen_float_sampled(float* _dest, char _Signal[3], + bool _cboc, unsigned int _prn, signed int _fs, unsigned int _chip_shift, + bool _secondary_flag); /*! - * \brief This function generates Galileo E1 sinboc(1,1) code (minimum 2 samples per chip), - * the _codeLength variable must be a multiple of 2*4092. + * \brief This function generates Galileo E1 code (can select E1B or E1C, cboc or sinboc + * and the sample frequency _fs). * */ -void galileo_e1_sinboc_11_gen(std::complex* _dest, int* _prn, - unsigned int _codeLength); -/*! - * \brief This function generates Galileo E1 sinboc(6,1) code (minimum 12 samples per chip), - * the _codeLength variable must be a multiple of 12*4092. - * - */ -void galileo_e1_sinboc_61_gen(std::complex* _dest, int* _prn, - unsigned int _codeLength); -/*! - * \brief This function generates Galileo E1 cboc code (12 samples per chip). - * - */ -void galileo_e1_cboc_gen(std::complex* _dest, int* _prn, char _Signal[3]); +void galileo_e1_code_gen_float_sampled(float* _dest, char _Signal[3], + bool _cboc, unsigned int _prn, signed int _fs, unsigned int _chip_shift); + /*! * \brief This function generates Galileo E1 code (can select E1B or E1C, cboc or sinboc * and the sample frequency _fs). 
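Editorial note (not part of the patch): the declarations above only document the new entry points, so a minimal usage sketch may help. It shows how a caller could obtain a real-valued E1B CBOC replica through the six-argument overload of galileo_e1_code_gen_float_sampled() declared above. The sampling frequency, PRN and helper name are illustrative assumptions; the 1.023 Mcps chip rate and 4092-chip code length match the Galileo_E1_CODE_CHIP_RATE_HZ and Galileo_E1_B_CODE_LENGTH_CHIPS constants used throughout this file.

#include <vector>
#include "galileo_e1_signal_processing.h"   // declares galileo_e1_code_gen_float_sampled()

// Hypothetical helper: real-valued Galileo E1B CBOC replica at 4.092 Msps, no chip shift.
void make_e1b_replica(std::vector<float>& replica)
{
    const signed int fs = 4092000;                      // sampling frequency [Hz] (example value)
    const double chip_rate = 1.023e6;                   // Galileo_E1_CODE_CHIP_RATE_HZ
    const double code_length = 4092.0;                  // Galileo_E1_B_CODE_LENGTH_CHIPS
    const unsigned int samples_per_code =
        static_cast<unsigned int>(static_cast<double>(fs) / (chip_rate / code_length));

    replica.resize(samples_per_code);
    char signal[3] = "1B";
    galileo_e1_code_gen_float_sampled(replica.data(), signal, true /* CBOC */,
                                      11 /* PRN, example */, fs, 0 /* chip shift */);
}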
diff --git a/src/algorithms/libs/gnss_signal_processing.cc b/src/algorithms/libs/gnss_signal_processing.cc index 86738f062..9666b1d0d 100644 --- a/src/algorithms/libs/gnss_signal_processing.cc +++ b/src/algorithms/libs/gnss_signal_processing.cc @@ -156,6 +156,28 @@ void hex_to_binary_converter(int * _dest, char _from) } } +void resampler(float* _from, float* _dest, float _fs_in, + float _fs_out, unsigned int _length_in, unsigned int _length_out) +{ + unsigned int _codeValueIndex; + float aux; + //--- Find time constants -------------------------------------------------- + const float _t_in = 1 / _fs_in; // Incoming sampling period in sec + const float _t_out = 1 / _fs_out; // Out sampling period in sec + for (unsigned int i = 0; i < _length_out - 1; i++) + { + //=== Digitizing ======================================================= + //--- compute index array to read sampled values ------------------------- + //_codeValueIndex = ceil((_t_out * ((float)i + 1)) / _t_in) - 1; + aux = (_t_out * (i + 1)) / _t_in; + _codeValueIndex = auxCeil2(aux) - 1; + + //if repeat the chip -> upsample by nearest neighborhood interpolation + _dest[i] = _from[_codeValueIndex]; + } + //--- Correct the last index (due to number rounding issues) ----------- + _dest[_length_out - 1] = _from[_length_in - 1]; +} void resampler(std::complex* _from, std::complex* _dest, float _fs_in, float _fs_out, unsigned int _length_in, unsigned int _length_out) diff --git a/src/algorithms/libs/gnss_signal_processing.h b/src/algorithms/libs/gnss_signal_processing.h index 2449dfaea..b75cb71c5 100644 --- a/src/algorithms/libs/gnss_signal_processing.h +++ b/src/algorithms/libs/gnss_signal_processing.h @@ -60,6 +60,13 @@ void complex_exp_gen_conj(std::complex* _dest, double _f, double _fs, */ void hex_to_binary_converter(int * _dest, char _from); +/*! + * \brief This function resamples a sequence of float values. + * + */ +void resampler(float* _from, float* _dest, + float _fs_in, float _fs_out, unsigned int _length_in, + unsigned int _length_out); /*! * \brief This function resamples a sequence of complex values. 
* diff --git a/src/algorithms/libs/gps_sdr_signal_processing.cc b/src/algorithms/libs/gps_sdr_signal_processing.cc index a88ea5119..168f8af97 100644 --- a/src/algorithms/libs/gps_sdr_signal_processing.cc +++ b/src/algorithms/libs/gps_sdr_signal_processing.cc @@ -34,7 +34,7 @@ auto auxCeil = [](float x){ return static_cast(static_cast((x)+1)); }; -void gps_l1_ca_code_gen_complex(std::complex* _dest, signed int _prn, unsigned int _chip_shift) +void gps_l1_ca_code_gen_int(int* _dest, signed int _prn, unsigned int _chip_shift) { const unsigned int _code_length = 1023; bool G1[_code_length]; @@ -102,17 +102,44 @@ void gps_l1_ca_code_gen_complex(std::complex* _dest, signed int _prn, uns aux = G1[(lcv + _chip_shift) % _code_length]^G2[delay]; if(aux == true) { - _dest[lcv] = std::complex(1, 0); + _dest[lcv] = 1; } else { - _dest[lcv] = std::complex(-1, 0); + _dest[lcv] = -1; } delay++; delay %= _code_length; } } +void gps_l1_ca_code_gen_float(float* _dest, signed int _prn, unsigned int _chip_shift) +{ + unsigned int _code_length = 1023; + int ca_code_int[ _code_length ]; + + gps_l1_ca_code_gen_int( ca_code_int, _prn, _chip_shift ); + + for( unsigned int ii = 0; ii < _code_length; ++ii ) + { + _dest[ii] = static_cast( ca_code_int[ii] ); + } + +} + +void gps_l1_ca_code_gen_complex(std::complex* _dest, signed int _prn, unsigned int _chip_shift) +{ + unsigned int _code_length = 1023; + int ca_code_int[ _code_length ]; + + gps_l1_ca_code_gen_int( ca_code_int, _prn, _chip_shift ); + + for( unsigned int ii = 0; ii < _code_length; ++ii ) + { + _dest[ii] = std::complex< float >( static_cast(ca_code_int[ii]), 0.0f ); + } + +} /* diff --git a/src/algorithms/libs/gps_sdr_signal_processing.h b/src/algorithms/libs/gps_sdr_signal_processing.h index 2f01b5647..bd41ca8a3 100644 --- a/src/algorithms/libs/gps_sdr_signal_processing.h +++ b/src/algorithms/libs/gps_sdr_signal_processing.h @@ -35,6 +35,12 @@ #include +//!Generates int GPS L1 C/A code for the desired SV ID and code shift +void gps_l1_ca_code_gen_int(int* _dest, signed int _prn, unsigned int _chip_shift); + +//!Generates float GPS L1 C/A code for the desired SV ID and code shift +void gps_l1_ca_code_gen_float(float* _dest, signed int _prn, unsigned int _chip_shift); + //!Generates complex GPS L1 C/A code for the desired SV ID and code shift, and sampled to specific sampling frequency void gps_l1_ca_code_gen_complex(std::complex* _dest, signed int _prn, unsigned int _chip_shift); diff --git a/src/algorithms/tracking/libs/CMakeLists.txt b/src/algorithms/tracking/libs/CMakeLists.txt index 88236920a..19387889e 100644 --- a/src/algorithms/tracking/libs/CMakeLists.txt +++ b/src/algorithms/tracking/libs/CMakeLists.txt @@ -33,6 +33,7 @@ endif(ENABLE_CUDA) set(TRACKING_LIB_SOURCES cpu_multicorrelator.cc + cpu_multicorrelator_real_codes.cc cpu_multicorrelator_16sc.cc lock_detectors.cc tcp_communication.cc diff --git a/src/algorithms/tracking/libs/cpu_multicorrelator_real_codes.cc b/src/algorithms/tracking/libs/cpu_multicorrelator_real_codes.cc new file mode 100644 index 000000000..cc0df391c --- /dev/null +++ b/src/algorithms/tracking/libs/cpu_multicorrelator_real_codes.cc @@ -0,0 +1,147 @@ +/*! + * \file cpu_multicorrelator_real_codes.cc + * \brief High optimized CPU vector multiTAP correlator class with real-valued local codes + * \authors
+ *          Javier Arribas, 2015. jarribas(at)cttc.es
+ *          Cillian O'Driscoll, 2017. cillian.odriscoll(at)gmail.com
+ *
+ * + * Class that implements a high optimized vector multiTAP correlator class for CPUs + * + * ------------------------------------------------------------------------- + * + * Copyright (C) 2010-2017 (see AUTHORS file for a list of contributors) + * + * GNSS-SDR is a software defined Global Navigation + * Satellite Systems receiver + * + * This file is part of GNSS-SDR. + * + * GNSS-SDR is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * GNSS-SDR is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNSS-SDR. If not, see . + * + * ------------------------------------------------------------------------- + */ + +#include "cpu_multicorrelator_real_codes.h" +#include +#include +#include + + +cpu_multicorrelator_real_codes::cpu_multicorrelator_real_codes() +{ + d_sig_in = nullptr; + d_local_code_in = nullptr; + d_shifts_chips = nullptr; + d_corr_out = nullptr; + d_local_codes_resampled = nullptr; + d_code_length_chips = 0; + d_n_correlators = 0; +} + + +cpu_multicorrelator_real_codes::~cpu_multicorrelator_real_codes() +{ + if(d_local_codes_resampled != nullptr) + { + cpu_multicorrelator_real_codes::free(); + } +} + + +bool cpu_multicorrelator_real_codes::init( + int max_signal_length_samples, + int n_correlators) +{ + // ALLOCATE MEMORY FOR INTERNAL vectors + size_t size = max_signal_length_samples * sizeof(float); + + d_local_codes_resampled = static_cast(volk_gnsssdr_malloc(n_correlators * sizeof(float*), volk_gnsssdr_get_alignment())); + for (int n = 0; n < n_correlators; n++) + { + d_local_codes_resampled[n] = static_cast(volk_gnsssdr_malloc(size, volk_gnsssdr_get_alignment())); + } + d_n_correlators = n_correlators; + return true; +} + + + +bool cpu_multicorrelator_real_codes::set_local_code_and_taps( + int code_length_chips, + const float* local_code_in, + float *shifts_chips) +{ + d_local_code_in = local_code_in; + d_shifts_chips = shifts_chips; + d_code_length_chips = code_length_chips; + return true; +} + + +bool cpu_multicorrelator_real_codes::set_input_output_vectors(std::complex* corr_out, const std::complex* sig_in) +{ + // Save CPU pointers + d_sig_in = sig_in; + d_corr_out = corr_out; + return true; +} + + +void cpu_multicorrelator_real_codes::update_local_code(int correlator_length_samples, float rem_code_phase_chips, float code_phase_step_chips) +{ + volk_gnsssdr_32f_xn_resampler_32f_xn(d_local_codes_resampled, + d_local_code_in, + rem_code_phase_chips, + code_phase_step_chips, + d_shifts_chips, + d_code_length_chips, + d_n_correlators, + correlator_length_samples); +} + + +bool cpu_multicorrelator_real_codes::Carrier_wipeoff_multicorrelator_resampler( + float rem_carrier_phase_in_rad, + float phase_step_rad, + float rem_code_phase_chips, + float code_phase_step_chips, + int signal_length_samples) +{ + update_local_code(signal_length_samples, rem_code_phase_chips, code_phase_step_chips); + // Regenerate phase at each call in order to avoid numerical issues + lv_32fc_t phase_offset_as_complex[1]; + phase_offset_as_complex[0] = lv_cmake(std::cos(rem_carrier_phase_in_rad), -std::sin(rem_carrier_phase_in_rad)); + // call 
VOLK_GNSSSDR kernel + volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn(d_corr_out, d_sig_in, std::exp(lv_32fc_t(0, - phase_step_rad)), phase_offset_as_complex, (const float**)d_local_codes_resampled, d_n_correlators, signal_length_samples); + return true; +} + + +bool cpu_multicorrelator_real_codes::free() +{ + // Free memory + if (d_local_codes_resampled != nullptr) + { + for (int n = 0; n < d_n_correlators; n++) + { + volk_gnsssdr_free(d_local_codes_resampled[n]); + } + volk_gnsssdr_free(d_local_codes_resampled); + d_local_codes_resampled = nullptr; + } + return true; +} + + diff --git a/src/algorithms/tracking/libs/cpu_multicorrelator_real_codes.h b/src/algorithms/tracking/libs/cpu_multicorrelator_real_codes.h new file mode 100644 index 000000000..922a12c7a --- /dev/null +++ b/src/algorithms/tracking/libs/cpu_multicorrelator_real_codes.h @@ -0,0 +1,70 @@ +/*! + * \file cpu_multicorrelator_real_codes.h + * \brief High optimized CPU vector multiTAP correlator class using real-valued local codes + * \authors
+ *          Javier Arribas, 2015. jarribas(at)cttc.es
+ *          Cillian O'Driscoll, 2017. cillian.odriscoll(at)gmail.com
+ *
+ * + * Class that implements a high optimized vector multiTAP correlator class for CPUs + * + * ------------------------------------------------------------------------- + * + * Copyright (C) 2010-2017 (see AUTHORS file for a list of contributors) + * + * GNSS-SDR is a software defined Global Navigation + * Satellite Systems receiver + * + * This file is part of GNSS-SDR. + * + * GNSS-SDR is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * GNSS-SDR is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNSS-SDR. If not, see . + * + * ------------------------------------------------------------------------- + */ + +#ifndef GNSS_SDR_CPU_MULTICORRELATOR_REAL_CODES_H_ +#define GNSS_SDR_CPU_MULTICORRELATOR_REAL_CODES_H_ + + +#include + +/*! + * \brief Class that implements carrier wipe-off and correlators. + */ +class cpu_multicorrelator_real_codes +{ +public: + cpu_multicorrelator_real_codes(); + ~cpu_multicorrelator_real_codes(); + bool init(int max_signal_length_samples, int n_correlators); + bool set_local_code_and_taps(int code_length_chips, const float* local_code_in, float *shifts_chips); + bool set_input_output_vectors(std::complex* corr_out, const std::complex* sig_in); + void update_local_code(int correlator_length_samples, float rem_code_phase_chips, float code_phase_step_chips); + bool Carrier_wipeoff_multicorrelator_resampler(float rem_carrier_phase_in_rad, float phase_step_rad, float rem_code_phase_chips, float code_phase_step_chips, int signal_length_samples); + bool free(); + +private: + // Allocate the device input vectors + const std::complex *d_sig_in; + float **d_local_codes_resampled; + const float *d_local_code_in; + std::complex *d_corr_out; + float *d_shifts_chips; + int d_code_length_chips; + int d_n_correlators; +}; + + +#endif /* CPU_MULTICORRELATOR_REAL_CODES_H_ */ + diff --git a/src/tests/unit-tests/signal-processing-blocks/tracking/cpu_multicorrelator_real_codes_test.cc b/src/tests/unit-tests/signal-processing-blocks/tracking/cpu_multicorrelator_real_codes_test.cc new file mode 100644 index 000000000..100cc6985 --- /dev/null +++ b/src/tests/unit-tests/signal-processing-blocks/tracking/cpu_multicorrelator_real_codes_test.cc @@ -0,0 +1,174 @@ +/*! + * \file cpu_multicorrelator_real_codes_test.cc + * \brief This file implements timing tests for the FFT. + * \author Carles Fernandez-Prades, 2016. cfernandez(at)cttc.es + * + * + * ------------------------------------------------------------------------- + * + * Copyright (C) 2010-2016 (see AUTHORS file for a list of contributors) + * + * GNSS-SDR is a software defined Global Navigation + * Satellite Systems receiver + * + * This file is part of GNSS-SDR. + * + * GNSS-SDR is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * GNSS-SDR is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNSS-SDR. If not, see . + * + * ------------------------------------------------------------------------- + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cpu_multicorrelator_real_codes.h" +#include "gps_sdr_signal_processing.h" +#include "GPS_L1_CA.h" + + +DEFINE_int32(cpu_multicorrelator_real_codes_iterations_test, 1000, "Number of averaged iterations in CPU multicorrelator test timing test"); +DEFINE_int32(cpu_multicorrelator_real_codes_max_threads_test, 12, "Number of maximum concurrent correlators in CPU multicorrelator test timing test"); + +void run_correlator_cpu(cpu_multicorrelator_real_codes* correlator, + float d_rem_carrier_phase_rad, + float d_carrier_phase_step_rad, + float d_code_phase_step_chips, + float d_rem_code_phase_chips, + int correlation_size) +{ + for(int k = 0; k < FLAGS_cpu_multicorrelator_real_codes_iterations_test; k++) + { + correlator->Carrier_wipeoff_multicorrelator_resampler(d_rem_carrier_phase_rad, + d_carrier_phase_step_rad, + d_code_phase_step_chips, + d_rem_code_phase_chips, + correlation_size); + } +} + + +TEST(CpuMulticorrelatorRealCodesTest, MeasureExecutionTime) +{ + std::chrono::time_point start, end; + std::chrono::duration elapsed_seconds(0); + int max_threads = FLAGS_cpu_multicorrelator_real_codes_max_threads_test; + std::vector thread_pool; + cpu_multicorrelator_real_codes* correlator_pool[max_threads]; + unsigned int correlation_sizes [3] = { 2048, 4096, 8192}; + double execution_times [3]; + + float* d_ca_code; + gr_complex* in_cpu; + gr_complex* d_correlator_outs; + + int d_n_correlator_taps = 3; + int d_vector_length = correlation_sizes[2]; //max correlation size to allocate all the necessary memory + float* d_local_code_shift_chips; + + //allocate host memory + // Get space for a vector with the C/A code replica sampled 1x/chip + d_ca_code = static_cast(volk_gnsssdr_malloc(static_cast(GPS_L1_CA_CODE_LENGTH_CHIPS) * sizeof(float), volk_gnsssdr_get_alignment())); + in_cpu = static_cast(volk_gnsssdr_malloc(2 * d_vector_length * sizeof(gr_complex), volk_gnsssdr_get_alignment())); + + // correlator outputs (scalar) + d_n_correlator_taps = 3; // Early, Prompt, and Late + d_correlator_outs = static_cast(volk_gnsssdr_malloc(d_n_correlator_taps*sizeof(gr_complex), volk_gnsssdr_get_alignment())); + for (int n = 0; n < d_n_correlator_taps; n++) + { + d_correlator_outs[n] = gr_complex(0,0); + } + d_local_code_shift_chips = static_cast(volk_gnsssdr_malloc(d_n_correlator_taps*sizeof(float), volk_gnsssdr_get_alignment())); + // Set TAPs delay values [chips] + float d_early_late_spc_chips = 0.5; + d_local_code_shift_chips[0] = - d_early_late_spc_chips; + d_local_code_shift_chips[1] = 0.0; + d_local_code_shift_chips[2] = d_early_late_spc_chips; + + //--- Perform initializations ------------------------------ + + //local code resampler on GPU + // generate local reference (1 sample per chip) + gps_l1_ca_code_gen_float(d_ca_code, 1, 0); + // generate inut signal + std::random_device r; + std::default_random_engine e1(r()); + std::uniform_real_distribution uniform_dist(0, 1); + for (int n = 0; n < 2 * d_vector_length; n++) + { + in_cpu[n] = std::complex(uniform_dist(e1), 
uniform_dist(e1)); + } + + for (int n = 0; n < max_threads; n++) + { + correlator_pool[n] = new cpu_multicorrelator_real_codes(); + correlator_pool[n]->init(d_vector_length, d_n_correlator_taps); + correlator_pool[n]->set_input_output_vectors(d_correlator_outs, in_cpu); + correlator_pool[n]->set_local_code_and_taps(static_cast(GPS_L1_CA_CODE_LENGTH_CHIPS), d_ca_code, d_local_code_shift_chips); + } + + float d_rem_carrier_phase_rad = 0.0; + float d_carrier_phase_step_rad = 0.1; + float d_code_phase_step_chips = 0.3; + float d_rem_code_phase_chips = 0.4; + + EXPECT_NO_THROW( + for(int correlation_sizes_idx = 0; correlation_sizes_idx < 3; correlation_sizes_idx++) + { + for(int current_max_threads = 1; current_max_threads < (max_threads+1); current_max_threads++) + { + std::cout << "Running " << current_max_threads << " concurrent correlators" << std::endl; + start = std::chrono::system_clock::now(); + //create the concurrent correlator threads + for (int current_thread = 0; current_thread < current_max_threads; current_thread++) + { + thread_pool.push_back(std::thread(run_correlator_cpu, + correlator_pool[current_thread], + d_rem_carrier_phase_rad, + d_carrier_phase_step_rad, + d_code_phase_step_chips, + d_rem_code_phase_chips, + correlation_sizes[correlation_sizes_idx])); + } + //wait the threads to finish they work and destroy the thread objects + for(auto &t : thread_pool) + { + t.join(); + } + thread_pool.clear(); + end = std::chrono::system_clock::now(); + elapsed_seconds = end - start; + execution_times[correlation_sizes_idx] = elapsed_seconds.count() / static_cast(FLAGS_cpu_multicorrelator_real_codes_iterations_test); + std::cout << "CPU Multicorrelator (real codes) execution time for length=" << correlation_sizes[correlation_sizes_idx] + << " : " << execution_times[correlation_sizes_idx] << " [s]" << std::endl; + + } + } + ); + + volk_gnsssdr_free(d_local_code_shift_chips); + volk_gnsssdr_free(d_correlator_outs); + volk_gnsssdr_free(d_ca_code); + volk_gnsssdr_free(in_cpu); + + for (int n = 0; n< max_threads; n++) + { + correlator_pool[n]->free(); + } +} + diff --git a/src/tests/unit-tests/signal-processing-blocks/tracking/cpu_multicorrelator_test.cc b/src/tests/unit-tests/signal-processing-blocks/tracking/cpu_multicorrelator_test.cc index 11ea1ca58..661b0a19e 100644 --- a/src/tests/unit-tests/signal-processing-blocks/tracking/cpu_multicorrelator_test.cc +++ b/src/tests/unit-tests/signal-processing-blocks/tracking/cpu_multicorrelator_test.cc @@ -33,6 +33,9 @@ #include #include #include +#include +#include +#include #include #include "cpu_multicorrelator.h" #include "gps_sdr_signal_processing.h" From 9ec555814318a41de03650fe72004078f9791d2c Mon Sep 17 00:00:00 2001 From: Cillian O'Driscoll Date: Mon, 11 Sep 2017 15:22:32 +0100 Subject: [PATCH 3/4] Use cpu_multicorrelator_real_codes For galileo_e1_dll_pll_veml_tracking_cc and gps_l1_ca_dll_pll_tracking_cc Note this gives some significant performance improvement for higher sampling rates --- .../galileo_e1_dll_pll_veml_tracking_cc.cc | 6 +++--- .../gnuradio_blocks/galileo_e1_dll_pll_veml_tracking_cc.h | 8 ++++---- .../gnuradio_blocks/gps_l1_ca_dll_pll_tracking_cc.cc | 4 ++-- .../gnuradio_blocks/gps_l1_ca_dll_pll_tracking_cc.h | 7 ++++--- 4 files changed, 13 insertions(+), 12 deletions(-) diff --git a/src/algorithms/tracking/gnuradio_blocks/galileo_e1_dll_pll_veml_tracking_cc.cc b/src/algorithms/tracking/gnuradio_blocks/galileo_e1_dll_pll_veml_tracking_cc.cc index d2319945c..9c23835dc 100755 --- 
a/src/algorithms/tracking/gnuradio_blocks/galileo_e1_dll_pll_veml_tracking_cc.cc +++ b/src/algorithms/tracking/gnuradio_blocks/galileo_e1_dll_pll_veml_tracking_cc.cc @@ -11,7 +11,7 @@ * * ------------------------------------------------------------------------- * - * Copyright (C) 2010-2015 (see AUTHORS file for a list of contributors) + * Copyright (C) 2010-2017 (see AUTHORS file for a list of contributors) * * GNSS-SDR is a software defined Global Navigation * Satellite Systems receiver @@ -129,7 +129,7 @@ galileo_e1_dll_pll_veml_tracking_cc::galileo_e1_dll_pll_veml_tracking_cc( // Initialization of local code replica // Get space for a vector with the sinboc(1,1) replica sampled 2x/chip - d_ca_code = static_cast(volk_gnsssdr_malloc((2 * Galileo_E1_B_CODE_LENGTH_CHIPS) * sizeof(gr_complex), volk_gnsssdr_get_alignment())); + d_ca_code = static_cast(volk_gnsssdr_malloc((2 * Galileo_E1_B_CODE_LENGTH_CHIPS) * sizeof(float), volk_gnsssdr_get_alignment())); // correlator outputs (scalar) d_n_correlator_taps = 5; // Very-Early, Early, Prompt, Late, Very-Late @@ -211,7 +211,7 @@ void galileo_e1_dll_pll_veml_tracking_cc::start_tracking() d_code_loop_filter.initialize(); // initialize the code filter // generate local reference ALWAYS starting at chip 1 (2 samples per chip) - galileo_e1_code_gen_complex_sampled(d_ca_code, + galileo_e1_code_gen_float_sampled(d_ca_code, d_acquisition_gnss_synchro->Signal, false, d_acquisition_gnss_synchro->PRN, diff --git a/src/algorithms/tracking/gnuradio_blocks/galileo_e1_dll_pll_veml_tracking_cc.h b/src/algorithms/tracking/gnuradio_blocks/galileo_e1_dll_pll_veml_tracking_cc.h index 3377234cf..09e6fffe6 100755 --- a/src/algorithms/tracking/gnuradio_blocks/galileo_e1_dll_pll_veml_tracking_cc.h +++ b/src/algorithms/tracking/gnuradio_blocks/galileo_e1_dll_pll_veml_tracking_cc.h @@ -6,7 +6,7 @@ * * ------------------------------------------------------------------------- * - * Copyright (C) 2010-2015 (see AUTHORS file for a list of contributors) + * Copyright (C) 2010-2017 (see AUTHORS file for a list of contributors) * * GNSS-SDR is a software defined Global Navigation * Satellite Systems receiver @@ -39,7 +39,7 @@ #include "gnss_synchro.h" #include "tracking_2nd_DLL_filter.h" #include "tracking_2nd_PLL_filter.h" -#include "cpu_multicorrelator.h" +#include "cpu_multicorrelator_real_codes.h" class galileo_e1_dll_pll_veml_tracking_cc; @@ -120,10 +120,10 @@ private: double d_early_late_spc_chips; double d_very_early_late_spc_chips; - gr_complex* d_ca_code; + float* d_ca_code; float* d_local_code_shift_chips; gr_complex* d_correlator_outs; - cpu_multicorrelator multicorrelator_cpu; + cpu_multicorrelator_real_codes multicorrelator_cpu; gr_complex *d_Very_Early; gr_complex *d_Early; diff --git a/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_cc.cc b/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_cc.cc index 46b191d8c..903aea0d9 100644 --- a/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_cc.cc +++ b/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_cc.cc @@ -123,7 +123,7 @@ Gps_L1_Ca_Dll_Pll_Tracking_cc::Gps_L1_Ca_Dll_Pll_Tracking_cc( // Initialization of local code replica // Get space for a vector with the C/A code replica sampled 1x/chip - d_ca_code = static_cast(volk_gnsssdr_malloc(static_cast(GPS_L1_CA_CODE_LENGTH_CHIPS) * sizeof(gr_complex), volk_gnsssdr_get_alignment())); + d_ca_code = static_cast(volk_gnsssdr_malloc(static_cast(GPS_L1_CA_CODE_LENGTH_CHIPS) * sizeof(float), 
volk_gnsssdr_get_alignment())); // correlator outputs (scalar) d_n_correlator_taps = 3; // Early, Prompt, and Late @@ -233,7 +233,7 @@ void Gps_L1_Ca_Dll_Pll_Tracking_cc::start_tracking() d_code_loop_filter.initialize(); // initialize the code filter // generate local reference ALWAYS starting at chip 1 (1 sample per chip) - gps_l1_ca_code_gen_complex(d_ca_code, d_acquisition_gnss_synchro->PRN, 0); + gps_l1_ca_code_gen_float(d_ca_code, d_acquisition_gnss_synchro->PRN, 0); multicorrelator_cpu.set_local_code_and_taps(static_cast(GPS_L1_CA_CODE_LENGTH_CHIPS), d_ca_code, d_local_code_shift_chips); for (int n = 0; n < d_n_correlator_taps; n++) diff --git a/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_cc.h b/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_cc.h index 92a5d4afb..da408d7d7 100644 --- a/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_cc.h +++ b/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_cc.h @@ -3,6 +3,7 @@ * \brief Interface of a code DLL + carrier PLL tracking block * \author Carlos Aviles, 2010. carlos.avilesr(at)googlemail.com * Javier Arribas, 2011. jarribas(at)cttc.es + * Cillian O'Driscoll, 2017. cillian.odriscoll(at)gmail.com * * Code DLL + carrier PLL according to the algorithms described in: * K.Borre, D.M.Akos, N.Bertelsen, P.Rinder, and S.H.Jensen, @@ -44,7 +45,7 @@ #include "gnss_synchro.h" #include "tracking_2nd_DLL_filter.h" #include "tracking_2nd_PLL_filter.h" -#include "cpu_multicorrelator.h" +#include "cpu_multicorrelator_real_codes.h" class Gps_L1_Ca_Dll_Pll_Tracking_cc; @@ -126,10 +127,10 @@ private: double d_acq_carrier_doppler_hz; // correlator int d_n_correlator_taps; - gr_complex* d_ca_code; + float* d_ca_code; float* d_local_code_shift_chips; gr_complex* d_correlator_outs; - cpu_multicorrelator multicorrelator_cpu; + cpu_multicorrelator_real_codes multicorrelator_cpu; // tracking vars From 94dfef74c1f25f1f64c7838fb17f1e4be526fff3 Mon Sep 17 00:00:00 2001 From: Carles Fernandez Date: Sat, 16 Sep 2017 01:14:15 +0200 Subject: [PATCH 4/4] Add cpu_multicorrelator_real_codes_test and minor cosmetics --- .../libs/galileo_e1_signal_processing.cc | 55 ++++++++----------- .../libs/gps_sdr_signal_processing.cc | 16 +++--- .../gps_l1_ca_dll_pll_tracking_cc.cc | 12 ++-- .../gps_l1_ca_dll_pll_tracking_cc.h | 1 - src/tests/test_main.cc | 1 + .../cpu_multicorrelator_real_codes_test.cc | 7 +-- 6 files changed, 42 insertions(+), 50 deletions(-) diff --git a/src/algorithms/libs/galileo_e1_signal_processing.cc b/src/algorithms/libs/galileo_e1_signal_processing.cc index 46738faad..2ea720ec8 100644 --- a/src/algorithms/libs/galileo_e1_signal_processing.cc +++ b/src/algorithms/libs/galileo_e1_signal_processing.cc @@ -55,7 +55,6 @@ void galileo_e1_code_gen_int(int* _dest, char _Signal[3], signed int _prn) hex_to_binary_converter(&_dest[index], Galileo_E1_B_PRIMARY_CODE[prn].at(i)); index = index + 4; } - } else if (_galileo_signal.rfind("1C") != std::string::npos && _galileo_signal.length() >= 2) { @@ -72,7 +71,6 @@ void galileo_e1_code_gen_int(int* _dest, char _Signal[3], signed int _prn) } - void galileo_e1_sinboc_11_gen_int(int* _dest, int* _prn, unsigned int _length_out) { const unsigned int _length_in = Galileo_E1_B_CODE_LENGTH_CHIPS; @@ -91,7 +89,6 @@ void galileo_e1_sinboc_11_gen_int(int* _dest, int* _prn, unsigned int _length_ou } - void galileo_e1_sinboc_61_gen_int(int* _dest, int* _prn, unsigned int _length_out) { const unsigned int _length_in = Galileo_E1_B_CODE_LENGTH_CHIPS; @@ -111,7 
+108,6 @@ void galileo_e1_sinboc_61_gen_int(int* _dest, int* _prn, unsigned int _length_ou } - void galileo_e1_gen_float(float* _dest, int* _prn, char _Signal[3]) { std::string _galileo_signal = _Signal; @@ -160,8 +156,8 @@ void galileo_e1_code_gen_float_sampled(float* _dest, char _Signal[3], const int _samplesPerChip = (_cboc == true) ? 12 : 2; const unsigned int delay = ((static_cast(Galileo_E1_B_CODE_LENGTH_CHIPS) - _chip_shift) - % static_cast(Galileo_E1_B_CODE_LENGTH_CHIPS)) - * _samplesPerCode / Galileo_E1_B_CODE_LENGTH_CHIPS; + % static_cast(Galileo_E1_B_CODE_LENGTH_CHIPS)) + * _samplesPerCode / Galileo_E1_B_CODE_LENGTH_CHIPS; galileo_e1_code_gen_int(primary_code_E1_chips, _Signal, _prn); //generate Galileo E1 code, 1 sample per chip @@ -195,29 +191,24 @@ void galileo_e1_code_gen_float_sampled(float* _dest, char _Signal[3], _signal_E1 = _resampled_signal; } - if (_galileo_signal.rfind("1C") != std::string::npos && _galileo_signal.length() >= 2 && _secondary_flag) - { + { + float* _signal_E1C_secondary = new float[static_cast(Galileo_E1_C_SECONDARY_CODE_LENGTH) * _samplesPerCode]; - float* _signal_E1C_secondary = new float[ - static_cast(Galileo_E1_C_SECONDARY_CODE_LENGTH) - * _samplesPerCode]; + for (unsigned int i = 0; i < static_cast(Galileo_E1_C_SECONDARY_CODE_LENGTH); i++) + { + for (unsigned k = 0; k < _samplesPerCode; k++) + { + _signal_E1C_secondary[i*_samplesPerCode + k] = _signal_E1[k] + * (Galileo_E1_C_SECONDARY_CODE.at(i) == '0' ? 1.0f : -1.0f); + } + } - for (unsigned int i = 0; i < static_cast(Galileo_E1_C_SECONDARY_CODE_LENGTH); i++) - { - for (unsigned k = 0; k < _samplesPerCode; k++) - { - _signal_E1C_secondary[i*_samplesPerCode + k] = _signal_E1[k] - * (Galileo_E1_C_SECONDARY_CODE.at(i) == '0' - ? 1.0f : -1.0f); - } - } + _samplesPerCode *= static_cast(Galileo_E1_C_SECONDARY_CODE_LENGTH); - _samplesPerCode *= static_cast(Galileo_E1_C_SECONDARY_CODE_LENGTH); - - delete[] _signal_E1; - _signal_E1 = _signal_E1C_secondary; - } + delete[] _signal_E1; + _signal_E1 = _signal_E1C_secondary; + } for (unsigned int i = 0; i < _samplesPerCode; i++) { @@ -227,20 +218,20 @@ void galileo_e1_code_gen_float_sampled(float* _dest, char _Signal[3], delete[] _signal_E1; } + void galileo_e1_code_gen_complex_sampled(std::complex* _dest, char _Signal[3], bool _cboc, unsigned int _prn, signed int _fs, unsigned int _chip_shift, bool _secondary_flag) { std::string _galileo_signal = _Signal; const int _codeFreqBasis = Galileo_E1_CODE_CHIP_RATE_HZ; //Hz - unsigned int _samplesPerCode = static_cast( - static_cast(_fs) / (static_cast(_codeFreqBasis ) - / static_cast(Galileo_E1_B_CODE_LENGTH_CHIPS))); + unsigned int _samplesPerCode = static_cast( static_cast(_fs) / + (static_cast(_codeFreqBasis ) / static_cast(Galileo_E1_B_CODE_LENGTH_CHIPS))); if (_galileo_signal.rfind("1C") != std::string::npos && _galileo_signal.length() >= 2 && _secondary_flag) - { - _samplesPerCode *= static_cast(Galileo_E1_C_SECONDARY_CODE_LENGTH); - } + { + _samplesPerCode *= static_cast(Galileo_E1_C_SECONDARY_CODE_LENGTH); + } float real_code[_samplesPerCode]; @@ -252,12 +243,14 @@ void galileo_e1_code_gen_complex_sampled(std::complex* _dest, char _Signa } } + void galileo_e1_code_gen_float_sampled(float* _dest, char _Signal[3], bool _cboc, unsigned int _prn, signed int _fs, unsigned int _chip_shift) { galileo_e1_code_gen_float_sampled(_dest, _Signal, _cboc, _prn, _fs, _chip_shift, false); } + void galileo_e1_code_gen_complex_sampled(std::complex* _dest, char _Signal[3], bool _cboc, unsigned int _prn, signed int _fs, 
unsigned int _chip_shift) { diff --git a/src/algorithms/libs/gps_sdr_signal_processing.cc b/src/algorithms/libs/gps_sdr_signal_processing.cc index 168f8af97..1ddb8976a 100644 --- a/src/algorithms/libs/gps_sdr_signal_processing.cc +++ b/src/algorithms/libs/gps_sdr_signal_processing.cc @@ -113,6 +113,7 @@ void gps_l1_ca_code_gen_int(int* _dest, signed int _prn, unsigned int _chip_shif } } + void gps_l1_ca_code_gen_float(float* _dest, signed int _prn, unsigned int _chip_shift) { unsigned int _code_length = 1023; @@ -121,12 +122,12 @@ void gps_l1_ca_code_gen_float(float* _dest, signed int _prn, unsigned int _chip_ gps_l1_ca_code_gen_int( ca_code_int, _prn, _chip_shift ); for( unsigned int ii = 0; ii < _code_length; ++ii ) - { - _dest[ii] = static_cast( ca_code_int[ii] ); - } - + { + _dest[ii] = static_cast( ca_code_int[ii] ); + } } + void gps_l1_ca_code_gen_complex(std::complex* _dest, signed int _prn, unsigned int _chip_shift) { unsigned int _code_length = 1023; @@ -135,10 +136,9 @@ void gps_l1_ca_code_gen_complex(std::complex* _dest, signed int _prn, uns gps_l1_ca_code_gen_int( ca_code_int, _prn, _chip_shift ); for( unsigned int ii = 0; ii < _code_length; ++ii ) - { - _dest[ii] = std::complex< float >( static_cast(ca_code_int[ii]), 0.0f ); - } - + { + _dest[ii] = std::complex( static_cast(ca_code_int[ii]), 0.0f ); + } } diff --git a/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_cc.cc b/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_cc.cc index 903aea0d9..32d96c079 100644 --- a/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_cc.cc +++ b/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_cc.cc @@ -127,12 +127,12 @@ Gps_L1_Ca_Dll_Pll_Tracking_cc::Gps_L1_Ca_Dll_Pll_Tracking_cc( // correlator outputs (scalar) d_n_correlator_taps = 3; // Early, Prompt, and Late - d_correlator_outs = static_cast(volk_gnsssdr_malloc(d_n_correlator_taps*sizeof(gr_complex), volk_gnsssdr_get_alignment())); + d_correlator_outs = static_cast(volk_gnsssdr_malloc(d_n_correlator_taps * sizeof(gr_complex), volk_gnsssdr_get_alignment())); for (int n = 0; n < d_n_correlator_taps; n++) - { - d_correlator_outs[n] = gr_complex(0,0); - } - d_local_code_shift_chips = static_cast(volk_gnsssdr_malloc(d_n_correlator_taps*sizeof(float), volk_gnsssdr_get_alignment())); + { + d_correlator_outs[n] = gr_complex(0,0); + } + d_local_code_shift_chips = static_cast(volk_gnsssdr_malloc(d_n_correlator_taps * sizeof(float), volk_gnsssdr_get_alignment())); // Set TAPs delay values [chips] d_local_code_shift_chips[0] = - d_early_late_spc_chips; d_local_code_shift_chips[1] = 0.0; @@ -194,7 +194,7 @@ void Gps_L1_Ca_Dll_Pll_Tracking_cc::start_tracking() long int acq_trk_diff_samples; double acq_trk_diff_seconds; acq_trk_diff_samples = static_cast(d_sample_counter) - static_cast(d_acq_sample_stamp); //-d_vector_length; - DLOG(INFO) << "Number of samples between Acquisition and Tracking =" << acq_trk_diff_samples; + DLOG(INFO) << "Number of samples between Acquisition and Tracking = " << acq_trk_diff_samples; acq_trk_diff_seconds = static_cast(acq_trk_diff_samples) / static_cast(d_fs_in); // Doppler effect // Fd=(C/(C+Vr))*F diff --git a/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_cc.h b/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_cc.h index da408d7d7..9fe80fec7 100644 --- a/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_cc.h +++ b/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_cc.h @@ -132,7 
+132,6 @@ private: gr_complex* d_correlator_outs; cpu_multicorrelator_real_codes multicorrelator_cpu; - // tracking vars double d_code_freq_chips; double d_code_phase_step_chips; diff --git a/src/tests/test_main.cc b/src/tests/test_main.cc index dfdb5f3ea..bae669086 100644 --- a/src/tests/test_main.cc +++ b/src/tests/test_main.cc @@ -110,6 +110,7 @@ DECLARE_string(log_dir); #include "unit-tests/signal-processing-blocks/tracking/galileo_e5a_tracking_test.cc" #include "unit-tests/signal-processing-blocks/tracking/tracking_loop_filter_test.cc" #include "unit-tests/signal-processing-blocks/tracking/cpu_multicorrelator_test.cc" +#include "unit-tests/signal-processing-blocks/tracking/cpu_multicorrelator_real_codes_test.cc" #if CUDA_BLOCKS_TEST #include "unit-tests/signal-processing-blocks/tracking/gpu_multicorrelator_test.cc" diff --git a/src/tests/unit-tests/signal-processing-blocks/tracking/cpu_multicorrelator_real_codes_test.cc b/src/tests/unit-tests/signal-processing-blocks/tracking/cpu_multicorrelator_real_codes_test.cc index 100cc6985..ef3c2a293 100644 --- a/src/tests/unit-tests/signal-processing-blocks/tracking/cpu_multicorrelator_real_codes_test.cc +++ b/src/tests/unit-tests/signal-processing-blocks/tracking/cpu_multicorrelator_real_codes_test.cc @@ -45,7 +45,7 @@ DEFINE_int32(cpu_multicorrelator_real_codes_iterations_test, 1000, "Number of averaged iterations in CPU multicorrelator test timing test"); DEFINE_int32(cpu_multicorrelator_real_codes_max_threads_test, 12, "Number of maximum concurrent correlators in CPU multicorrelator test timing test"); -void run_correlator_cpu(cpu_multicorrelator_real_codes* correlator, +void run_correlator_cpu_real_codes(cpu_multicorrelator_real_codes* correlator, float d_rem_carrier_phase_rad, float d_carrier_phase_step_rad, float d_code_phase_step_chips, @@ -137,7 +137,7 @@ TEST(CpuMulticorrelatorRealCodesTest, MeasureExecutionTime) //create the concurrent correlator threads for (int current_thread = 0; current_thread < current_max_threads; current_thread++) { - thread_pool.push_back(std::thread(run_correlator_cpu, + thread_pool.push_back(std::thread(run_correlator_cpu_real_codes, correlator_pool[current_thread], d_rem_carrier_phase_rad, d_carrier_phase_step_rad, @@ -156,7 +156,6 @@ TEST(CpuMulticorrelatorRealCodesTest, MeasureExecutionTime) execution_times[correlation_sizes_idx] = elapsed_seconds.count() / static_cast(FLAGS_cpu_multicorrelator_real_codes_iterations_test); std::cout << "CPU Multicorrelator (real codes) execution time for length=" << correlation_sizes[correlation_sizes_idx] << " : " << execution_times[correlation_sizes_idx] << " [s]" << std::endl; - } } ); @@ -166,7 +165,7 @@ TEST(CpuMulticorrelatorRealCodesTest, MeasureExecutionTime) volk_gnsssdr_free(d_ca_code); volk_gnsssdr_free(in_cpu); - for (int n = 0; n< max_threads; n++) + for (int n = 0; n < max_threads; n++) { correlator_pool[n]->free(); }
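Editorial note (not part of the patch): pulling the pieces of this series together, the sketch below shows the call sequence with which the new cpu_multicorrelator_real_codes class (PATCH 2/4) is typically driven, mirroring Gps_L1_Ca_Dll_Pll_Tracking_cc and the timing test above: generate a real-valued C/A replica once with gps_l1_ca_code_gen_float(), then run carrier wipe-off plus the three-tap correlation per integration period. The PRN, tap spacing, phase steps and code-phase step are illustrative values only, not taken from the patch.

#include <complex>
#include <vector>
#include "cpu_multicorrelator_real_codes.h"   // added in PATCH 2/4
#include "gps_sdr_signal_processing.h"        // gps_l1_ca_code_gen_float()

// Correlate one block of baseband samples against Early/Prompt/Late real-valued C/A replicas.
void correlate_block(const std::complex<float>* baseband, int signal_length_samples)
{
    const int code_length_chips = 1023;
    const int n_taps = 3;                                 // Early, Prompt, Late

    std::vector<float> ca_code(code_length_chips);
    gps_l1_ca_code_gen_float(ca_code.data(), 1 /* PRN, example */, 0 /* chip shift */);

    float shifts_chips[n_taps] = {-0.5f, 0.0f, 0.5f};     // +/- 0.5 chip tap spacing
    std::complex<float> corr_out[n_taps];

    cpu_multicorrelator_real_codes correlator;
    correlator.init(signal_length_samples, n_taps);
    correlator.set_local_code_and_taps(code_length_chips, ca_code.data(), shifts_chips);
    correlator.set_input_output_vectors(corr_out, baseband);

    // One integration period: carrier wipe-off, local-code resampling and the three dot products.
    correlator.Carrier_wipeoff_multicorrelator_resampler(
        0.0f,                                              // remaining carrier phase [rad] (example)
        0.25f,                                             // carrier phase step per sample [rad] (example)
        0.0f,                                              // remaining code phase [chips] (example)
        static_cast<float>(code_length_chips) / static_cast<float>(signal_length_samples),  // chips/sample
        signal_length_samples);

    correlator.free();
}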