Merge branch 'next' of https://github.com/carlesfernandez/gnss-sdr into next

2025-10-30 23:03:05 +00:00 · 2016-03-18 20:44:33 +01:00
parent 618099e62a 1703be7612
commit a7fbc46807
18 changed files with 1260 additions and 1179 deletions
--- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/CMakeLists.txt
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/CMakeLists.txt
@@ -170,6 +170,7 @@ install(FILES
    ${PROJECT_SOURCE_DIR}/include/volk_gnsssdr/volk_gnsssdr_common.h
    ${PROJECT_SOURCE_DIR}/include/volk_gnsssdr/volk_gnsssdr_avx_intrinsics.h
    ${PROJECT_SOURCE_DIR}/include/volk_gnsssdr/volk_gnsssdr_sse3_intrinsics.h
+    ${PROJECT_SOURCE_DIR}/include/volk_gnsssdr/volk_gnsssdr_neon_intrinsics.h
    ${PROJECT_BINARY_DIR}/include/volk_gnsssdr/volk_gnsssdr.h
    ${PROJECT_BINARY_DIR}/include/volk_gnsssdr/volk_gnsssdr_cpu.h
    ${PROJECT_BINARY_DIR}/include/volk_gnsssdr/volk_gnsssdr_config_fixed.h
--- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_neon_intrinsics.h
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_neon_intrinsics.h
@@ -0,0 +1,56 @@
+/*!
+ * \file volk_gnsssdr_neon_intrinsics.h
+ * \author Carles Fernandez, 2016. carles.fernandez(at)gmail.com
+ * \brief Holds NEON intrinsics of intrinsics.
+ * They can be used in VOLK_GNSSSDR kernels to avoid copy-paste
+ *
+ * Copyright (C) 2010-2015 (see AUTHORS file for a list of contributors)
+ *
+ * This file is part of GNSS-SDR.
+ *
+ * GNSS-SDR is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * GNSS-SDR is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef INCLUDED_VOLK_GNSSSDR_NEON_INTRINSICS_H_
+#define INCLUDED_VOLK_GNSSSDR_NEON_INTRINSICS_H_
+
+#include <arm_neon.h>
+
+static inline float32x4_t vdivq_f32( float32x4_t num, float32x4_t den )
+{
+    const float32x4_t q_inv0 = vrecpeq_f32( den );
+    const float32x4_t q_step0 = vrecpsq_f32( q_inv0, den );
+
+    const float32x4_t q_inv1 = vmulq_f32( q_step0, q_inv0 );
+    return vmulq_f32( num, q_inv1 );
+}
+
+
+static inline float32x4_t vsqrtq_f32( float32x4_t q_x )
+{
+    const float32x4_t q_step_0 = vrsqrteq_f32( q_x );
+    // step
+    const float32x4_t q_step_parm0 = vmulq_f32( q_x, q_step_0 );
+    const float32x4_t q_step_result0 = vrsqrtsq_f32( q_step_parm0, q_step_0 );
+    // step
+    const float32x4_t q_step_1 = vmulq_f32( q_step_0, q_step_result0 );
+    const float32x4_t q_step_parm1 = vmulq_f32( q_x, q_step_1 );
+    const float32x4_t q_step_result1 = vrsqrtsq_f32( q_step_parm1, q_step_1 );
+    // take the res
+    const float32x4_t q_step_2 = vmulq_f32( q_step_1, q_step_result1 );
+    // mul by x to get sqrt, not rsqrt
+    return vmulq_f32( q_x, q_step_2 );
+}
+
+#endif /* INCLUDED_VOLK_GNSSSDR_NEON_INTRINSICS_H_ */
--- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_rotatorpuppet_16ic.h
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_rotatorpuppet_16ic.h
@@ -57,6 +57,22 @@ static inline void volk_gnsssdr_16ic_rotatorpuppet_16ic_generic(lv_16sc_t* outVe
 #endif /* LV_HAVE_GENERIC */


+#ifdef LV_HAVE_GENERIC
+static inline void volk_gnsssdr_16ic_rotatorpuppet_16ic_generic_reload(lv_16sc_t* outVector, const lv_16sc_t* inVector, unsigned int num_points)
+{
+    // phases must be normalized. Phase rotator expects a complex exponential input!
+    float rem_carrier_phase_in_rad = 0.345;
+    float phase_step_rad = 0.123;
+    lv_32fc_t phase[1];
+    phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), -sin(rem_carrier_phase_in_rad));
+    lv_32fc_t phase_inc[1];
+    phase_inc[0] = lv_cmake(cos(phase_step_rad), -sin(phase_step_rad));
+    volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_generic_reload(outVector, inVector, phase_inc[0], phase, num_points);
+}
+
+#endif /* LV_HAVE_GENERIC */
+
+
 #ifdef LV_HAVE_SSE3
 static inline void volk_gnsssdr_16ic_rotatorpuppet_16ic_a_sse3(lv_16sc_t* outVector, const lv_16sc_t* inVector, unsigned int num_points)
 {
@@ -73,6 +89,22 @@ static inline void volk_gnsssdr_16ic_rotatorpuppet_16ic_a_sse3(lv_16sc_t* outVec
 #endif /* LV_HAVE_SSE3 */


+#ifdef LV_HAVE_SSE3
+static inline void volk_gnsssdr_16ic_rotatorpuppet_16ic_a_sse3_reload(lv_16sc_t* outVector, const lv_16sc_t* inVector, unsigned int num_points)
+{
+    // phases must be normalized. Phase rotator expects a complex exponential input!
+    float rem_carrier_phase_in_rad = 0.345;
+    float phase_step_rad = 0.123;
+    lv_32fc_t phase[1];
+    phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), -sin(rem_carrier_phase_in_rad));
+    lv_32fc_t phase_inc[1];
+    phase_inc[0] = lv_cmake(cos(phase_step_rad), -sin(phase_step_rad));
+    volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3_reload(outVector, inVector, phase_inc[0], phase, num_points);
+}
+
+#endif /* LV_HAVE_SSE3 */
+
+
 #ifdef LV_HAVE_SSE3
 static inline void volk_gnsssdr_16ic_rotatorpuppet_16ic_u_sse3(lv_16sc_t* outVector, const lv_16sc_t* inVector, unsigned int num_points)
 {
@@ -89,6 +121,22 @@ static inline void volk_gnsssdr_16ic_rotatorpuppet_16ic_u_sse3(lv_16sc_t* outVec
 #endif /* LV_HAVE_SSE3 */


+#ifdef LV_HAVE_SSE3
+static inline void volk_gnsssdr_16ic_rotatorpuppet_16ic_u_sse3_reload(lv_16sc_t* outVector, const lv_16sc_t* inVector, unsigned int num_points)
+{
+    // phases must be normalized. Phase rotator expects a complex exponential input!
+    float rem_carrier_phase_in_rad = 0.345;
+    float phase_step_rad = 0.123;
+    lv_32fc_t phase[1];
+    phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), -sin(rem_carrier_phase_in_rad));
+    lv_32fc_t phase_inc[1];
+    phase_inc[0] = lv_cmake(cos(phase_step_rad), -sin(phase_step_rad));
+    volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3_reload(outVector, inVector, phase_inc[0], phase, num_points);
+}
+
+#endif /* LV_HAVE_SSE3 */
+
+
 #ifdef LV_HAVE_NEON
 static inline void volk_gnsssdr_16ic_rotatorpuppet_16ic_neon(lv_16sc_t* outVector, const lv_16sc_t* inVector, unsigned int num_points)
 {
@@ -105,4 +153,20 @@ static inline void volk_gnsssdr_16ic_rotatorpuppet_16ic_neon(lv_16sc_t* outVecto
 #endif /* LV_HAVE_NEON */


+#ifdef LV_HAVE_NEON
+static inline void volk_gnsssdr_16ic_rotatorpuppet_16ic_neon_reload(lv_16sc_t* outVector, const lv_16sc_t* inVector, unsigned int num_points)
+{
+    // phases must be normalized. Phase rotator expects a complex exponential input!
+    float rem_carrier_phase_in_rad = 0.345;
+    float phase_step_rad = 0.123;
+    lv_32fc_t phase[1];
+    phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), -sin(rem_carrier_phase_in_rad));
+    lv_32fc_t phase_inc[1];
+    phase_inc[0] = lv_cmake(cos(phase_step_rad), -sin(phase_step_rad));
+    volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_neon_reload(outVector, inVector, phase_inc[0], phase, num_points);
+}
+
+#endif /* LV_HAVE_NEON */
+
+
 #endif /* INCLUDED_volk_gnsssdr_16ic_rotatorpuppet_16ic_H */
--- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_s32fc_x2_rotator_16ic.h
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_s32fc_x2_rotator_16ic.h
@@ -61,6 +61,7 @@

 #include <volk_gnsssdr/volk_gnsssdr_complex.h>
 #include <math.h>
+//#include <stdio.h>


 #ifdef LV_HAVE_GENERIC
@@ -71,6 +72,56 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_generic(lv_16sc_t* ou
    lv_16sc_t tmp16;
    lv_32fc_t tmp32;
    for(i = 0; i < (unsigned int)(num_points); ++i)
+        {
+            tmp16 = *inVector++;
+            tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase);
+            *outVector++ = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32)));
+            (*phase) *= phase_inc;
+            // Regenerate phase
+            if (i % 512 == 0)
+                {
+                    //printf("Phase before regeneration %i: %f,%f  Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase));
+#ifdef __cplusplus
+                    (*phase) /= std::abs((*phase));
+#else
+                    (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
+#endif
+                    //printf("Phase after regeneration %i: %f,%f  Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase));
+                }
+        }
+}
+
+#endif /* LV_HAVE_GENERIC */
+
+
+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_generic_reload(lv_16sc_t* outVector, const lv_16sc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points)
+{
+    unsigned int ROTATOR_RELOAD = 512;
+    unsigned int n = 0;
+    unsigned int j = 0;
+    lv_16sc_t tmp16;
+    lv_32fc_t tmp32;
+    for (; n < num_points / ROTATOR_RELOAD; n++)
+        {
+            for (j = 0; j < ROTATOR_RELOAD; j++)
+                {
+                    tmp16 = *inVector++;
+                    tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase);
+                    *outVector++ = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32)));
+                    (*phase) *= phase_inc;
+                }
+            // Regenerate phase
+            //printf("Phase before regeneration %i: %f,%f  Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase));
+#ifdef __cplusplus
+            (*phase) /= std::abs((*phase));
+#else
+            (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
+#endif
+            //printf("Phase after regeneration %i: %f,%f  Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase));
+        }
+    for (j = 0; j < num_points % ROTATOR_RELOAD; j++)
        {
            tmp16 = *inVector++;
            tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase);
@@ -130,6 +181,168 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3(lv_16sc_t* out
            //next two samples
            _in += 2;
            a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
+            __builtin_prefetch(_in + 8);
+            //complex 32fc multiplication b=a*two_phase_acc_reg
+            yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
+            yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
+            tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+            a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br
+            tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+            b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+            c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
+
+            //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
+            yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
+            yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
+            tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+            tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
+            tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+            two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+
+            // store four output samples
+            result = _mm_packs_epi32(c1, c2);// convert from 32ic to 16ic
+            _mm_store_si128((__m128i*)_out, result);
+
+            // Regenerate phase
+            if ((number % 512) == 0)
+                {
+                    tmp1 = _mm_mul_ps(two_phase_acc_reg, two_phase_acc_reg);
+                    tmp2 = _mm_hadd_ps(tmp1, tmp1);
+                    tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
+                    tmp2 = _mm_sqrt_ps(tmp1);
+                    two_phase_acc_reg = _mm_div_ps(two_phase_acc_reg, tmp2);
+                }
+
+            //next two samples
+            _in += 2;
+            _out += 4;
+        }
+
+    _mm_storeu_ps((float*)two_phase_acc, two_phase_acc_reg);
+    (*phase) = two_phase_acc[0];
+
+    for (unsigned int i = sse_iters * 4; i < num_points; ++i)
+        {
+            tmp16 = *_in++;
+            tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase);
+            *_out++ = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32)));
+            (*phase) *= phase_inc;
+        }
+}
+
+#endif /* LV_HAVE_SSE3 */
+
+
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+
+static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3_reload(lv_16sc_t* outVector, const lv_16sc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points)
+{
+    const unsigned int sse_iters = num_points / 4;
+    const unsigned int ROTATOR_RELOAD = 512;
+    __m128 a, b, two_phase_acc_reg, two_phase_inc_reg;
+    __m128i c1, c2, result;
+    __attribute__((aligned(16))) lv_32fc_t two_phase_inc[2];
+    two_phase_inc[0] = phase_inc * phase_inc;
+    two_phase_inc[1] = phase_inc * phase_inc;
+    two_phase_inc_reg = _mm_load_ps((float*) two_phase_inc);
+    __attribute__((aligned(16))) lv_32fc_t two_phase_acc[2];
+    two_phase_acc[0] = (*phase);
+    two_phase_acc[1] = (*phase) * phase_inc;
+    two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc);
+
+    const lv_16sc_t* _in = inVector;
+
+    lv_16sc_t* _out = outVector;
+
+    __m128 yl, yh, tmp1, tmp2, tmp3;
+    lv_16sc_t tmp16;
+    lv_32fc_t tmp32;
+
+    for (unsigned int n = 0; n < sse_iters / ROTATOR_RELOAD; n++)
+        {
+            for (unsigned int j = 0; j < ROTATOR_RELOAD; j++)
+                {
+                    a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
+                    //complex 32fc multiplication b=a*two_phase_acc_reg
+                    yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
+                    yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
+                    tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+                    a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br
+                    tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+                    b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+                    c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
+
+                    //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
+                    yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
+                    yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
+                    tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+                    tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
+                    tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+                    two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+
+                    //next two samples
+                    _in += 2;
+                    a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
+                    __builtin_prefetch(_in + 8);
+                    //complex 32fc multiplication b=a*two_phase_acc_reg
+                    yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
+                    yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
+                    tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+                    a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br
+                    tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+                    b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+                    c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
+
+                    //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
+                    yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
+                    yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
+                    tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+                    tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
+                    tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+                    two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+
+                    // store four output samples
+                    result = _mm_packs_epi32(c1, c2);// convert from 32ic to 16ic
+                    _mm_store_si128((__m128i*)_out, result);
+
+                    //next two samples
+                    _in += 2;
+                    _out += 4;
+                }
+            // Regenerate phase
+            tmp1 = _mm_mul_ps(two_phase_acc_reg, two_phase_acc_reg);
+            tmp2 = _mm_hadd_ps(tmp1, tmp1);
+            tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
+            tmp2 = _mm_sqrt_ps(tmp1);
+            two_phase_acc_reg = _mm_div_ps(two_phase_acc_reg, tmp2);
+        }
+
+    for (unsigned int j = 0; j < sse_iters % ROTATOR_RELOAD; j++)
+        {
+            a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
+            //complex 32fc multiplication b=a*two_phase_acc_reg
+            yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
+            yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
+            tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+            a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br
+            tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+            b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+            c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
+
+            //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
+            yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
+            yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
+            tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+            tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
+            tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+            two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+
+            //next two samples
+            _in += 2;
+            a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
+            __builtin_prefetch(_in + 8);
            //complex 32fc multiplication b=a*two_phase_acc_reg
            yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
            yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
@@ -171,6 +384,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3(lv_16sc_t* out
 #endif /* LV_HAVE_SSE3 */


+
 #ifdef LV_HAVE_SSE3
 #include <pmmintrin.h>

@@ -241,6 +455,16 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3(lv_16sc_t* out
            result = _mm_packs_epi32(c1, c2);// convert from 32ic to 16ic
            _mm_storeu_si128((__m128i*)_out, result);

+            // Regenerate phase
+            if ((number % 512) == 0)
+                {
+                    tmp1 = _mm_mul_ps(two_phase_acc_reg, two_phase_acc_reg);
+                    tmp2 = _mm_hadd_ps(tmp1, tmp1);
+                    tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
+                    tmp2 = _mm_sqrt_ps(tmp1);
+                    two_phase_acc_reg = _mm_div_ps(two_phase_acc_reg, tmp2);
+                }
+
            //next two samples
            _in += 2;
            _out += 4;
@@ -261,6 +485,156 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3(lv_16sc_t* out
 #endif /* LV_HAVE_SSE3 */


+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+
+static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3_reload(lv_16sc_t* outVector, const lv_16sc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points)
+{
+    const unsigned int sse_iters = num_points / 4;
+     unsigned int ROTATOR_RELOAD = 512;
+     __m128 a, b, two_phase_acc_reg, two_phase_inc_reg;
+     __m128i c1, c2, result;
+     __attribute__((aligned(16))) lv_32fc_t two_phase_inc[2];
+     two_phase_inc[0] = phase_inc * phase_inc;
+     two_phase_inc[1] = phase_inc * phase_inc;
+     two_phase_inc_reg = _mm_load_ps((float*) two_phase_inc);
+     __attribute__((aligned(16))) lv_32fc_t two_phase_acc[2];
+     two_phase_acc[0] = (*phase);
+     two_phase_acc[1] = (*phase) * phase_inc;
+     two_phase_acc_reg = _mm_load_ps((float*) two_phase_acc);
+
+     const lv_16sc_t* _in = inVector;
+
+     lv_16sc_t* _out = outVector;
+
+     __m128 yl, yh, tmp1, tmp2, tmp3;
+     lv_16sc_t tmp16;
+     lv_32fc_t tmp32;
+
+     for (unsigned int n = 0; n < sse_iters / ROTATOR_RELOAD; n++)
+         {
+             for (unsigned int j = 0; j < ROTATOR_RELOAD; j++)
+                 {
+                     a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
+                     //complex 32fc multiplication b=a*two_phase_acc_reg
+                     yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
+                     yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
+                     tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+                     a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br
+                     tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+                     b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+                     c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
+
+                     //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
+                     yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
+                     yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
+                     tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+                     tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
+                     tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+                     two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+
+                     //next two samples
+                     _in += 2;
+                     a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
+                     __builtin_prefetch(_in + 8);
+                     //complex 32fc multiplication b=a*two_phase_acc_reg
+                     yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
+                     yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
+                     tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+                     a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br
+                     tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+                     b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+                     c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
+
+                     //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
+                     yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
+                     yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
+                     tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+                     tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
+                     tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+                     two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+
+                     // store four output samples
+                     result = _mm_packs_epi32(c1, c2);// convert from 32ic to 16ic
+                     _mm_storeu_si128((__m128i*)_out, result);
+
+                     //next two samples
+                     _in += 2;
+                     _out += 4;
+                 }
+             // Regenerate phase
+             tmp1 = _mm_mul_ps(two_phase_acc_reg, two_phase_acc_reg);
+             tmp2 = _mm_hadd_ps(tmp1, tmp1);
+             tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
+             tmp2 = _mm_sqrt_ps(tmp1);
+             two_phase_acc_reg = _mm_div_ps(two_phase_acc_reg, tmp2);
+         }
+
+     for (unsigned int j = 0; j < sse_iters % ROTATOR_RELOAD; j++)
+         {
+             a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
+             //complex 32fc multiplication b=a*two_phase_acc_reg
+             yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
+             yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
+             tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+             a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br
+             tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+             b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+             c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
+
+             //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
+             yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
+             yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
+             tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+             tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
+             tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+             two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+
+             //next two samples
+             _in += 2;
+             a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
+             __builtin_prefetch(_in + 8);
+             //complex 32fc multiplication b=a*two_phase_acc_reg
+             yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
+             yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
+             tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+             a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br
+             tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+             b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+             c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
+
+             //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
+             yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
+             yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
+             tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+             tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
+             tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+             two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+
+             // store four output samples
+             result = _mm_packs_epi32(c1, c2);// convert from 32ic to 16ic
+             _mm_storeu_si128((__m128i*)_out, result);
+
+             //next two samples
+             _in += 2;
+             _out += 4;
+         }
+
+     _mm_storeu_ps((float*)two_phase_acc, two_phase_acc_reg);
+     (*phase) = two_phase_acc[0];
+
+     for (unsigned int i = sse_iters * 4; i < num_points; ++i)
+         {
+             tmp16 = *_in++;
+             tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase);
+             *_out++ = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32)));
+             (*phase) *= phase_inc;
+         }
+}
+
+#endif /* LV_HAVE_SSE3 */
+
+
 #ifdef LV_HAVE_NEON
 #include <arm_neon.h>

@@ -271,6 +645,10 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_neon(lv_16sc_t* outVe
    lv_16sc_t tmp16_;
    lv_32fc_t tmp32_;

+    float arg_phase0 = cargf(*phase);
+    float arg_phase_inc = cargf(phase_inc);
+    float phase_est = 0.0;
+
    const lv_16sc_t* _in = inVector;
    lv_16sc_t* _out = outVector;

@@ -351,6 +729,24 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_neon(lv_16sc_t* outVe
                    /* store the four complex results */
                    vst2_s16((int16_t*)_out, tmp16);
                    _out += 4;
+                    // Regenerate phase
+                    if ((i % 512) == 0)
+                        {
+                            //printf("Computed phase:  %f\n", cos(cargf(lv_cmake(_phase_real[0],_phase_imag[0]))));
+                            phase_est = arg_phase0 + (i + 1) * 4 * arg_phase_inc;
+                            //printf("Estimated phase: %f\n\n", cos(phase_est));
+
+                            *phase = lv_cmake(cos(phase_est), sin(phase_est));
+                            phase2 = (lv_32fc_t)(*phase) * phase_inc;
+                            phase3 = phase2 * phase_inc;
+                            phase4 = phase3 * phase_inc;
+
+                            __VOLK_ATTR_ALIGNED(16) float32_t  ____phase_real[4] = { lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) };
+                            __VOLK_ATTR_ALIGNED(16) float32_t ____phase_imag[4] = { lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) };
+
+                            _phase_real = vld1q_f32(____phase_real);
+                            _phase_imag = vld1q_f32(____phase_imag);
+                        }
                }
            vst1q_f32((float32_t*)__phase_real, _phase_real);
            vst1q_f32((float32_t*)__phase_imag, _phase_imag);
@@ -368,4 +764,190 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_neon(lv_16sc_t* outVe

 #endif /* LV_HAVE_NEON */

+
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_neon_reload(lv_16sc_t* outVector, const lv_16sc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points)
+{
+    unsigned int i = 0;
+    const unsigned int neon_iters = num_points / 4;
+    const unsigned int ROTATOR_RELOAD = 512;
+
+    lv_16sc_t tmp16_;
+    lv_32fc_t tmp32_;
+
+    float arg_phase0 = cargf(*phase);
+    float arg_phase_inc = cargf(phase_inc);
+    float phase_est = 0.0;
+
+    const lv_16sc_t* _in = inVector;
+    lv_16sc_t* _out = outVector;
+
+    lv_32fc_t ___phase4 = phase_inc * phase_inc * phase_inc * phase_inc;
+    __VOLK_ATTR_ALIGNED(16) float32_t __phase4_real[4] = { lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4) };
+    __VOLK_ATTR_ALIGNED(16) float32_t __phase4_imag[4] = { lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4) };
+
+    float32x4_t _phase4_real = vld1q_f32(__phase4_real);
+    float32x4_t _phase4_imag = vld1q_f32(__phase4_imag);
+
+    lv_32fc_t phase2 = (lv_32fc_t)(*phase) * phase_inc;
+    lv_32fc_t phase3 = phase2 * phase_inc;
+    lv_32fc_t phase4 = phase3 * phase_inc;
+
+    __VOLK_ATTR_ALIGNED(16) float32_t __phase_real[4] = { lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) };
+    __VOLK_ATTR_ALIGNED(16) float32_t __phase_imag[4] = { lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) };
+
+    float32x4_t _phase_real = vld1q_f32(__phase_real);
+    float32x4_t _phase_imag = vld1q_f32(__phase_imag);
+
+    float32x4_t half = vdupq_n_f32(0.5f);
+    int16x4x2_t tmp16;
+    int32x4x2_t tmp32i;
+    float32x4x2_t tmp32f, tmp_real, tmp_imag;
+    float32x4_t sign, PlusHalf, Round;
+
+    if (neon_iters > 0)
+        {
+            for (unsigned int n = 0; n < neon_iters / ROTATOR_RELOAD; n++)
+                {
+                    for (unsigned int j = 0; j < ROTATOR_RELOAD; j++)
+                        {
+                            /* load 4 complex numbers (int 16 bits each component) */
+                            tmp16 = vld2_s16((int16_t*)_in);
+                            __builtin_prefetch(_in + 8);
+                            _in += 4;
+
+                            /* promote them to int 32 bits */
+                            tmp32i.val[0] = vmovl_s16(tmp16.val[0]);
+                            tmp32i.val[1] = vmovl_s16(tmp16.val[1]);
+
+                            /* promote them to float 32 bits */
+                            tmp32f.val[0] = vcvtq_f32_s32(tmp32i.val[0]);
+                            tmp32f.val[1] = vcvtq_f32_s32(tmp32i.val[1]);
+
+                            /* complex multiplication of four complex samples (float 32 bits each component) */
+                            tmp_real.val[0] = vmulq_f32(tmp32f.val[0], _phase_real);
+                            tmp_real.val[1] = vmulq_f32(tmp32f.val[1], _phase_imag);
+                            tmp_imag.val[0] = vmulq_f32(tmp32f.val[0], _phase_imag);
+                            tmp_imag.val[1] = vmulq_f32(tmp32f.val[1], _phase_real);
+
+                            tmp32f.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]);
+                            tmp32f.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]);
+
+                            /* downcast results to int32 */
+                            /* in __aarch64__ we can do that with vcvtaq_s32_f32(ret1); vcvtaq_s32_f32(ret2); */
+                            sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(tmp32f.val[0]), 31)));
+                            PlusHalf = vaddq_f32(tmp32f.val[0], half);
+                            Round = vsubq_f32(PlusHalf, sign);
+                            tmp32i.val[0] = vcvtq_s32_f32(Round);
+
+                            sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(tmp32f.val[1]), 31)));
+                            PlusHalf = vaddq_f32(tmp32f.val[1], half);
+                            Round = vsubq_f32(PlusHalf, sign);
+                            tmp32i.val[1] = vcvtq_s32_f32(Round);
+
+                            /* downcast results to int16 */
+                            tmp16.val[0] = vqmovn_s32(tmp32i.val[0]);
+                            tmp16.val[1] = vqmovn_s32(tmp32i.val[1]);
+
+                            /* compute next four phases */
+                            tmp_real.val[0] = vmulq_f32(_phase_real, _phase4_real);
+                            tmp_real.val[1] = vmulq_f32(_phase_imag, _phase4_imag);
+                            tmp_imag.val[0] = vmulq_f32(_phase_real, _phase4_imag);
+                            tmp_imag.val[1] = vmulq_f32(_phase_imag, _phase4_real);
+
+                            _phase_real = vsubq_f32(tmp_real.val[0], tmp_real.val[1]);
+                            _phase_imag = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]);
+
+                            /* store the four complex results */
+                            vst2_s16((int16_t*)_out, tmp16);
+                            _out += 4;
+                        }
+                    // Regenerate phase
+                    //printf("Computed phase:  %f\n", cos(cargf(lv_cmake(_phase_real[0],_phase_imag[0]))));
+                    phase_est = arg_phase0 + (n + 1) * ROTATOR_RELOAD * 4 * arg_phase_inc;
+                    //printf("Estimated phase: %f\n\n", cos(phase_est));
+                    *phase = lv_cmake(cos(phase_est), sin(phase_est));
+                    phase2 = (lv_32fc_t)(*phase) * phase_inc;
+                    phase3 = phase2 * phase_inc;
+                    phase4 = phase3 * phase_inc;
+
+                    __VOLK_ATTR_ALIGNED(16) float32_t ____phase_real[4] = { lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) };
+                    __VOLK_ATTR_ALIGNED(16) float32_t ____phase_imag[4] = { lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) };
+
+                    _phase_real = vld1q_f32(____phase_real);
+                    _phase_imag = vld1q_f32(____phase_imag);
+                }
+
+            for (unsigned int j = 0; j < neon_iters % ROTATOR_RELOAD; j++)
+                {
+                    /* load 4 complex numbers (int 16 bits each component) */
+                    tmp16 = vld2_s16((int16_t*)_in);
+                    __builtin_prefetch(_in + 8);
+                    _in += 4;
+
+                    /* promote them to int 32 bits */
+                    tmp32i.val[0] = vmovl_s16(tmp16.val[0]);
+                    tmp32i.val[1] = vmovl_s16(tmp16.val[1]);
+
+                    /* promote them to float 32 bits */
+                    tmp32f.val[0] = vcvtq_f32_s32(tmp32i.val[0]);
+                    tmp32f.val[1] = vcvtq_f32_s32(tmp32i.val[1]);
+
+                    /* complex multiplication of four complex samples (float 32 bits each component) */
+                    tmp_real.val[0] = vmulq_f32(tmp32f.val[0], _phase_real);
+                    tmp_real.val[1] = vmulq_f32(tmp32f.val[1], _phase_imag);
+                    tmp_imag.val[0] = vmulq_f32(tmp32f.val[0], _phase_imag);
+                    tmp_imag.val[1] = vmulq_f32(tmp32f.val[1], _phase_real);
+
+                    tmp32f.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]);
+                    tmp32f.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]);
+
+                    /* downcast results to int32 */
+                    /* in __aarch64__ we can do that with vcvtaq_s32_f32(ret1); vcvtaq_s32_f32(ret2); */
+                    sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(tmp32f.val[0]), 31)));
+                    PlusHalf = vaddq_f32(tmp32f.val[0], half);
+                    Round = vsubq_f32(PlusHalf, sign);
+                    tmp32i.val[0] = vcvtq_s32_f32(Round);
+
+                    sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(tmp32f.val[1]), 31)));
+                    PlusHalf = vaddq_f32(tmp32f.val[1], half);
+                    Round = vsubq_f32(PlusHalf, sign);
+                    tmp32i.val[1] = vcvtq_s32_f32(Round);
+
+                    /* downcast results to int16 */
+                    tmp16.val[0] = vqmovn_s32(tmp32i.val[0]);
+                    tmp16.val[1] = vqmovn_s32(tmp32i.val[1]);
+
+                    /* compute next four phases */
+                    tmp_real.val[0] = vmulq_f32(_phase_real, _phase4_real);
+                    tmp_real.val[1] = vmulq_f32(_phase_imag, _phase4_imag);
+                    tmp_imag.val[0] = vmulq_f32(_phase_real, _phase4_imag);
+                    tmp_imag.val[1] = vmulq_f32(_phase_imag, _phase4_real);
+
+                    _phase_real = vsubq_f32(tmp_real.val[0], tmp_real.val[1]);
+                    _phase_imag = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]);
+
+                    /* store the four complex results */
+                    vst2_s16((int16_t*)_out, tmp16);
+                    _out += 4;
+                }
+
+            vst1q_f32((float32_t*)__phase_real, _phase_real);
+            vst1q_f32((float32_t*)__phase_imag, _phase_imag);
+
+            (*phase) = lv_cmake((float32_t)__phase_real[0], (float32_t)__phase_imag[0]);
+        }
+    for(i = 0; i < neon_iters % 4; ++i)
+        {
+            tmp16_ = *_in++;
+            tmp32_ = lv_cmake((float32_t)lv_creal(tmp16_), (float32_t)lv_cimag(tmp16_)) * (*phase);
+            *_out++ = lv_cmake((int16_t)rintf(lv_creal(tmp32_)), (int16_t)rintf(lv_cimag(tmp32_)));
+            (*phase) *= phase_inc;
+        }
+}
+
+#endif /* LV_HAVE_NEON */
+
 #endif /* INCLUDED_volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_H */
--- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_dot_prod_16ic.h
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_dot_prod_16ic.h
@@ -94,7 +94,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_a_sse2(lv_16sc_t* out, con

    if (sse_iters > 0)
        {
-            __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, realcacc, imagcacc, result;
+            __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, realcacc, imagcacc;
            __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4];

            realcacc = _mm_setzero_si128();
@@ -105,8 +105,6 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_a_sse2(lv_16sc_t* out, con

            for(unsigned int number = 0; number < sse_iters; number++)
                {
-                    //std::complex<T> memory structure: real part -> reinterpret_cast<cv T*>(a)[2*i]
-                    //imaginery part -> reinterpret_cast<cv T*>(a)[2*i + 1]
                    // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
                    a = _mm_load_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
                    __builtin_prefetch(_in_a + 8);
@@ -115,7 +113,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_a_sse2(lv_16sc_t* out, con
                    c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....

                    c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
-                    real = _mm_subs_epi16(c,c_sr);
+                    real = _mm_subs_epi16(c, c_sr);

                    b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
                    a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....
@@ -123,7 +121,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_a_sse2(lv_16sc_t* out, con
                    imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
                    imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....

-                    imag = _mm_adds_epi16(imag1, imag2); //with saturation aritmetic!
+                    imag = _mm_adds_epi16(imag1, imag2); //with saturation arithmetic!

                    realcacc = _mm_adds_epi16(realcacc, real);
                    imagcacc = _mm_adds_epi16(imagcacc, imag);
@@ -135,9 +133,9 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_a_sse2(lv_16sc_t* out, con
            realcacc = _mm_and_si128(realcacc, mask_real);
            imagcacc = _mm_and_si128(imagcacc, mask_imag);

-            result = _mm_or_si128(realcacc, imagcacc);
+            a = _mm_or_si128(realcacc, imagcacc);

-            _mm_store_si128((__m128i*)dotProductVector,result); // Store the results back into the dot product vector
+            _mm_store_si128((__m128i*)dotProductVector, a); // Store the results back into the dot product vector

            for (int i = 0; i < 4; ++i)
                {
@@ -202,7 +200,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_u_sse2(lv_16sc_t* out, con
                    imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
                    imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....

-                    imag = _mm_adds_epi16(imag1, imag2); //with saturation aritmetic!
+                    imag = _mm_adds_epi16(imag1, imag2); //with saturation arithmetic!

                    realcacc = _mm_adds_epi16(realcacc, real);
                    imagcacc = _mm_adds_epi16(imagcacc, imag);
@@ -245,46 +243,57 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_neon(lv_16sc_t* out, const

    lv_16sc_t* a_ptr = (lv_16sc_t*) in_a;
    lv_16sc_t* b_ptr = (lv_16sc_t*) in_b;
-    // for 2-lane vectors, 1st lane holds the real part,
-    // 2nd lane holds the imaginary part
-    int16x4x2_t a_val, b_val, c_val, accumulator;
-    int16x4x2_t tmp_real, tmp_imag;
-    __VOLK_ATTR_ALIGNED(16) lv_16sc_t accum_result[4];
-    accumulator.val[0] = vdup_n_s16(0);
-    accumulator.val[1] = vdup_n_s16(0);
+    *out = lv_cmake((int16_t)0, (int16_t)0);

-    for(number = 0; number < quarter_points; ++number)
+    if (quarter_points > 0)
        {
-            a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
-            b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
-            __builtin_prefetch(a_ptr + 8);
-            __builtin_prefetch(b_ptr + 8);
+            // for 2-lane vectors, 1st lane holds the real part,
+            // 2nd lane holds the imaginary part
+            int16x4x2_t a_val, b_val, c_val, accumulator;
+            int16x4x2_t tmp_real, tmp_imag;
+            __VOLK_ATTR_ALIGNED(16) lv_16sc_t accum_result[4];
+            accumulator.val[0] = vdup_n_s16(0);
+            accumulator.val[1] = vdup_n_s16(0);
+            lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0);

-            // multiply the real*real and imag*imag to get real result
-            // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
-            tmp_real.val[0] = vmul_s16(a_val.val[0], b_val.val[0]);
-            // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i
-            tmp_real.val[1] = vmul_s16(a_val.val[1], b_val.val[1]);
+            for(number = 0; number < quarter_points; ++number)
+                {
+                    a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
+                    b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
+                    __builtin_prefetch(a_ptr + 8);
+                    __builtin_prefetch(b_ptr + 8);

-            // Multiply cross terms to get the imaginary result
-            // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i
-            tmp_imag.val[0] = vmul_s16(a_val.val[0], b_val.val[1]);
-            // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r
-            tmp_imag.val[1] = vmul_s16(a_val.val[1], b_val.val[0]);
+                    // multiply the real*real and imag*imag to get real result
+                    // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
+                    tmp_real.val[0] = vmul_s16(a_val.val[0], b_val.val[0]);
+                    // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i
+                    tmp_real.val[1] = vmul_s16(a_val.val[1], b_val.val[1]);

-            c_val.val[0] = vsub_s16(tmp_real.val[0], tmp_real.val[1]);
-            c_val.val[1] = vadd_s16(tmp_imag.val[0], tmp_imag.val[1]);
+                    // Multiply cross terms to get the imaginary result
+                    // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i
+                    tmp_imag.val[0] = vmul_s16(a_val.val[0], b_val.val[1]);
+                    // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r
+                    tmp_imag.val[1] = vmul_s16(a_val.val[1], b_val.val[0]);

-            accumulator.val[0] = vadd_s16(accumulator.val[0], c_val.val[0]);
-            accumulator.val[1] = vadd_s16(accumulator.val[1], c_val.val[1]);
+                    c_val.val[0] = vqsub_s16(tmp_real.val[0], tmp_real.val[1]);
+                    c_val.val[1] = vqadd_s16(tmp_imag.val[0], tmp_imag.val[1]);

-            a_ptr += 4;
-            b_ptr += 4;
+                    accumulator.val[0] = vqadd_s16(accumulator.val[0], c_val.val[0]);
+                    accumulator.val[1] = vqadd_s16(accumulator.val[1], c_val.val[1]);
+
+                    a_ptr += 4;
+                    b_ptr += 4;
+                }
+
+            vst2_s16((int16_t*)accum_result, accumulator);
+            for (unsigned int i = 0; i < 4; ++i)
+                {
+                    dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(accum_result[i])), sat_adds16i(lv_cimag(dotProduct), lv_cimag(accum_result[i])));
+                }
+
+            *out = dotProduct;
        }

-    vst2_s16((int16_t*)accum_result, accumulator);
-    *out = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];
-
    // tail case
    for(number = quarter_points * 4; number < num_points; ++number)
        {
--- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_dot_prod_16ic_xn.h
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_dot_prod_16ic_xn.h
@@ -84,6 +84,25 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_generic(lv_16sc_t* resu
 #endif /*LV_HAVE_GENERIC*/


+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_generic_sat(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points)
+{
+    for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
+        {
+            result[n_vec] = lv_cmake(0,0);
+            for (unsigned int n = 0; n < num_points; n++)
+                {
+                    lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(in_common[n]), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(in_common[n]), lv_cimag(in_a[n_vec][n]))),
+                                             sat_adds16i(sat_muls16i(lv_creal(in_common[n]), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(in_common[n]), lv_creal(in_a[n_vec][n]))));
+                    result[n_vec] = lv_cmake(sat_adds16i(lv_creal(result[n_vec]), lv_creal(tmp)), sat_adds16i(lv_cimag(result[n_vec]), lv_cimag(tmp)));
+                }
+        }
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
 #ifdef LV_HAVE_SSE2
 #include <emmintrin.h>

@@ -398,8 +417,8 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_vma(lv_16sc_t* res
                            tmp.val[0] = vmls_s16(tmp.val[0], a_val.val[1], b_val.val[1]);
                            tmp.val[1] = vmla_s16(tmp.val[1], a_val.val[0], b_val.val[1]);

-                            accumulator[n_vec].val[0] = vadd_s16(accumulator[n_vec].val[0], tmp.val[0]);
-                            accumulator[n_vec].val[1] = vadd_s16(accumulator[n_vec].val[1], tmp.val[1]);
+                            accumulator[n_vec].val[0] = vqadd_s16(accumulator[n_vec].val[0], tmp.val[0]);
+                            accumulator[n_vec].val[1] = vqadd_s16(accumulator[n_vec].val[1], tmp.val[1]);
                        }
                    _in_common += 4;
                }
--- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic.h
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic.h
@@ -63,6 +63,30 @@ static inline void volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_generic(lv_16sc_t*

 #endif  /* Generic */

+
+#ifdef LV_HAVE_GENERIC
+static inline void volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_generic_sat(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points)
+{
+    int num_a_vectors = 3;
+    lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
+    for(unsigned int n = 0; n < num_a_vectors; n++)
+    {
+       in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
+       memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
+    }
+
+    volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_generic_sat(result, local_code, (const lv_16sc_t**) in_a, num_a_vectors, num_points);
+
+    for(unsigned int n = 0; n < num_a_vectors; n++)
+    {
+        volk_gnsssdr_free(in_a[n]);
+    }
+    volk_gnsssdr_free(in_a);
+}
+
+#endif  /* Generic */
+
+
 #ifdef LV_HAVE_SSE2
 static inline void volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_a_sse2(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points)
 {
--- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn.h
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn.h
@@ -43,6 +43,7 @@
 *
 * Rotates and multiplies the reference complex vector with an arbitrary number of other complex vectors,
 * accumulates the results and stores them in the output vector.
+ * The rotation is done at a fixed rate per sample, from an initial \p phase offset.
 * This function can be used for Doppler wipe-off and multiple correlator.
 *
 * <b>Dispatcher Prototype</b>
@@ -71,7 +72,7 @@
 #include <volk_gnsssdr/volk_gnsssdr_complex.h>
 #include <volk_gnsssdr/saturation_arithmetic.h>
 #include <math.h>
-//#include <stdio.h>
+#include <stdio.h>

 #ifdef LV_HAVE_GENERIC

@@ -88,6 +89,19 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_generic(lv_16sc
            tmp16 = *in_common++; //if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase));
            tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase);
            tmp16 = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32)));
+
+            // Regenerate phase
+            if (n % 256 == 0)
+                {
+                    //printf("Phase before regeneration %i: %f,%f  Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase));
+#ifdef __cplusplus
+                    (*phase) /= std::abs((*phase));
+#else
+                    (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
+#endif
+                    //printf("Phase after regeneration %i: %f,%f  Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase));
+                }
+
            (*phase) *= phase_inc;
            for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
                {
@@ -101,6 +115,60 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_generic(lv_16sc
 #endif /*LV_HAVE_GENERIC*/


+#ifdef LV_HAVE_GENERIC
+
+static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_generic_reload(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points)
+{
+    lv_16sc_t tmp16;
+    lv_32fc_t tmp32;
+    const unsigned int ROTATOR_RELOAD = 256;
+    for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
+        {
+            result[n_vec] = lv_cmake(0,0);
+        }
+
+    for (unsigned int n = 0; n < num_points / ROTATOR_RELOAD; n++)
+        {
+            for (unsigned int j = 0; j < ROTATOR_RELOAD; j++)
+                {
+                    tmp16 = *in_common++; //if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase));
+                    tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase);
+                    tmp16 = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32)));
+                    (*phase) *= phase_inc;
+                    for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
+                        {
+                            lv_16sc_t tmp = tmp16 * in_a[n_vec][n * ROTATOR_RELOAD + j];
+                            //lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(tmp16), lv_cimag(in_a[n_vec][n]))) , sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(tmp16), lv_creal(in_a[n_vec][n]))));
+                            result[n_vec] = lv_cmake(sat_adds16i(lv_creal(result[n_vec]), lv_creal(tmp)), sat_adds16i(lv_cimag(result[n_vec]), lv_cimag(tmp)));
+                        }
+                }
+            /* Regenerate phase */
+#ifdef __cplusplus
+            (*phase) /= std::abs((*phase));
+#else
+            //(*phase) /= cabsf((*phase));
+            (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
+#endif
+        }
+
+    for (unsigned int j = 0; j < num_points % ROTATOR_RELOAD; j++)
+        {
+            tmp16 = *in_common++; //if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase));
+            tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase);
+            tmp16 = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32)));
+            (*phase) *= phase_inc;
+            for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
+                {
+                    lv_16sc_t tmp = tmp16 * in_a[n_vec][ (num_points / ROTATOR_RELOAD) * ROTATOR_RELOAD + j ];
+                    //lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(tmp16), lv_cimag(in_a[n_vec][n]))) , sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(tmp16), lv_creal(in_a[n_vec][n]))));
+                    result[n_vec] = lv_cmake(sat_adds16i(lv_creal(result[n_vec]), lv_creal(tmp)), sat_adds16i(lv_cimag(result[n_vec]), lv_cimag(tmp)));
+                }
+        }
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+
 #ifdef LV_HAVE_SSE3
 #include <pmmintrin.h>

@@ -169,6 +237,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc_
            //next two samples
            _in_common += 2;
            pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
+            __builtin_prefetch(_in_common + 8);
            //complex 32fc multiplication b=a*two_phase_acc_reg
            yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
            yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
@@ -212,6 +281,15 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc_
                    realcacc[n_vec] = _mm_adds_epi16(realcacc[n_vec], real);
                    imagcacc[n_vec] = _mm_adds_epi16(imagcacc[n_vec], imag);
                }
+            // Regenerate phase
+            if ((number % 128) == 0)
+                {
+                    tmp1 = _mm_mul_ps(two_phase_acc_reg, two_phase_acc_reg);
+                    tmp2 = _mm_hadd_ps(tmp1, tmp1);
+                    tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
+                    tmp2 = _mm_sqrt_ps(tmp1);
+                    two_phase_acc_reg = _mm_div_ps(two_phase_acc_reg, tmp2);
+                }
        }

    for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
@@ -233,6 +311,254 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc_
    free(realcacc);
    free(imagcacc);

+    tmp1 = _mm_mul_ps(two_phase_acc_reg, two_phase_acc_reg);
+    tmp2 = _mm_hadd_ps(tmp1, tmp1);
+    tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
+    tmp2 = _mm_sqrt_ps(tmp1);
+    two_phase_acc_reg = _mm_div_ps(two_phase_acc_reg, tmp2);
+
+    _mm_store_ps((float*)two_phase_acc, two_phase_acc_reg);
+    //(*phase) = lv_cmake((float*)two_phase_acc[0], (float*)two_phase_acc[1]);
+    (*phase) = two_phase_acc[0];
+
+    for(unsigned int n  = sse_iters * 4; n < num_points; n++)
+        {
+            tmp16 = in_common[n];  //printf("a_sse phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase));
+            tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase);
+            tmp16 = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32)));
+            (*phase) *= phase_inc;
+
+            for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
+                {
+                    lv_16sc_t tmp = tmp16 * in_a[n_vec][n];
+                    //lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(tmp16), lv_cimag(in_a[n_vec][n]))) , sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(tmp16), lv_creal(in_a[n_vec][n]))));
+                    _out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)),
+                            sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp)));
+                }
+        }
+}
+#endif /* LV_HAVE_SSE3 */
+
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+
+static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_16sc_t** in_a,  int num_a_vectors, unsigned int num_points)
+{
+    lv_16sc_t dotProduct = lv_cmake(0,0);
+
+    const unsigned int sse_iters = num_points / 4;
+    const unsigned int ROTATOR_RELOAD = 128;
+
+    const lv_16sc_t** _in_a = in_a;
+    const lv_16sc_t* _in_common = in_common;
+    lv_16sc_t* _out = result;
+
+    __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4];
+
+    //todo dyn mem reg
+
+    __m128i* realcacc;
+    __m128i* imagcacc;
+
+    realcacc = (__m128i*)calloc(num_a_vectors, sizeof(__m128i)); //calloc also sets memory to 0
+    imagcacc = (__m128i*)calloc(num_a_vectors, sizeof(__m128i)); //calloc also sets memory to 0
+
+    __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl;
+
+    mask_imag = _mm_set_epi8(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0);
+    mask_real = _mm_set_epi8(0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255);
+
+    // phase rotation registers
+    __m128 pa, pb, two_phase_acc_reg, two_phase_inc_reg;
+    __m128i pc1, pc2;
+    __attribute__((aligned(16))) lv_32fc_t two_phase_inc[2];
+    two_phase_inc[0] = phase_inc * phase_inc;
+    two_phase_inc[1] = phase_inc * phase_inc;
+    two_phase_inc_reg = _mm_load_ps((float*) two_phase_inc);
+    __attribute__((aligned(16))) lv_32fc_t two_phase_acc[2];
+    two_phase_acc[0] = (*phase);
+    two_phase_acc[1] = (*phase) * phase_inc;
+    two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc);
+    __m128 yl, yh, tmp1, tmp2, tmp3;
+    lv_16sc_t tmp16;
+    lv_32fc_t tmp32;
+
+    for (unsigned int number = 0; number <  sse_iters / ROTATOR_RELOAD; ++number)
+        {
+            for (unsigned int j = 0; j < ROTATOR_RELOAD; j++)
+                {
+                    // Phase rotation on operand in_common starts here:
+                    //printf("generic phase %i: %f,%f\n", n*4,lv_creal(*phase),lv_cimag(*phase));
+                    pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
+                    //complex 32fc multiplication b=a*two_phase_acc_reg
+                    yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
+                    yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
+                    tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+                    pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br
+                    tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+                    pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+                    pc1 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic
+
+                    //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
+                    yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
+                    yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
+                    tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+                    tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
+                    tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+                    two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+
+                    //next two samples
+                    _in_common += 2;
+                    pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
+                    __builtin_prefetch(_in_common + 8);
+                    //complex 32fc multiplication b=a*two_phase_acc_reg
+                    yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
+                    yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
+                    tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+                    pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br
+                    tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+                    pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+                    pc2 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic
+
+                    //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
+                    yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
+                    yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
+                    tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+                    tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
+                    tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+                    two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+
+                    // store four rotated in_common samples in the register b
+                    b = _mm_packs_epi32(pc1, pc2);// convert from 32ic to 16ic
+
+                    //next two samples
+                    _in_common += 2;
+
+                    for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
+                        {
+                            a = _mm_load_si128((__m128i*)&(_in_a[n_vec][(number * ROTATOR_RELOAD + j) * 4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
+
+                            c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....
+
+                            c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
+                            real = _mm_subs_epi16(c, c_sr);
+
+                            b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
+                            a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....
+
+                            imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
+                            imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
+
+                            imag = _mm_adds_epi16(imag1, imag2);
+
+                            realcacc[n_vec] = _mm_adds_epi16(realcacc[n_vec], real);
+                            imagcacc[n_vec] = _mm_adds_epi16(imagcacc[n_vec], imag);
+                        }
+                }
+            // regenerate phase
+            tmp1 = _mm_mul_ps(two_phase_acc_reg, two_phase_acc_reg);
+            tmp2 = _mm_hadd_ps(tmp1, tmp1);
+            tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
+            tmp2 = _mm_sqrt_ps(tmp1);
+            two_phase_acc_reg = _mm_div_ps(two_phase_acc_reg, tmp2);
+        }
+
+    for (unsigned int j = 0; j < sse_iters % ROTATOR_RELOAD; j++)
+        {
+            pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
+            //complex 32fc multiplication b=a*two_phase_acc_reg
+            yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
+            yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
+            tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+            pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br
+            tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+            pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+            pc1 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic
+
+            //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
+            yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
+            yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
+            tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+            tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
+            tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+            two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+
+            //next two samples
+            _in_common += 2;
+            pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
+            __builtin_prefetch(_in_common + 8);
+            //complex 32fc multiplication b=a*two_phase_acc_reg
+            yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
+            yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
+            tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+            pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br
+            tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+            pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+            pc2 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic
+
+            //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
+            yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
+            yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
+            tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+            tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
+            tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+            two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+
+            // store four rotated in_common samples in the register b
+            b = _mm_packs_epi32(pc1, pc2);// convert from 32ic to 16ic
+
+            //next two samples
+            _in_common += 2;
+
+            for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
+                {
+                    a = _mm_load_si128((__m128i*)&(_in_a[n_vec][((sse_iters / ROTATOR_RELOAD)  * ROTATOR_RELOAD + j) * 4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
+
+                    c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....
+
+                    c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
+                    real = _mm_subs_epi16(c, c_sr);
+
+                    b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
+                    a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....
+
+                    imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
+                    imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
+
+                    imag = _mm_adds_epi16(imag1, imag2);
+
+                    realcacc[n_vec] = _mm_adds_epi16(realcacc[n_vec], real);
+                    imagcacc[n_vec] = _mm_adds_epi16(imagcacc[n_vec], imag);
+                }
+        }
+
+    for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
+        {
+            realcacc[n_vec] = _mm_and_si128(realcacc[n_vec], mask_real);
+            imagcacc[n_vec] = _mm_and_si128(imagcacc[n_vec], mask_imag);
+
+            a = _mm_or_si128(realcacc[n_vec], imagcacc[n_vec]);
+
+            _mm_store_si128((__m128i*)dotProductVector, a); // Store the results back into the dot product vector
+            dotProduct = lv_cmake(0,0);
+            for (int i = 0; i < 4; ++i)
+                {
+                    dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[i])),
+                            sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i])));
+                }
+            _out[n_vec] = dotProduct;
+        }
+
+    free(realcacc);
+    free(imagcacc);
+
+    tmp1 = _mm_mul_ps(two_phase_acc_reg, two_phase_acc_reg);
+    tmp2 = _mm_hadd_ps(tmp1, tmp1);
+    tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
+    tmp2 = _mm_sqrt_ps(tmp1);
+    two_phase_acc_reg = _mm_div_ps(two_phase_acc_reg, tmp2);
+
    _mm_store_ps((float*)two_phase_acc, two_phase_acc_reg);
    //(*phase) = lv_cmake((float*)two_phase_acc[0], (float*)two_phase_acc[1]);
    (*phase) = two_phase_acc[0];
@@ -303,7 +629,6 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc_
    for(unsigned int number = 0; number < sse_iters; number++)
        {
            // Phase rotation on operand in_common starts here:
-
            pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
            __builtin_prefetch(_in_common + 8);
            //complex 32fc multiplication b=a*two_phase_acc_reg
@@ -326,6 +651,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc_
            //next two samples
            _in_common += 2;
            pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
+            __builtin_prefetch(_in_common + 8);
            //complex 32fc multiplication b=a*two_phase_acc_reg
            yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
            yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
@@ -369,6 +695,15 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc_
                    realcacc[n_vec] = _mm_adds_epi16(realcacc[n_vec], real);
                    imagcacc[n_vec] = _mm_adds_epi16(imagcacc[n_vec], imag);
                }
+            // Regenerate phase
+            if ((number % 256) == 0)
+                {
+                    tmp1 = _mm_mul_ps(two_phase_acc_reg, two_phase_acc_reg);
+                    tmp2 = _mm_hadd_ps(tmp1, tmp1);
+                    tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
+                    tmp2 = _mm_sqrt_ps(tmp1);
+                    two_phase_acc_reg = _mm_div_ps(two_phase_acc_reg, tmp2);
+                }
        }

    for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
@@ -393,7 +728,6 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc_
    _mm_storeu_ps((float*)two_phase_acc, two_phase_acc_reg);
    (*phase) = two_phase_acc[0];

-
    for(unsigned int n  = sse_iters * 4; n < num_points; n++)
        {
            tmp16 = in_common[n];
@@ -428,6 +762,9 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon(lv_16sc_t*
    if (neon_iters > 0)
        {
            lv_16sc_t dotProduct = lv_cmake(0,0);
+            float arg_phase0 = cargf(*phase);
+            float arg_phase_inc = cargf(phase_inc);
+            float phase_est;

            lv_32fc_t ___phase4 = phase_inc * phase_inc * phase_inc * phase_inc;
            __VOLK_ATTR_ALIGNED(16) float32_t __phase4_real[4] = { lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4) };
@@ -538,6 +875,22 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon(lv_16sc_t*
                            accumulator[n_vec].val[0] = vqadd_s16(accumulator[n_vec].val[0], c_val.val[0]);
                            accumulator[n_vec].val[1] = vqadd_s16(accumulator[n_vec].val[1], c_val.val[1]);
                        }
+                    // Regenerate phase
+                    if ((number % 256) == 0)
+                        {
+                            phase_est = arg_phase0 + (number + 1) * 4 * arg_phase_inc;
+
+                            *phase = lv_cmake(cos(phase_est), sin(phase_est));
+                            phase2 = (lv_32fc_t)(*phase) * phase_inc;
+                            phase3 = phase2 * phase_inc;
+                            phase4 = phase3 * phase_inc;
+
+                            __VOLK_ATTR_ALIGNED(16) float32_t ____phase_real[4] = { lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) };
+                            __VOLK_ATTR_ALIGNED(16) float32_t ____phase_imag[4] = { lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) };
+
+                            _phase_real = vld1q_f32(____phase_real);
+                            _phase_imag = vld1q_f32(____phase_imag);
+                        }
                }

            for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
@@ -577,6 +930,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon(lv_16sc_t*

 #ifdef LV_HAVE_NEON
 #include <arm_neon.h>
+#include <volk_gnsssdr/volk_gnsssdr_neon_intrinsics.h>

 static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_vma(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_16sc_t** in_a,  int num_a_vectors, unsigned int num_points)
 {
@@ -592,7 +946,10 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_vma(lv_16s
    if (neon_iters > 0)
        {
            lv_16sc_t dotProduct = lv_cmake(0,0);
-
+            float arg_phase0 = cargf(*phase);
+            float arg_phase_inc = cargf(phase_inc);
+            float phase_est;
+            //printf("arg phase0: %f", arg_phase0);
            lv_32fc_t ___phase4 = phase_inc * phase_inc * phase_inc * phase_inc;
            __VOLK_ATTR_ALIGNED(16) float32_t __phase4_real[4] = { lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4) };
            __VOLK_ATTR_ALIGNED(16) float32_t __phase4_imag[4] = { lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4) };
@@ -677,6 +1034,37 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_vma(lv_16s
                    _phase_real = vsubq_f32(tmp32_real.val[0], tmp32_real.val[1]);
                    _phase_imag = vaddq_f32(tmp32_imag.val[0], tmp32_imag.val[1]);

+                    // Regenerate phase
+                    if ((number % 256) == 0)
+                        {
+                            //printf("computed phase: %f\n", cos(cargf(lv_cmake(_phase_real[0],_phase_imag[0]))));
+                            phase_est = arg_phase0 + (number + 1) * 4 * arg_phase_inc;
+                            //printf("Estimated phase: %f\n\n", cos(phase_est));
+
+                            *phase = lv_cmake(cos(phase_est), sin(phase_est));
+                            phase2 = (lv_32fc_t)(*phase) * phase_inc;
+                            phase3 = phase2 * phase_inc;
+                            phase4 = phase3 * phase_inc;
+
+                            __VOLK_ATTR_ALIGNED(16) float32_t ____phase_real[4] = { lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) };
+                            __VOLK_ATTR_ALIGNED(16) float32_t ____phase_imag[4] = { lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) };
+
+                            _phase_real = vld1q_f32(____phase_real);
+                            _phase_imag = vld1q_f32(____phase_imag);
+
+                            // Round = vmulq_f32(_phase_real, _phase_real);
+                            // Round = vmlaq_f32(Round, _phase_imag, _phase_imag);
+                            //               Round = vsqrtq_f32(Round);//printf("sqrt: %f \n", Round[0]);
+                            //Round = vrsqrteq_f32(Round);printf("1/sqtr: %f  \n",Round[0]);  
+                            //Round = vrecpeq_f32((Round);
+                            //              _phase_real = vdivq_f32(_phase_real, Round);
+                            //              _phase_imag = vdivq_f32(_phase_imag, Round);
+                            //_phase_real = vmulq_f32(_phase_real, Round);
+                            //_phase_imag = vmulq_f32(_phase_imag, Round);
+                            //printf("After  %i: %f,%f, %f\n\n", number, _phase_real[0], _phase_imag[0], sqrt(_phase_real[0]*_phase_real[0]+_phase_imag[0]*_phase_imag[0]));
+
+                        }
+
                    vst1q_f32((float32_t*)__phase_real, _phase_real);
                    vst1q_f32((float32_t*)__phase_imag, _phase_imag);

@@ -708,6 +1096,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_vma(lv_16s
                    _out[n_vec] = dotProduct;
                }
            free(accumulator);
+
            vst1q_f32((float32_t*)__phase_real, _phase_real);
            vst1q_f32((float32_t*)__phase_imag, _phase_imag);

@@ -731,3 +1120,4 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_vma(lv_16s
 #endif /* LV_HAVE_NEON */

 #endif /*INCLUDED_volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_H*/
+
--- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic.h
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic.h
@@ -70,6 +70,36 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_generic(lv_
 #endif  // Generic


+#ifdef LV_HAVE_GENERIC
+static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_generic_reload(lv_16sc_t* result, const lv_16sc_t* local_code,  const lv_16sc_t* in, unsigned int num_points)
+{
+    // phases must be normalized. Phase rotator expects a complex exponential input!
+    float rem_carrier_phase_in_rad = 0.345;
+    float phase_step_rad = 0.1;
+    lv_32fc_t phase[1];
+    phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad));
+    lv_32fc_t phase_inc[1];
+    phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad));
+
+    int num_a_vectors = 3;
+    lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
+    for(unsigned int n = 0; n < num_a_vectors; n++)
+        {
+            in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
+            memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
+        }
+    volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_generic_reload(result, local_code, phase_inc[0], phase,(const lv_16sc_t**) in_a, num_a_vectors, num_points);
+
+    for(unsigned int n = 0; n < num_a_vectors; n++)
+        {
+            volk_gnsssdr_free(in_a[n]);
+        }
+    volk_gnsssdr_free(in_a);
+}
+
+#endif  // Generic
+
+
 #ifdef LV_HAVE_SSE3
 static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_a_sse3(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points)
 {
@@ -101,6 +131,37 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_a_sse3(lv_1
 #endif // SSE3


+#ifdef LV_HAVE_SSE3
+static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_a_sse3_reload(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points)
+{
+    // phases must be normalized. Phase rotator expects a complex exponential input!
+    float rem_carrier_phase_in_rad = 0.345;
+    float phase_step_rad = 0.1;
+    lv_32fc_t phase[1];
+    phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad));
+    lv_32fc_t phase_inc[1];
+    phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad));
+
+    int num_a_vectors = 3;
+    lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
+    for(unsigned int n = 0; n < num_a_vectors; n++)
+        {
+            in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
+            memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
+        }
+
+    volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(result, local_code, phase_inc[0], phase, (const lv_16sc_t**) in_a, num_a_vectors, num_points);
+
+    for(unsigned int n = 0; n < num_a_vectors; n++)
+        {
+            volk_gnsssdr_free(in_a[n]);
+        }
+    volk_gnsssdr_free(in_a);
+}
+
+#endif // SSE3
+
+
 #ifdef LV_HAVE_SSE3
 static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_u_sse3(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points)
 {
--- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/kernel_tests.h
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/kernel_tests.h
@@ -58,6 +58,9 @@ std::vector<volk_gnsssdr_test_case_t> init_test_list(volk_gnsssdr_test_params_t
    // some others need more iterations *****  ADDED BY GNSS-SDR
    volk_gnsssdr_test_params_t test_params_more_iters = volk_gnsssdr_test_params_t(test_params.tol(), test_params.scalar(),
                test_params.vlen(), 100000, test_params.benchmark_mode(), test_params.kernel_regex());
+    // ... or more tolerance *****  ADDED BY GNSS-SDR
+    volk_gnsssdr_test_params_t test_params_int16 = volk_gnsssdr_test_params_t(16, test_params.scalar(),
+                test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex());

    std::vector<volk_gnsssdr_test_case_t> test_cases = boost::assign::list_of

@@ -77,11 +80,11 @@ std::vector<volk_gnsssdr_test_case_t> init_test_list(volk_gnsssdr_test_params_t
        (VOLK_INIT_TEST(volk_gnsssdr_16ic_x2_dot_prod_16ic, test_params))
        (VOLK_INIT_TEST(volk_gnsssdr_16ic_x2_multiply_16ic, test_params_more_iters))
        (VOLK_INIT_TEST(volk_gnsssdr_16ic_convert_32fc, test_params_more_iters))
-        (VOLK_INIT_PUPP(volk_gnsssdr_16ic_rotatorpuppet_16ic, volk_gnsssdr_16ic_s32fc_x2_rotator_16ic, test_params))
+        (VOLK_INIT_PUPP(volk_gnsssdr_16ic_rotatorpuppet_16ic, volk_gnsssdr_16ic_s32fc_x2_rotator_16ic, test_params_int1))
        (VOLK_INIT_PUPP(volk_gnsssdr_16ic_resamplerpuppet_16ic, volk_gnsssdr_16ic_resampler_16ic, test_params))
        (VOLK_INIT_PUPP(volk_gnsssdr_16ic_resamplerxnpuppet_16ic, volk_gnsssdr_16ic_xn_resampler_16ic_xn, test_params))
        (VOLK_INIT_PUPP(volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic, volk_gnsssdr_16ic_x2_dot_prod_16ic_xn, test_params))
-        (VOLK_INIT_PUPP(volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic, volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn, test_params))
+        (VOLK_INIT_PUPP(volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic, volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn, test_params_int16))
        ;

    return test_cases;
--- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/qa_utils.cc
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/qa_utils.cc
@@ -76,10 +76,10 @@ void load_random_data(void *data, volk_gnsssdr_type_t type, unsigned int n)
                        else ((uint32_t *)data)[i] = (uint32_t) scaled_rand;
                        break;
                    case 2:
-                        // 16 bits dot product saturates very fast even with moderate length vectors
-                    	// we produce here only 4 bits input range
-                        if(type.is_signed) ((int16_t *)data)[i] = (int16_t)((int16_t) scaled_rand % 16);
-                        else ((uint16_t *)data)[i] = (uint16_t) (int16_t)((int16_t) scaled_rand % 16);
+                        // 16 bit multiplication saturates very fast
+                    	// we produce here only 3 bits input range
+                        if(type.is_signed) ((int16_t *)data)[i] = (int16_t)((int16_t) scaled_rand % 8);
+                        else ((uint16_t *)data)[i] = (uint16_t) (int16_t)((int16_t) scaled_rand % 8);
                        break;
                    case 1:
                        if(type.is_signed) ((int8_t *)data)[i] = (int8_t) scaled_rand;
--- a/src/algorithms/tracking/adapters/CMakeLists.txt
+++ b/src/algorithms/tracking/adapters/CMakeLists.txt
@@ -25,7 +25,6 @@ set(TRACKING_ADAPTER_SOURCES
     galileo_e1_dll_pll_veml_tracking.cc
     galileo_e1_tcp_connector_tracking.cc
     gps_l1_ca_dll_fll_pll_tracking.cc
-     gps_l1_ca_dll_pll_optim_tracking.cc
     gps_l1_ca_dll_pll_tracking.cc
     gps_l1_ca_dll_pll_c_aid_tracking.cc
     gps_l1_ca_tcp_connector_tracking.cc
--- a/src/algorithms/tracking/adapters/gps_l1_ca_dll_pll_optim_tracking.cc
+++ b/src/algorithms/tracking/adapters/gps_l1_ca_dll_pll_optim_tracking.cc
@@ -1,159 +0,0 @@
-/*!
- * \file gps_l1_ca_dll_pll_optim_tracking.cc
- * \brief  Interface of an adapter of a speed optimized DLL+PLL tracking loop block
- * for GPS L1 C/A to a TrackingInterface
- * \author Javier Arribas, 2012. jarribas(at)cttc.es
- *
- * GPS L1 TRACKING MODULE OPTIMIZED FOR SPEED:
- * - Code Doppler is not compensated in the local replica
- * Code DLL + carrier PLL according to the algorithms described in:
- * K.Borre, D.M.Akos, N.Bertelsen, P.Rinder, and S.H.Jensen,
- * A Software-Defined GPS and Galileo Receiver. A Single-Frequency
- * Approach, Birkha user, 2007
- *
- * -------------------------------------------------------------------------
- *
- * Copyright (C) 2010-2015  (see AUTHORS file for a list of contributors)
- *
- * GNSS-SDR is a software defined Global Navigation
- *          Satellite Systems receiver
- *
- * This file is part of GNSS-SDR.
- *
- * GNSS-SDR is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * GNSS-SDR is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
- *
- * -------------------------------------------------------------------------
- */
-
-#include "gps_l1_ca_dll_pll_optim_tracking.h"
-#include <cmath>
-#include <glog/logging.h>
-#include "GPS_L1_CA.h"
-#include "configuration_interface.h"
-
-using google::LogMessage;
-
-GpsL1CaDllPllOptimTracking::GpsL1CaDllPllOptimTracking(
-        ConfigurationInterface* configuration, std::string role,
-        unsigned int in_streams, unsigned int out_streams,
-        boost::shared_ptr<gr::msg_queue> queue) :
-        role_(role), in_streams_(in_streams), out_streams_(out_streams),
-        queue_(queue)
-{
-    DLOG(INFO) << "role " << role;
-    //################# CONFIGURATION PARAMETERS ########################
-    int fs_in;
-    int vector_length;
-    int f_if;
-    bool dump;
-    std::string dump_filename;
-    std::string item_type;
-    std::string default_item_type = "gr_complex";
-    float pll_bw_hz;
-    float dll_bw_hz;
-    float early_late_space_chips;
-    item_type = configuration->property(role + ".item_type",default_item_type);
-    //vector_length = configuration->property(role + ".vector_length", 2048);
-    fs_in = configuration->property("GNSS-SDR.internal_fs_hz", 2048000);
-    f_if = configuration->property(role + ".if", 0);
-    dump = configuration->property(role + ".dump", false);
-    pll_bw_hz = configuration->property(role + ".pll_bw_hz", 50.0);
-    dll_bw_hz = configuration->property(role + ".dll_bw_hz", 2.0);
-    early_late_space_chips = configuration->property(role + ".early_late_space_chips", 0.5);
-    std::string default_dump_filename = "./track_ch";
-    dump_filename = configuration->property(role + ".dump_filename", default_dump_filename); //unused!
-    vector_length = round(fs_in / (GPS_L1_CA_CODE_RATE_HZ / GPS_L1_CA_CODE_LENGTH_CHIPS));
-
-    //################# MAKE TRACKING GNURadio object ###################
-    if (item_type.compare("gr_complex") == 0)
-        {
-            item_size_ = sizeof(gr_complex);
-            tracking_ = gps_l1_ca_dll_pll_make_optim_tracking_cc(
-                    f_if,
-                    fs_in,
-                    vector_length,
-                    queue_,
-                    dump,
-                    dump_filename,
-                    pll_bw_hz,
-                    dll_bw_hz,
-                    early_late_space_chips);
-        }
-    else
-        {
-            item_size_ = sizeof(gr_complex);
-            LOG(WARNING) << item_type << " unknown tracking item type.";
-        }
-
-    channel_ = 0;
-    channel_internal_queue_ = 0;
-    DLOG(INFO) << "tracking(" << tracking_->unique_id() << ")";
-}
-
-
-GpsL1CaDllPllOptimTracking::~GpsL1CaDllPllOptimTracking()
-{}
-
-
-void GpsL1CaDllPllOptimTracking::start_tracking()
-{
-    tracking_->start_tracking();
-}
-
-/*
- * Set tracking channel unique ID
- */
-void GpsL1CaDllPllOptimTracking::set_channel(unsigned int channel)
-{
-    channel_ = channel;
-    tracking_->set_channel(channel);
-}
-
-/*
- * Set tracking channel internal queue
- */
-void GpsL1CaDllPllOptimTracking::set_channel_queue(
-        concurrent_queue<int> *channel_internal_queue)
-{
-    channel_internal_queue_ = channel_internal_queue;
-    tracking_->set_channel_queue(channel_internal_queue_);
-}
-
-void GpsL1CaDllPllOptimTracking::set_gnss_synchro(Gnss_Synchro* p_gnss_synchro)
-{
-    tracking_->set_gnss_synchro(p_gnss_synchro);
-}
-
-void GpsL1CaDllPllOptimTracking::connect(gr::top_block_sptr top_block)
-{
-	if(top_block) { /* top_block is not null */};
-	//nothing to connect, now the tracking uses gr_sync_decimator
-}
-
-void GpsL1CaDllPllOptimTracking::disconnect(gr::top_block_sptr top_block)
-{
-	if(top_block) { /* top_block is not null */};
-	//nothing to disconnect, now the tracking uses gr_sync_decimator
-}
-
-gr::basic_block_sptr GpsL1CaDllPllOptimTracking::get_left_block()
-{
-    return tracking_;
-}
-
-gr::basic_block_sptr GpsL1CaDllPllOptimTracking::get_right_block()
-{
-    return tracking_;
-}
-
--- a/src/algorithms/tracking/adapters/gps_l1_ca_dll_pll_optim_tracking.h
+++ b/src/algorithms/tracking/adapters/gps_l1_ca_dll_pll_optim_tracking.h
@@ -1,119 +0,0 @@
-/*!
- * \file gps_l1_ca_dll_pll_optim_tracking.h
- * \brief  Interface of an adapter of a speed optimized DLL+PLL tracking loop block
- * for GPS L1 C/A to a TrackingInterface
- * \author Javier Arribas, 2012. jarribas(at)cttc.es
- *
- * GPS L1 TRACKING MODULE OPTIMIZED FOR SPEED:
- * - Code Doppler is not compensated in the local replica
- * Code DLL + carrier PLL according to the algorithms described in:
- * K.Borre, D.M.Akos, N.Bertelsen, P.Rinder, and S.H.Jensen,
- * A Software-Defined GPS and Galileo Receiver. A Single-Frequency
- * Approach, Birkha user, 2007
- *
- * -------------------------------------------------------------------------
- *
- * Copyright (C) 2010-2015  (see AUTHORS file for a list of contributors)
- *
- * GNSS-SDR is a software defined Global Navigation
- *          Satellite Systems receiver
- *
- * This file is part of GNSS-SDR.
- *
- * GNSS-SDR is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * GNSS-SDR is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
- *
- * -------------------------------------------------------------------------
- */
-
-#ifndef GNSS_SDR_GPS_L1_CA_DLL_PLL_OPTIM_TRACKING_H_
-#define GNSS_SDR_GPS_L1_CA_DLL_PLL_OPTIM_TRACKING_H_
-
-#include <string>
-#include <gnuradio/msg_queue.h>
-#include "tracking_interface.h"
-#include "gps_l1_ca_dll_pll_optim_tracking_cc.h"
-
-
-class ConfigurationInterface;
-
-/*!
- * \brief This class implements a code DLL + carrier PLL tracking loop
- */
-class GpsL1CaDllPllOptimTracking : public TrackingInterface
-{
-
-public:
-
-  GpsL1CaDllPllOptimTracking(ConfigurationInterface* configuration,
-            std::string role,
-            unsigned int in_streams,
-            unsigned int out_streams,
-            boost::shared_ptr<gr::msg_queue> queue);
-
-    virtual ~GpsL1CaDllPllOptimTracking();
-
-    std::string role()
-    {
-        return role_;
-    }
-
-    //! Returns "GPS_L1_CA_DLL_PLL_Optim_Tracking"
-    std::string implementation()
-    {
-        return "GPS_L1_CA_DLL_PLL_Optim_Tracking";
-    }
-    size_t item_size()
-    {
-        return item_size_;
-    }
-
-    void connect(gr::top_block_sptr top_block);
-    void disconnect(gr::top_block_sptr top_block);
-    gr::basic_block_sptr get_left_block();
-    gr::basic_block_sptr get_right_block();
-
-
-    /*!
-     * \brief Set tracking channel unique ID
-     */
-    void set_channel(unsigned int channel);
-
-    /*!
-     * \brief Set acquisition/tracking common Gnss_Synchro object pointer
-     * to efficiently exchange synchronization data between acquisition and tracking blocks
-     */
-    void set_gnss_synchro(Gnss_Synchro* p_gnss_synchro);
-
-    /*!
-     * \brief Set tracking channel internal queue
-     */
-    void set_channel_queue(concurrent_queue<int> *channel_internal_queue);
-
-    void start_tracking();
-
-private:
-
-    gps_l1_ca_dll_pll_optim_tracking_cc_sptr tracking_;
-    size_t item_size_;
-
-    unsigned int channel_;
-
-    std::string role_;
-    unsigned int in_streams_;
-    unsigned int out_streams_;
-    boost::shared_ptr<gr::msg_queue> queue_;
-    concurrent_queue<int> *channel_internal_queue_;
-};
-
-#endif // GNSS_SDR_GPS_L1_CA_DLL_PLL_OPTIM_TRACKING_H_
--- a/src/algorithms/tracking/gnuradio_blocks/CMakeLists.txt
+++ b/src/algorithms/tracking/gnuradio_blocks/CMakeLists.txt
@@ -26,7 +26,6 @@ set(TRACKING_GR_BLOCKS_SOURCES
     galileo_e1_dll_pll_veml_tracking_cc.cc
     galileo_e1_tcp_connector_tracking_cc.cc
     gps_l1_ca_dll_fll_pll_tracking_cc.cc
-     gps_l1_ca_dll_pll_optim_tracking_cc.cc
     gps_l1_ca_dll_pll_tracking_cc.cc
     gps_l1_ca_tcp_connector_tracking_cc.cc
     galileo_e5a_dll_pll_tracking_cc.cc
--- a/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_optim_tracking_cc.cc
+++ b/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_optim_tracking_cc.cc
@@ -1,654 +0,0 @@
-/*!
- * \file gps_l1_ca_dll_pll_optim_tracking_cc.cc
- * \brief Implementation of a code DLL + carrier PLL tracking block
- * \author Javier Arribas, 2012. jarribas(at)cttc.es
- * GPS L1 TRACKING MODULE OPTIMIZED FOR SPEED:
- * - Code Doppler is not compensated in the local replica
- * Code DLL + carrier PLL according to the algorithms described in:
- * [1] K.Borre, D.M.Akos, N.Bertelsen, P.Rinder, and S.H.Jensen,
- * A Software-Defined GPS and Galileo Receiver. A Single-Frequency
- * Approach, Birkha user, 2007
- *
- * -------------------------------------------------------------------------
- *
- * Copyright (C) 2010-2015  (see AUTHORS file for a list of contributors)
- *
- * GNSS-SDR is a software defined Global Navigation
- *          Satellite Systems receiver
- *
- * This file is part of GNSS-SDR.
- *
- * GNSS-SDR is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * GNSS-SDR is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
- *
- * -------------------------------------------------------------------------
- */
-
-#include "gps_l1_ca_dll_pll_optim_tracking_cc.h"
-#include <iostream>
-#include <memory>
-#include <sstream>
-#include <boost/lexical_cast.hpp>
-#include <gnuradio/io_signature.h>
-#include <glog/logging.h>
-#include <volk/volk.h>
-#include "gps_sdr_signal_processing.h"
-#include "tracking_discriminators.h"
-#include "lock_detectors.h"
-#include "GPS_L1_CA.h"
-#include "nco_lib.h"
-#include "control_message_factory.h"
-
-
-
-/*!
- * \todo Include in definition header file
- */
-#define CN0_ESTIMATION_SAMPLES 20
-#define MINIMUM_VALID_CN0 25
-#define MAXIMUM_LOCK_FAIL_COUNTER 50
-#define CARRIER_LOCK_THRESHOLD 0.85
-
-using google::LogMessage;
-
-gps_l1_ca_dll_pll_optim_tracking_cc_sptr
-gps_l1_ca_dll_pll_make_optim_tracking_cc(
-        long if_freq,
-        long fs_in,
-        unsigned int vector_length,
-        boost::shared_ptr<gr::msg_queue> queue,
-        bool dump,
-        std::string dump_filename,
-        float pll_bw_hz,
-        float dll_bw_hz,
-        float early_late_space_chips)
-{
-    return gps_l1_ca_dll_pll_optim_tracking_cc_sptr(new Gps_L1_Ca_Dll_Pll_Optim_Tracking_cc(if_freq,
-            fs_in, vector_length, queue, dump, dump_filename, pll_bw_hz, dll_bw_hz, early_late_space_chips));
-}
-
-
-
-void Gps_L1_Ca_Dll_Pll_Optim_Tracking_cc::forecast (int noutput_items,
-        gr_vector_int &ninput_items_required)
-{
-    if (noutput_items != 0)
-        {
-            ninput_items_required[0] = d_gnuradio_forecast_samples; //set the required available samples in each call
-        }
-}
-
-
-
-Gps_L1_Ca_Dll_Pll_Optim_Tracking_cc::Gps_L1_Ca_Dll_Pll_Optim_Tracking_cc(
-        long if_freq,
-        long fs_in,
-        unsigned int vector_length,
-        boost::shared_ptr<gr::msg_queue> queue,
-        bool dump,
-        std::string dump_filename,
-        float pll_bw_hz,
-        float dll_bw_hz,
-        float early_late_space_chips) :
-        gr::block("Gps_L1_Ca_Dll_Pll_Tracking_cc",
-                  gr::io_signature::make(1, 1, sizeof(gr_complex)),
-                  gr::io_signature::make(1, 1, sizeof(Gnss_Synchro)))
-{
-    // initialize internal vars
-    d_queue = queue;
-    d_dump = dump;
-    d_if_freq = if_freq;
-    d_fs_in = fs_in;
-    d_vector_length = vector_length;
-    d_gnuradio_forecast_samples = static_cast<int>(d_vector_length) * 2;
-    d_dump_filename = dump_filename;
-
-    // Initialize tracking  ==========================================
-    d_code_loop_filter.set_DLL_BW(dll_bw_hz);
-    d_carrier_loop_filter.set_PLL_BW(pll_bw_hz);
-
-    //--- DLL variables --------------------------------------------------------
-    d_early_late_spc_chips = early_late_space_chips; // Define early-late offset (in chips)
-
-    // Initialization of local code replica
-    // Get space for a vector with the C/A code replica sampled 1x/chip
-    d_ca_code = static_cast<gr_complex*>(volk_malloc((GPS_L1_CA_CODE_LENGTH_CHIPS + 2) * sizeof(gr_complex), volk_get_alignment()));
-
-    // Get space for the resampled early / prompt / late local replicas
-    d_early_code = static_cast<gr_complex*>(volk_malloc(2 * d_vector_length * sizeof(gr_complex), volk_get_alignment()));
-    d_prompt_code = static_cast<gr_complex*>(volk_malloc(2 * d_vector_length * sizeof(gr_complex), volk_get_alignment()));
-    d_late_code = static_cast<gr_complex*>(volk_malloc(2 * d_vector_length * sizeof(gr_complex), volk_get_alignment()));
-
-    // space for carrier wipeoff and signal baseband vectors
-    d_carr_sign = static_cast<gr_complex*>(volk_malloc(2*d_vector_length * sizeof(gr_complex), volk_get_alignment()));
-
-    // correlator outputs (scalar)
-    d_Early = static_cast<gr_complex*>(volk_malloc(sizeof(gr_complex), volk_get_alignment()));
-    d_Prompt = static_cast<gr_complex*>(volk_malloc(sizeof(gr_complex), volk_get_alignment()));
-    d_Late = static_cast<gr_complex*>(volk_malloc(sizeof(gr_complex), volk_get_alignment()));
-
-    //--- Perform initializations ------------------------------
-    // define initial code frequency basis of NCO
-    d_code_freq_chips = GPS_L1_CA_CODE_RATE_HZ;
-    // define residual code phase (in chips)
-    d_rem_code_phase_samples = 0.0;
-    // define residual carrier phase
-    d_rem_carr_phase_rad = 0.0;
-
-    // sample synchronization
-    d_sample_counter = 0;
-    //d_sample_counter_seconds = 0;
-    d_acq_sample_stamp = 0;
-
-    d_enable_tracking = false;
-    d_pull_in = false;
-    d_last_seg = 0;
-
-    d_current_prn_length_samples = static_cast<int>(d_vector_length);
-
-    // CN0 estimation and lock detector buffers
-    d_cn0_estimation_counter = 0;
-    d_Prompt_buffer = new gr_complex[CN0_ESTIMATION_SAMPLES];
-    d_carrier_lock_test = 1;
-    d_CN0_SNV_dB_Hz = 0;
-    d_carrier_lock_fail_counter = 0;
-    d_carrier_lock_threshold = CARRIER_LOCK_THRESHOLD;
-
-    systemName["G"] = std::string("GPS");
-
-    d_channel_internal_queue = 0;
-    d_acquisition_gnss_synchro = 0;
-    d_channel = 0;
-    d_acq_code_phase_samples = 0.0;
-    d_acq_carrier_doppler_hz = 0.0;
-    d_carrier_doppler_hz = 0.0;
-    d_acc_carrier_phase_rad = 0.0;
-    d_code_phase_samples = 0.0;
-    d_acc_code_phase_secs = 0.0;
-}
-
-
-void Gps_L1_Ca_Dll_Pll_Optim_Tracking_cc::start_tracking()
-{
-    // correct the code phase according to the delay between acq and trk
-    d_acq_code_phase_samples = d_acquisition_gnss_synchro->Acq_delay_samples;
-    d_acq_carrier_doppler_hz = d_acquisition_gnss_synchro->Acq_doppler_hz;
-    d_acq_sample_stamp = d_acquisition_gnss_synchro->Acq_samplestamp_samples;
-
-    long int acq_trk_diff_samples;
-    double acq_trk_diff_seconds;
-    acq_trk_diff_samples = static_cast<long int>(d_sample_counter) - static_cast<long int>(d_acq_sample_stamp); //-d_vector_length;
-    LOG(INFO) << "Number of samples between Acquisition and Tracking =" << acq_trk_diff_samples;
-    acq_trk_diff_seconds = static_cast<float>(acq_trk_diff_samples) / static_cast<float>(d_fs_in);
-    //doppler effect
-    // Fd=(C/(C+Vr))*F
-    double radial_velocity;
-    radial_velocity = (GPS_L1_FREQ_HZ + d_acq_carrier_doppler_hz) / GPS_L1_FREQ_HZ;
-    // new chip and prn sequence periods based on acq Doppler
-    double T_chip_mod_seconds;
-    double T_prn_mod_seconds;
-    double T_prn_mod_samples;
-    d_code_freq_chips = radial_velocity * GPS_L1_CA_CODE_RATE_HZ;
-    T_chip_mod_seconds = 1/d_code_freq_chips;
-    T_prn_mod_seconds = T_chip_mod_seconds * GPS_L1_CA_CODE_LENGTH_CHIPS;
-    T_prn_mod_samples = T_prn_mod_seconds * static_cast<float>(d_fs_in);
-    d_current_prn_length_samples = round(T_prn_mod_samples);
-
-    double T_prn_true_seconds = GPS_L1_CA_CODE_LENGTH_CHIPS / GPS_L1_CA_CODE_RATE_HZ;
-    double T_prn_true_samples = T_prn_true_seconds * static_cast<float>(d_fs_in);
-    double T_prn_diff_seconds = T_prn_true_seconds - T_prn_mod_seconds;
-    double N_prn_diff = acq_trk_diff_seconds / T_prn_true_seconds;
-    double corrected_acq_phase_samples, delay_correction_samples;
-    corrected_acq_phase_samples = fmod((d_acq_code_phase_samples + T_prn_diff_seconds * N_prn_diff * static_cast<float>(d_fs_in)), T_prn_true_samples);
-    if (corrected_acq_phase_samples < 0)
-        {
-            corrected_acq_phase_samples = T_prn_mod_samples + corrected_acq_phase_samples;
-        }
-    delay_correction_samples = d_acq_code_phase_samples - corrected_acq_phase_samples;
-    d_acq_code_phase_samples = corrected_acq_phase_samples;
-    d_carrier_doppler_hz = d_acq_carrier_doppler_hz;
-
-    // DLL/PLL filter initialization
-    d_carrier_loop_filter.initialize(); //initialize the carrier filter
-    d_code_loop_filter.initialize();    //initialize the code filter
-
-    // generate local reference ALWAYS starting at chip 1 (1 sample per chip)
-    gps_l1_ca_code_gen_complex(&d_ca_code[1], d_acquisition_gnss_synchro->PRN, 0);
-    d_ca_code[0] = d_ca_code[static_cast<int>(GPS_L1_CA_CODE_LENGTH_CHIPS)];
-    d_ca_code[static_cast<int>(GPS_L1_CA_CODE_LENGTH_CHIPS) + 1] = d_ca_code[1];
-
-    //******************************************************************************
-    // Experimental: pre-sampled local signal replica at nominal code frequency.
-    // No code doppler correction
-    double tcode_chips;
-    int associated_chip_index;
-    int code_length_chips = static_cast<float>(GPS_L1_CA_CODE_LENGTH_CHIPS);
-    double code_phase_step_chips;
-    int early_late_spc_samples;
-    int epl_loop_length_samples;
-
-    // unified loop for E, P, L code vectors
-    code_phase_step_chips = (static_cast<double>(d_code_freq_chips)) / (static_cast<double>(d_fs_in));
-    tcode_chips = 0;
-
-    // Alternative EPL code generation (40% of speed improvement!)
-    early_late_spc_samples = round(d_early_late_spc_chips / code_phase_step_chips);
-    epl_loop_length_samples = d_current_prn_length_samples  +early_late_spc_samples*2;
-    for (int i = 0; i < epl_loop_length_samples; i++)
-        {
-            associated_chip_index = 1 + round(fmod(tcode_chips - d_early_late_spc_chips, code_length_chips));
-            d_early_code[i] = d_ca_code[associated_chip_index];
-            tcode_chips = tcode_chips + code_phase_step_chips;
-        }
-
-    memcpy(d_prompt_code, &d_early_code[early_late_spc_samples], d_current_prn_length_samples* sizeof(gr_complex));
-    memcpy(d_late_code, &d_early_code[early_late_spc_samples*2], d_current_prn_length_samples* sizeof(gr_complex));
-    //******************************************************************************
-
-    d_carrier_lock_fail_counter = 0;
-    d_rem_code_phase_samples = 0;
-    d_rem_carr_phase_rad = 0;
-    d_acc_carrier_phase_rad = 0;
-
-    d_code_phase_samples = d_acq_code_phase_samples;
-
-    std::string sys_ = &d_acquisition_gnss_synchro->System;
-    sys = sys_.substr(0,1);
-
-    // DEBUG OUTPUT
-    std::cout << "Tracking start on channel " << d_channel << " for satellite " << Gnss_Satellite(systemName[sys], d_acquisition_gnss_synchro->PRN) << std::endl;
-    LOG(INFO) << "Starting tracking of satellite " << Gnss_Satellite(systemName[sys], d_acquisition_gnss_synchro->PRN) << " on channel " << d_channel;
-
-    // enable tracking
-    d_pull_in = true;
-    d_enable_tracking = true;
-
-    LOG(INFO) << "PULL-IN Doppler [Hz]=" << d_carrier_doppler_hz
-              << " Code Phase correction [samples]=" << delay_correction_samples
-              << " PULL-IN Code Phase [samples]=" << d_acq_code_phase_samples << std::endl;
-}
-
-
-
-void Gps_L1_Ca_Dll_Pll_Optim_Tracking_cc::update_local_code()
-{
-    double tcode_chips;
-    double rem_code_phase_chips;
-    int associated_chip_index;
-    int code_length_chips = static_cast<int>(GPS_L1_CA_CODE_LENGTH_CHIPS);
-    double code_phase_step_chips;
-    int early_late_spc_samples;
-    int epl_loop_length_samples;
-
-    // unified loop for E, P, L code vectors
-    code_phase_step_chips = (static_cast<double>(d_code_freq_chips)) / (static_cast<double>(d_fs_in));
-    rem_code_phase_chips = d_rem_code_phase_samples * (d_code_freq_chips / d_fs_in);
-    tcode_chips = -rem_code_phase_chips;
-
-    //EPL code generation
-    early_late_spc_samples = round(d_early_late_spc_chips / code_phase_step_chips);
-    epl_loop_length_samples = d_current_prn_length_samples + early_late_spc_samples*2;
-    for (int i = 0; i < epl_loop_length_samples; i++)
-        {
-            associated_chip_index = 1 + round(fmod(tcode_chips - d_early_late_spc_chips, code_length_chips));
-            d_early_code[i] = d_ca_code[associated_chip_index];
-            tcode_chips = tcode_chips + code_phase_step_chips;
-        }
-
-    memcpy(d_prompt_code, &d_early_code[early_late_spc_samples], d_current_prn_length_samples* sizeof(gr_complex));
-    memcpy(d_late_code, &d_early_code[early_late_spc_samples*2], d_current_prn_length_samples* sizeof(gr_complex));
-}
-
-
-void Gps_L1_Ca_Dll_Pll_Optim_Tracking_cc::update_local_carrier()
-{
-    float phase_step_rad;
-    phase_step_rad = static_cast<float>(GPS_TWO_PI) * d_carrier_doppler_hz / static_cast<float>(d_fs_in);
-    fxp_nco(d_carr_sign, d_current_prn_length_samples, d_rem_carr_phase_rad, phase_step_rad);
-}
-
-
-Gps_L1_Ca_Dll_Pll_Optim_Tracking_cc::~Gps_L1_Ca_Dll_Pll_Optim_Tracking_cc()
-{
-    d_dump_file.close();
-
-    volk_free(d_prompt_code);
-    volk_free(d_late_code);
-    volk_free(d_early_code);
-    volk_free(d_carr_sign);
-    volk_free(d_Early);
-    volk_free(d_Prompt);
-    volk_free(d_Late);
-
-    delete[] d_ca_code;
-    delete[] d_Prompt_buffer;
-}
-
-
-// Tracking signal processing
-int Gps_L1_Ca_Dll_Pll_Optim_Tracking_cc::general_work (int noutput_items, gr_vector_int &ninput_items,
-        gr_vector_const_void_star &input_items, gr_vector_void_star &output_items)
-{
-    // stream to collect cout calls to improve thread safety
-    std::stringstream tmp_str_stream;
-    double carr_error_hz = 0.0;
-    double carr_error_filt_hz = 0.0;
-    double code_error_chips = 0.0;
-    double code_error_filt_chips = 0.0;
-
-    if (d_enable_tracking == true)
-        {
-            /*
-             * Receiver signal alignment
-             */
-            if (d_pull_in == true)
-                {
-                    int samples_offset;
-                    float acq_trk_shif_correction_samples;
-                    int acq_to_trk_delay_samples;
-                    acq_to_trk_delay_samples = d_sample_counter - d_acq_sample_stamp;
-                    acq_trk_shif_correction_samples = d_current_prn_length_samples - fmod(static_cast<float>(acq_to_trk_delay_samples), static_cast<float>(d_current_prn_length_samples));
-                    samples_offset = round(d_acq_code_phase_samples + acq_trk_shif_correction_samples);
-                    d_sample_counter = d_sample_counter + samples_offset; //count for the processed samples
-                    d_pull_in = false;
-                    consume_each(samples_offset); //shift input to perform alignment with local replica
-                    return 1;
-                }
-            // GNSS_SYNCHRO OBJECT to interchange data between tracking->telemetry_decoder
-            Gnss_Synchro current_synchro_data;
-            // Fill the acquisition data
-            current_synchro_data = *d_acquisition_gnss_synchro;
-
-            // Block input data and block output stream pointers
-            const gr_complex* in = (gr_complex*) input_items[0]; //PRN start block alignment
-            Gnss_Synchro **out = (Gnss_Synchro **) &output_items[0];
-
-            // Generate local code and carrier replicas (using \hat{f}_d(k-1))
-            //update_local_code(); //disabled in the speed optimized tracking!
-            update_local_carrier();
-
-            // perform Early, Prompt and Late correlation
-#if USING_VOLK_CW_EPL_CORR_CUSTOM
-            d_correlator.Carrier_wipeoff_and_EPL_volk_custom(d_current_prn_length_samples,
-                    in,
-                    d_carr_sign,
-                    d_early_code,
-                    d_prompt_code,
-                    d_late_code,
-                    d_Early,
-                    d_Prompt,
-                    d_Late);
-#else
-            d_correlator.Carrier_wipeoff_and_EPL_volk(d_current_prn_length_samples,
-                    in,
-                    d_carr_sign,
-                    d_early_code,
-                    d_prompt_code,
-                    d_late_code,
-                    d_Early,
-                    d_Prompt,
-                    d_Late);
-#endif
-            // ################## PLL ##########################################################
-            // PLL discriminator
-            carr_error_hz = pll_cloop_two_quadrant_atan(*d_Prompt) / GPS_TWO_PI;
-            // Carrier discriminator filter
-            carr_error_filt_hz = d_carrier_loop_filter.get_carrier_nco(carr_error_hz);
-            // New carrier Doppler frequency estimation
-            d_carrier_doppler_hz = d_acq_carrier_doppler_hz + carr_error_filt_hz;
-            // New code Doppler frequency estimation
-            d_code_freq_chips = GPS_L1_CA_CODE_RATE_HZ + ((d_carrier_doppler_hz * GPS_L1_CA_CODE_RATE_HZ) / GPS_L1_FREQ_HZ);
-            //carrier phase accumulator for (K) doppler estimation
-            d_acc_carrier_phase_rad -= GPS_TWO_PI * d_carrier_doppler_hz * GPS_L1_CA_CODE_PERIOD;
-            //remnant carrier phase to prevent overflow in the code NCO
-            d_rem_carr_phase_rad = d_rem_carr_phase_rad + GPS_TWO_PI * d_carrier_doppler_hz * GPS_L1_CA_CODE_PERIOD;
-            d_rem_carr_phase_rad = fmod(d_rem_carr_phase_rad, GPS_TWO_PI);
-
-            // ################## DLL ##########################################################
-            // DLL discriminator
-            code_error_chips = dll_nc_e_minus_l_normalized(*d_Early, *d_Late); //[chips/Ti]
-            // Code discriminator filter
-            code_error_filt_chips = d_code_loop_filter.get_code_nco(code_error_chips); //[chips/second]
-            //Code phase accumulator
-            double code_error_filt_secs;
-            code_error_filt_secs = (GPS_L1_CA_CODE_PERIOD * code_error_filt_chips) / GPS_L1_CA_CODE_RATE_HZ; //[seconds]
-            d_acc_code_phase_secs = d_acc_code_phase_secs + code_error_filt_secs;
-
-            // ################## CARRIER AND CODE NCO BUFFER ALIGNEMENT #######################
-            // keep alignment parameters for the next input buffer
-            double T_chip_seconds;
-            double T_prn_seconds;
-            double T_prn_samples;
-            double K_blk_samples;
-            // Compute the next buffer length based in the new period of the PRN sequence and the code phase error estimation
-            T_chip_seconds = 1.0 / d_code_freq_chips;
-            T_prn_seconds = T_chip_seconds * GPS_L1_CA_CODE_LENGTH_CHIPS;
-            T_prn_samples = T_prn_seconds * static_cast<double>(d_fs_in);
-            K_blk_samples = T_prn_samples + d_rem_code_phase_samples + code_error_filt_secs * static_cast<double>(d_fs_in);
-            d_current_prn_length_samples = round(K_blk_samples); //round to a discrete samples
-            //d_rem_code_phase_samples = K_blk_samples - d_current_prn_length_samples; //rounding error < 1 sample
-
-            // ####### CN0 ESTIMATION AND LOCK DETECTORS ######
-            if (d_cn0_estimation_counter < CN0_ESTIMATION_SAMPLES)
-                {
-                    // fill buffer with prompt correlator output values
-                    d_Prompt_buffer[d_cn0_estimation_counter] = *d_Prompt;
-                    d_cn0_estimation_counter++;
-                }
-            else
-                {
-                    d_cn0_estimation_counter = 0;
-                    // Code lock indicator
-                    d_CN0_SNV_dB_Hz = cn0_svn_estimator(d_Prompt_buffer, CN0_ESTIMATION_SAMPLES, d_fs_in, GPS_L1_CA_CODE_LENGTH_CHIPS);
-                    // Carrier lock indicator
-                    d_carrier_lock_test = carrier_lock_detector(d_Prompt_buffer, CN0_ESTIMATION_SAMPLES);
-                    // Loss of lock detection
-                    if (d_carrier_lock_test < d_carrier_lock_threshold or d_CN0_SNV_dB_Hz < MINIMUM_VALID_CN0)
-                        {
-                            d_carrier_lock_fail_counter++;
-                        }
-                    else
-                        {
-                            if (d_carrier_lock_fail_counter > 0) d_carrier_lock_fail_counter--;
-                        }
-                    if (d_carrier_lock_fail_counter > MAXIMUM_LOCK_FAIL_COUNTER)
-                        {
-                            std::cout << "Loss of lock in channel " << d_channel << "!" << std::endl;
-                            LOG(INFO) << "Loss of lock in channel " << d_channel << "!";
-                            std::unique_ptr<ControlMessageFactory> cmf(new ControlMessageFactory());
-                            if (d_queue != gr::msg_queue::sptr())
-                                {
-                                    d_queue->handle(cmf->GetQueueMessage(d_channel, 2));
-                                }
-                            d_carrier_lock_fail_counter = 0;
-                            d_enable_tracking = false; // TODO: check if disabling tracking is consistent with the channel state machine
-                        }
-                }
-            // ########### Output the tracking data to navigation and PVT ##########
-            current_synchro_data.Prompt_I = static_cast<double>((*d_Prompt).real());
-            current_synchro_data.Prompt_Q = static_cast<double>((*d_Prompt).imag());
-            // Tracking_timestamp_secs is aligned with the PRN start sample
-            //current_synchro_data.Tracking_timestamp_secs = ((double)d_sample_counter + (double)d_current_prn_length_samples + (double)d_rem_code_phase_samples) / static_cast<double>(d_fs_in);
-            current_synchro_data.Tracking_timestamp_secs = (static_cast<double>(d_sample_counter) + static_cast<double>(d_rem_code_phase_samples)) / static_cast<double>(d_fs_in);
-            d_rem_code_phase_samples = K_blk_samples - d_current_prn_length_samples; //rounding error < 1 sample
-            // This tracking block aligns the Tracking_timestamp_secs with the start sample of the PRN, thus, Code_phase_secs=0
-            current_synchro_data.Code_phase_secs = 0;
-            current_synchro_data.Carrier_phase_rads = static_cast<double>(d_acc_carrier_phase_rad);
-            current_synchro_data.Carrier_Doppler_hz = static_cast<double>(d_carrier_doppler_hz);
-            current_synchro_data.CN0_dB_hz = static_cast<double>(d_CN0_SNV_dB_Hz);
-            current_synchro_data.Flag_valid_pseudorange = false;
-            *out[0] = current_synchro_data;
-
-            // ########## DEBUG OUTPUT
-            /*!
-             *  \todo The stop timer has to be moved to the signal source!
-             */
-            if (floor(d_sample_counter / d_fs_in) != d_last_seg)
-                {
-                    d_last_seg = floor(d_sample_counter / d_fs_in);
-
-                    if (d_channel == 0)
-                        {
-                            // debug: Second counter in channel 0
-                            tmp_str_stream << "Current input signal time = " << d_last_seg << " [s]" << std::endl;
-                            std::cout << tmp_str_stream.rdbuf();
-                            LOG(INFO) << tmp_str_stream.rdbuf();
-                        }
-
-                    tmp_str_stream << "Tracking CH " << d_channel <<  ": Satellite " << Gnss_Satellite(systemName[sys], d_acquisition_gnss_synchro->PRN)
-                                   << ", Doppler=" << d_carrier_doppler_hz << " [Hz] CN0 = " << d_CN0_SNV_dB_Hz << " [dB-Hz]" << std::endl;
-                    //std::cout << tmp_str_stream.rdbuf() << std::flush;
-                    LOG(INFO) << tmp_str_stream.rdbuf();
-                    //if (d_channel == 0 || d_last_seg==5) d_carrier_lock_fail_counter=500; //DEBUG: force unlock!
-                }
-        }
-    else
-        {
-            // ########## DEBUG OUTPUT (TIME ONLY for channel 0 when tracking is disabled)
-            /*!
-             *  \todo The stop timer has to be moved to the signal source!
-             */
-            // stream to collect cout calls to improve thread safety
-            std::stringstream tmp_str_stream;
-            if (floor(d_sample_counter / d_fs_in) != d_last_seg)
-                {
-                    d_last_seg = floor(d_sample_counter / d_fs_in);
-
-                    if (d_channel == 0)
-                        {
-                            // debug: Second counter in channel 0
-                            tmp_str_stream << "Current input signal time = " << d_last_seg << " [s]" << std::endl << std::flush;
-                            std::cout << tmp_str_stream.rdbuf() << std::flush;
-                        }
-                }
-            *d_Early = gr_complex(0,0);
-            *d_Prompt = gr_complex(0,0);
-            *d_Late = gr_complex(0,0);
-            Gnss_Synchro **out = (Gnss_Synchro **) &output_items[0]; //block output streams pointer
-            // GNSS_SYNCHRO OBJECT to interchange data between tracking->telemetry_decoder
-            d_acquisition_gnss_synchro->Flag_valid_pseudorange = false;
-            *out[0] = *d_acquisition_gnss_synchro;
-        }
-
-    if(d_dump)
-        {
-            // MULTIPLEXED FILE RECORDING - Record results to file
-            float prompt_I;
-            float prompt_Q;
-            float tmp_E, tmp_P, tmp_L;
-            float tmp_float;
-            double tmp_double;
-            prompt_I = (*d_Prompt).real();
-            prompt_Q = (*d_Prompt).imag();
-            tmp_E = std::abs<float>(*d_Early);
-            tmp_P = std::abs<float>(*d_Prompt);
-            tmp_L = std::abs<float>(*d_Late);
-            try
-            {
-                    // EPR
-                    d_dump_file.write((char*)&tmp_E, sizeof(float));
-                    d_dump_file.write((char*)&tmp_P, sizeof(float));
-                    d_dump_file.write((char*)&tmp_L, sizeof(float));
-                    // PROMPT I and Q (to analyze navigation symbols)
-                    d_dump_file.write((char*)&prompt_I, sizeof(float));
-                    d_dump_file.write((char*)&prompt_Q, sizeof(float));
-                    // PRN start sample stamp
-                    //tmp_float=(float)d_sample_counter;
-                    d_dump_file.write((char*)&d_sample_counter, sizeof(unsigned long int));
-                    // accumulated carrier phase
-                    tmp_float = d_acc_carrier_phase_rad;
-                    d_dump_file.write((char*)&tmp_float, sizeof(float));
-
-                    // carrier and code frequency
-                    tmp_float = d_carrier_doppler_hz;
-                    d_dump_file.write((char*)&tmp_float, sizeof(float));
-                    tmp_float = d_code_freq_chips;
-                    d_dump_file.write((char*)&tmp_float, sizeof(float));
-
-                    //PLL commands
-                    tmp_float = carr_error_hz;
-                    d_dump_file.write((char*)&tmp_float, sizeof(float));
-                    tmp_float = carr_error_filt_hz;
-                    d_dump_file.write((char*)&tmp_float, sizeof(float));
-
-                    //DLL commands
-                    tmp_float = code_error_chips;
-                    d_dump_file.write((char*)&tmp_float, sizeof(float));
-                    tmp_float = code_error_filt_chips;
-                    d_dump_file.write((char*)&tmp_float, sizeof(float));
-
-                    // CN0 and carrier lock test
-                    tmp_float = d_CN0_SNV_dB_Hz;
-                    d_dump_file.write((char*)&tmp_float, sizeof(float));
-                    tmp_float = d_carrier_lock_test;
-                    d_dump_file.write((char*)&tmp_float, sizeof(float));
-
-                    // AUX vars (for debug purposes)
-                    tmp_float = d_rem_code_phase_samples;
-                    d_dump_file.write((char*)&tmp_float, sizeof(float));
-                    tmp_double = (double)(d_sample_counter + d_current_prn_length_samples);
-                    d_dump_file.write((char*)&tmp_double, sizeof(double));
-            }
-            catch (std::ifstream::failure& e)
-            {
-                    LOG(WARNING) << "Exception writing trk dump file " << e.what();
-            }
-        }
-
-    consume_each(d_current_prn_length_samples); // this is necesary in gr_block derivates
-    d_sample_counter += d_current_prn_length_samples; //count for the processed samples
-    if((noutput_items == 0) || (ninput_items[0] == 0))
-        {
-            LOG(WARNING) << "noutput_items = 0";
-        }
-    return 1; //output tracking result ALWAYS even in the case of d_enable_tracking==false
-}
-
-
-
-void Gps_L1_Ca_Dll_Pll_Optim_Tracking_cc::set_channel(unsigned int channel)
-{
-    d_channel = channel;
-    LOG(INFO) << "Tracking Channel set to " << d_channel;
-    // ############# ENABLE DATA FILE LOG #################
-    if (d_dump == true)
-        {
-            if (d_dump_file.is_open() == false)
-                {
-                    try
-                    {
-                            d_dump_filename.append(boost::lexical_cast<std::string>(d_channel));
-                            d_dump_filename.append(".dat");
-                            d_dump_file.exceptions (std::ifstream::failbit | std::ifstream::badbit);
-                            d_dump_file.open(d_dump_filename.c_str(), std::ios::out | std::ios::binary);
-                            LOG(INFO) << "Tracking dump enabled on channel " << d_channel << " Log file: " << d_dump_filename.c_str();
-                    }
-                    catch (std::ifstream::failure& e)
-                    {
-                            LOG(WARNING) << "channel " << d_channel << " Exception opening trk dump file " << e.what();
-                    }
-                }
-        }
-}
-
-
-void Gps_L1_Ca_Dll_Pll_Optim_Tracking_cc::set_channel_queue(concurrent_queue<int> *channel_internal_queue)
-{
-    d_channel_internal_queue = channel_internal_queue;
-}
-
-void Gps_L1_Ca_Dll_Pll_Optim_Tracking_cc::set_gnss_synchro(Gnss_Synchro* p_gnss_synchro)
-{
-    d_acquisition_gnss_synchro = p_gnss_synchro;
-
-}
--- a/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_optim_tracking_cc.h
+++ b/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_optim_tracking_cc.h
@@ -1,181 +0,0 @@
-/*!
- * \file gps_l1_ca_dll_pll_optim_tracking_cc.h
- * \brief Implementation of a code DLL + carrier PLL tracking block
- * \author Javier Arribas, 2012. jarribas(at)cttc.es
- * GPS L1 TRACKING MODULE OPTIMIZED FOR SPEED:
- * - Code Doppler is not compensated in the local replica
- * Code DLL + carrier PLL according to the algorithms described in:
- * [1] K.Borre, D.M.Akos, N.Bertelsen, P.Rinder, and S.H.Jensen,
- * A Software-Defined GPS and Galileo Receiver. A Single-Frequency
- * Approach, Birkha user, 2007
- *
- * -------------------------------------------------------------------------
- *
- * Copyright (C) 2010-2015  (see AUTHORS file for a list of contributors)
- *
- * GNSS-SDR is a software defined Global Navigation
- *          Satellite Systems receiver
- *
- * This file is part of GNSS-SDR.
- *
- * GNSS-SDR is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * GNSS-SDR is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
- *
- * -------------------------------------------------------------------------
- */
-
-#ifndef GNSS_SDR_GPS_L1_CA_DLL_PLL_OPTIM_TRACKING_CC_H
-#define GNSS_SDR_GPS_L1_CA_DLL_PLL_OPTIM_TRACKING_CC_H
-
-#include <fstream>
-#include <map>
-#include <string>
-#include <gnuradio/block.h>
-#include <gnuradio/msg_queue.h>
-#include "concurrent_queue.h"
-#include "gnss_synchro.h"
-#include "tracking_2nd_DLL_filter.h"
-#include "tracking_2nd_PLL_filter.h"
-#include "correlator.h"
-
-class Gps_L1_Ca_Dll_Pll_Optim_Tracking_cc;
-
-typedef boost::shared_ptr<Gps_L1_Ca_Dll_Pll_Optim_Tracking_cc> gps_l1_ca_dll_pll_optim_tracking_cc_sptr;
-
-gps_l1_ca_dll_pll_optim_tracking_cc_sptr gps_l1_ca_dll_pll_make_optim_tracking_cc(long if_freq,
-                                                                                  long fs_in,
-                                                                                  unsigned int vector_length,
-                                                                                  boost::shared_ptr<gr::msg_queue> queue,
-                                                                                  bool dump,
-                                                                                  std::string dump_filename,
-                                                                                  float pll_bw_hz,
-                                                                                  float dll_bw_hz,
-                                                                                  float early_late_space_chips);
-
-
-/*!
- * \brief This class implements a DLL + PLL tracking loop block
- */
-class Gps_L1_Ca_Dll_Pll_Optim_Tracking_cc: public gr::block
-{
-public:
-    ~Gps_L1_Ca_Dll_Pll_Optim_Tracking_cc();
-
-    void set_channel(unsigned int channel);
-    void set_gnss_synchro(Gnss_Synchro* p_gnss_synchro);
-    void start_tracking();
-    void set_channel_queue(concurrent_queue<int> *channel_internal_queue);
-
-    int general_work (int noutput_items, gr_vector_int &ninput_items,
-            gr_vector_const_void_star &input_items, gr_vector_void_star &output_items);
-
-    void forecast (int noutput_items, gr_vector_int &ninput_items_required);
-private:
-    friend gps_l1_ca_dll_pll_optim_tracking_cc_sptr
-    gps_l1_ca_dll_pll_make_optim_tracking_cc(long if_freq,
-            long fs_in, unsigned
-            int vector_length,
-            boost::shared_ptr<gr::msg_queue> queue,
-            bool dump,
-            std::string dump_filename,
-            float pll_bw_hz,
-            float dll_bw_hz,
-            float early_late_space_chips);
-
-    Gps_L1_Ca_Dll_Pll_Optim_Tracking_cc(long if_freq,
-            long fs_in, unsigned
-            int vector_length,
-            boost::shared_ptr<gr::msg_queue> queue,
-            bool dump,
-            std::string dump_filename,
-            float pll_bw_hz,
-            float dll_bw_hz,
-            float early_late_space_chips);
-    void update_local_code();
-    void update_local_carrier();
-
-    // tracking configuration vars
-    boost::shared_ptr<gr::msg_queue> d_queue;
-    concurrent_queue<int> *d_channel_internal_queue;
-    unsigned int d_vector_length;
-    bool d_dump;
-
-    Gnss_Synchro* d_acquisition_gnss_synchro;
-    unsigned int d_channel;
-    int d_last_seg;
-    long d_if_freq;
-    long d_fs_in;
-
-    double d_early_late_spc_chips;
-
-    gr_complex* d_ca_code;
-
-    gr_complex* d_early_code;
-    gr_complex* d_late_code;
-    gr_complex* d_prompt_code;
-    gr_complex* d_carr_sign;
-
-    gr_complex *d_Early;
-    gr_complex *d_Prompt;
-    gr_complex *d_Late;
-
-    // remaining code phase and carrier phase between tracking loops
-    double d_rem_code_phase_samples;
-    double d_rem_carr_phase_rad;
-
-    // PLL and DLL filter library
-    Tracking_2nd_DLL_filter d_code_loop_filter;
-    Tracking_2nd_PLL_filter d_carrier_loop_filter;
-
-    // acquisition
-    double d_acq_code_phase_samples;
-    double d_acq_carrier_doppler_hz;
-    // correlator
-    Correlator d_correlator;
-
-    // tracking vars
-    double d_code_freq_chips;
-    double d_carrier_doppler_hz;
-    double d_acc_carrier_phase_rad;
-    double d_code_phase_samples;
-    double d_acc_code_phase_secs;
-
-    //PRN period in samples
-    int d_current_prn_length_samples;
-
-    //processing samples counters
-    unsigned long int d_sample_counter;
-    unsigned long int d_acq_sample_stamp;
-
-    // CN0 estimation and lock detector
-    int d_cn0_estimation_counter;
-    gr_complex* d_Prompt_buffer;
-    double d_carrier_lock_test;
-    double d_CN0_SNV_dB_Hz;
-    double d_carrier_lock_threshold;
-    int d_carrier_lock_fail_counter;
-
-    // control vars
-    int d_gnuradio_forecast_samples;
-    bool d_enable_tracking;
-    bool d_pull_in;
-
-    // file dump
-    std::string d_dump_filename;
-    std::ofstream d_dump_file;
-
-    std::map<std::string, std::string> systemName;
-    std::string sys;
-};
-
-#endif //GNSS_SDR_GPS_L1_CA_DLL_PLL_OPTIM_TRACKING_CC_H
--- a/src/core/receiver/gnss_block_factory.cc
+++ b/src/core/receiver/gnss_block_factory.cc
@@ -80,7 +80,6 @@
 #include "galileo_e5a_noncoherent_iq_acquisition_caf.h"
 #include "gps_l1_ca_dll_pll_tracking.h"
 #include "gps_l1_ca_dll_pll_c_aid_tracking.h"
-#include "gps_l1_ca_dll_pll_optim_tracking.h"
 #include "gps_l1_ca_dll_fll_pll_tracking.h"
 #include "gps_l1_ca_tcp_connector_tracking.h"
 #include "galileo_e1_dll_pll_veml_tracking.h"
@@ -1324,12 +1323,6 @@ std::unique_ptr<GNSSBlockInterface> GNSSBlockFactory::GetBlock(
                    out_streams, queue));
            block = std::move(block_);
        }
-    else if (implementation.compare("GPS_L1_CA_DLL_PLL_Optim_Tracking") == 0)
-        {
-            std::unique_ptr<GNSSBlockInterface> block_(new GpsL1CaDllPllOptimTracking(configuration.get(), role, in_streams,
-                    out_streams, queue));
-            block = std::move(block_);
-        }
    else if (implementation.compare("GPS_L1_CA_DLL_FLL_PLL_Tracking") == 0)
        {
            std::unique_ptr<GNSSBlockInterface> block_(new GpsL1CaDllFllPllTracking(configuration.get(), role, in_streams,
@@ -1589,12 +1582,6 @@ std::unique_ptr<TrackingInterface> GNSSBlockFactory::GetTrkBlock(
                    out_streams, queue));
            block = std::move(block_);
        }
-    else if (implementation.compare("GPS_L1_CA_DLL_PLL_Optim_Tracking") == 0)
-        {
-            std::unique_ptr<TrackingInterface> block_(new GpsL1CaDllPllOptimTracking(configuration.get(), role, in_streams,
-                    out_streams, queue));
-            block = std::move(block_);
-        }
    else if (implementation.compare("GPS_L1_CA_DLL_FLL_PLL_Tracking") == 0)
        {
            std::unique_ptr<TrackingInterface> block_(new GpsL1CaDllFllPllTracking(configuration.get(), role, in_streams,