Added cmake version check. Deleted original volk

Added cmake version checks. Deleted original volk protokernels. Fixes.
2025-10-31 15:23:04 +00:00 · 2014-10-27 05:19:26 +01:00
parent 4128b059b1
commit db738c7ea9
16 changed files with 56 additions and 2393 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -205,6 +205,14 @@ set(CMAKE_BUILD_TYPE ${CMAKE_BUILD_TYPE} CACHE STRING "")
 # Append -O2 optimization flag for Debug builds
 set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O2")
 ################################################################################
 # Check CMake version
 ################################################################################
 # Stop the configuration when the running CMake is older than 2.8.8:
 # an explanatory status line is printed first, then a fatal error aborts.
 if(CMAKE_VERSION VERSION_LESS 2.8.8)
   message(STATUS "Your CMake version is too old and does not support some features required by GNSS-SDR. CMake version must be at least 2.8.8. For more information check https://github.com/joakimkarlsson/bandit/issues/40")
   message(FATAL_ERROR "Fatal error: CMake >= 2.8.8 required.")
 endif()
 ################################################################################
 # Check compiler version
 ################################################################################
--- a/src/algorithms/libs/volk_gnsssdr/apps/volk_gnsssdr_profile.cc
+++ b/src/algorithms/libs/volk_gnsssdr/apps/volk_gnsssdr_profile.cc
@@ -179,27 +179,18 @@ int main(int argc, char *argv[]) {
     VOLK_PROFILE(volk_gnsssdr_32fc_convert_8ic, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex);
     VOLK_PROFILE(volk_gnsssdr_32fc_s32f_convert_8ic, 1e-4, 5, 16000, 250, &results, benchmark_mode, kernel_regex);
     VOLK_PROFILE(volk_gnsssdr_32f_accumulator_s32f, 1e-4, 0, 204602, 10000, &results, benchmark_mode, kernel_regex);
     VOLK_PROFILE(volk_gnsssdr_8i_accumulator_s8i, 1e-4, 0, 204602, 10000, &results, benchmark_mode, kernel_regex);
     VOLK_PROFILE(volk_gnsssdr_32f_index_max_16u, 3, 0, 204602, 5000, &results, benchmark_mode, kernel_regex);
     VOLK_PROFILE(volk_gnsssdr_8i_index_max_16u, 3, 0, 204602, 5000, &results, benchmark_mode, kernel_regex);
     VOLK_PROFILE(volk_gnsssdr_8i_max_s8i, 3, 0, 204602, 5000, &results, benchmark_mode, kernel_regex);
     VOLK_PROFILE(volk_gnsssdr_32f_x2_add_32f, 1e-4, 0, 204602, 10000, &results, benchmark_mode, kernel_regex);
     VOLK_PROFILE(volk_gnsssdr_8i_x2_add_8i, 1e-4, 0, 204602, 10000, &results, benchmark_mode, kernel_regex);
     VOLK_PROFILE(volk_gnsssdr_32fc_conjugate_32fc, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
     VOLK_PROFILE(volk_gnsssdr_8ic_conjugate_8ic, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
     VOLK_PROFILE(volk_gnsssdr_32fc_magnitude_squared_32f, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
     VOLK_PROFILE(volk_gnsssdr_8ic_magnitude_squared_8i, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
     VOLK_PROFILE(volk_gnsssdr_32fc_s32fc_multiply_32fc, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
     VOLK_PROFILE(volk_gnsssdr_8ic_s8ic_multiply_8ic, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
     VOLK_PROFILE(volk_gnsssdr_32fc_x2_dot_prod_32fc, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
     VOLK_PROFILE(volk_gnsssdr_8ic_x2_dot_prod_8ic, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
     VOLK_PROFILE(volk_gnsssdr_32fc_x2_multiply_32fc, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
     VOLK_PROFILE(volk_gnsssdr_8ic_x2_multiply_8ic, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
     VOLK_PROFILE(volk_gnsssdr_8u_x2_multiply_8u, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
     VOLK_PROFILE(volk_gnsssdr_64f_accumulator_64f, 1e-4, 0, 16000, 1000, &results, benchmark_mode, kernel_regex);
     VOLK_PROFILE(volk_gnsssdr_32f_s32f_convert_16i, 1e-4, 1, 204602, 250, &results, benchmark_mode, kernel_regex);
     VOLK_PROFILE(volk_gnsssdr_16i_s32f_convert_32f, 1e-4, 1, 204602, 250, &results, benchmark_mode, kernel_regex);
    // Until we can update the config on a kernel by kernel basis
    // do not overwrite volk_config when using a regex.
--- a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16i_s32f_convert_32f.h
+++ b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16i_s32f_convert_32f.h
@@ -1,241 +0,0 @@
 #ifndef INCLUDED_volk_gnsssdr_16i_s32f_convert_32f_u_H
 #define INCLUDED_volk_gnsssdr_16i_s32f_convert_32f_u_H
 #include <inttypes.h>
 #include <stdio.h>
 #ifdef LV_HAVE_SSE4_1
 #include <smmintrin.h>
  /*!
    \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
    \param inputVector The 16 bit input data buffer
    \param outputVector The floating point output data buffer
    \param scalar The value divided against each point in the output buffer
    \param num_points The number of data values to be converted
    \note Output buffer does NOT need to be properly aligned
  */
 static inline void volk_gnsssdr_16i_s32f_convert_32f_u_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;
     float* outputVectorPtr = outputVector;
    __m128 invScalar = _mm_set_ps1(1.0/scalar);
    int16_t* inputPtr = (int16_t*)inputVector;
    __m128i inputVal;
    __m128i inputVal2;
    __m128 ret;
    for(;number < eighthPoints; number++){
      // Load the 8 values
      inputVal = _mm_loadu_si128((__m128i*)inputPtr);
      // Shift the input data to the right by 64 bits ( 8 bytes )
      inputVal2 = _mm_srli_si128(inputVal, 8);
      // Convert the lower 4 values into 32 bit words
      inputVal = _mm_cvtepi16_epi32(inputVal);
      inputVal2 = _mm_cvtepi16_epi32(inputVal2);
      ret = _mm_cvtepi32_ps(inputVal);
      ret = _mm_mul_ps(ret, invScalar);
      _mm_storeu_ps(outputVectorPtr, ret);
      outputVectorPtr += 4;
      ret = _mm_cvtepi32_ps(inputVal2);
      ret = _mm_mul_ps(ret, invScalar);
      _mm_storeu_ps(outputVectorPtr, ret);
      outputVectorPtr += 4;
      inputPtr += 8;
    }
    number = eighthPoints * 8;
    for(; number < num_points; number++){
      outputVector[number] =((float)(inputVector[number])) / scalar;
    }
 }
 #endif /* LV_HAVE_SSE4_1 */
 #ifdef LV_HAVE_SSE
 #include <xmmintrin.h>
  /*!
    \brief Converts 16 bit integer samples to floating point and divides every result by a scalar
    \param outputVector Destination buffer receiving the floating point values
    \param inputVector Source buffer holding the 16 bit integer samples
    \param scalar Divisor applied to each converted sample
    \param num_points Number of samples to convert
    \note Results are written with unaligned stores, so the output buffer needs no particular alignment
  */
 static inline void volk_gnsssdr_16i_s32f_convert_32f_u_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
   const unsigned int blocks = num_points / 4;
   const __m128 vInv = _mm_set_ps1(1.0/scalar);
   const int16_t* src = inputVector;
   float* dst = outputVector;
   unsigned int i;
   // Vectorised part: four samples per iteration, scaled by the reciprocal
   for(i = 0; i < blocks; i++){
     // _mm_set_ps lists its arguments from the highest lane down to the lowest
     const __m128 asFloat = _mm_set_ps((float)(src[3]), (float)(src[2]), (float)(src[1]), (float)(src[0]));
     _mm_storeu_ps(dst, _mm_mul_ps(asFloat, vInv));
     src += 4;
     dst += 4;
   }
   // Scalar tail: the leftover samples are divided directly
   for(i = blocks * 4; i < num_points; i++){
     outputVector[i] = (float)(inputVector[i]) / scalar;
   }
 }
 #endif /* LV_HAVE_SSE */
 #ifdef LV_HAVE_GENERIC
  /*!
    \brief Converts 16 bit integer samples to floating point and divides every result by a scalar
    \param outputVector Destination buffer receiving the floating point values
    \param inputVector Source buffer holding the 16 bit integer samples
    \param scalar Divisor applied to each converted sample
    \param num_points Number of samples to convert
    \note Plain C implementation; it places no alignment requirement on the buffers
  */
 static inline void volk_gnsssdr_16i_s32f_convert_32f_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
   unsigned int i;
   for(i = 0; i < num_points; i++){
     outputVector[i] = ((float)(inputVector[i])) / scalar;
   }
 }
 #endif /* LV_HAVE_GENERIC */
 #endif /* INCLUDED_volk_gnsssdr_16i_s32f_convert_32f_u_H */
 #ifndef INCLUDED_volk_gnsssdr_16i_s32f_convert_32f_a_H
 #define INCLUDED_volk_gnsssdr_16i_s32f_convert_32f_a_H
 #include <inttypes.h>
 #include <stdio.h>
 #ifdef LV_HAVE_SSE4_1
 #include <smmintrin.h>
  /*!
    \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
    \param inputVector The 16 bit input data buffer
    \param outputVector The floating point output data buffer
    \param scalar The value divided against each point in the output buffer
    \param num_points The number of data values to be converted
  */
 static inline void volk_gnsssdr_16i_s32f_convert_32f_a_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;
     float* outputVectorPtr = outputVector;
    __m128 invScalar = _mm_set_ps1(1.0/scalar);
    int16_t* inputPtr = (int16_t*)inputVector;
    __m128i inputVal;
    __m128i inputVal2;
    __m128 ret;
    for(;number < eighthPoints; number++){
      // Load the 8 values
      inputVal = _mm_loadu_si128((__m128i*)inputPtr);
      // Shift the input data to the right by 64 bits ( 8 bytes )
      inputVal2 = _mm_srli_si128(inputVal, 8);
      // Convert the lower 4 values into 32 bit words
      inputVal = _mm_cvtepi16_epi32(inputVal);
      inputVal2 = _mm_cvtepi16_epi32(inputVal2);
      ret = _mm_cvtepi32_ps(inputVal);
      ret = _mm_mul_ps(ret, invScalar);
      _mm_storeu_ps(outputVectorPtr, ret);
      outputVectorPtr += 4;
      ret = _mm_cvtepi32_ps(inputVal2);
      ret = _mm_mul_ps(ret, invScalar);
      _mm_storeu_ps(outputVectorPtr, ret);
      outputVectorPtr += 4;
      inputPtr += 8;
    }
    number = eighthPoints * 8;
    for(; number < num_points; number++){
      outputVector[number] =((float)(inputVector[number])) / scalar;
    }
 }
 #endif /* LV_HAVE_SSE4_1 */
 #ifdef LV_HAVE_SSE
 #include <xmmintrin.h>
  /*!
    \brief Converts 16 bit integer samples to floating point and divides every result by a scalar
    \param outputVector Destination buffer receiving the floating point values
    \param inputVector Source buffer holding the 16 bit integer samples
    \param scalar Divisor applied to each converted sample
    \param num_points Number of samples to convert
    \note Although this is the "_a_" kernel, results are written with the unaligned _mm_storeu_ps intrinsic
  */
 static inline void volk_gnsssdr_16i_s32f_convert_32f_a_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
   const unsigned int blocks = num_points / 4;
   const __m128 vInv = _mm_set_ps1(1.0/scalar);
   const int16_t* src = inputVector;
   float* dst = outputVector;
   unsigned int i;
   // Vectorised part: four samples per iteration, scaled by the reciprocal
   for(i = 0; i < blocks; i++){
     // _mm_set_ps lists its arguments from the highest lane down to the lowest
     const __m128 asFloat = _mm_set_ps((float)(src[3]), (float)(src[2]), (float)(src[1]), (float)(src[0]));
     _mm_storeu_ps(dst, _mm_mul_ps(asFloat, vInv));
     src += 4;
     dst += 4;
   }
   // Scalar tail: the leftover samples are divided directly
   for(i = blocks * 4; i < num_points; i++){
     outputVector[i] = (float)(inputVector[i]) / scalar;
   }
 }
 #endif /* LV_HAVE_SSE */
 #ifdef LV_HAVE_GENERIC
  /*!
    \brief Converts 16 bit integer samples to floating point and divides every result by a scalar
    \param outputVector Destination buffer receiving the floating point values
    \param inputVector Source buffer holding the 16 bit integer samples
    \param scalar Divisor applied to each converted sample
    \param num_points Number of samples to convert
  */
 static inline void volk_gnsssdr_16i_s32f_convert_32f_a_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
   unsigned int i;
   for(i = 0; i < num_points; i++){
     outputVector[i] = ((float)(inputVector[i])) / scalar;
   }
 }
 #endif /* LV_HAVE_GENERIC */
 #endif /* INCLUDED_volk_gnsssdr_16i_s32f_convert_32f_a_H */
--- a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_accumulator_s32f.h
+++ b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_accumulator_s32f.h
@@ -1,68 +0,0 @@
 #ifndef INCLUDED_volk_gnsssdr_32f_accumulator_s32f_a_H
 #define INCLUDED_volk_gnsssdr_32f_accumulator_s32f_a_H
 #include <volk_gnsssdr/volk_gnsssdr_common.h>
 #include <inttypes.h>
 #include <stdio.h>
 #ifdef LV_HAVE_SSE
 #include <xmmintrin.h>
 /*!
  \brief Accumulates the values in the input buffer
  \param result The accumulated result
  \param inputBuffer The buffer of data to be accumulated
  \param num_points The number of values in inputBuffer to be accumulated
 */
 static inline void volk_gnsssdr_32f_accumulator_s32f_a_sse(float* result, const float* inputBuffer, unsigned int num_points){
  float returnValue = 0;
  unsigned int number = 0;
  const unsigned int quarterPoints = num_points / 4;
  const float* aPtr = inputBuffer;
  __VOLK_ATTR_ALIGNED(16) float tempBuffer[4];
  __m128 accumulator = _mm_setzero_ps();
  __m128 aVal = _mm_setzero_ps();
  for(;number < quarterPoints; number++){
    aVal = _mm_load_ps(aPtr);
    accumulator = _mm_add_ps(accumulator, aVal);
    aPtr += 4;
  }
  _mm_store_ps(tempBuffer,accumulator); // Store the results back into the C container
  returnValue = tempBuffer[0];
  returnValue += tempBuffer[1];
  returnValue += tempBuffer[2];
  returnValue += tempBuffer[3];
  number = quarterPoints * 4;
  for(;number < num_points; number++){
    returnValue += (*aPtr++);
  }
  *result = returnValue;
 }
 #endif /* LV_HAVE_SSE */
 #ifdef LV_HAVE_GENERIC
 /*!
  \brief Sums all values of the input buffer
  \param result Receives the accumulated sum (0 when num_points is 0)
  \param inputBuffer Buffer of values to add up
  \param num_points Number of values in inputBuffer to accumulate
 */
 static inline void volk_gnsssdr_32f_accumulator_s32f_generic(float* result, const float* inputBuffer, unsigned int num_points){
   float total = 0;
   unsigned int i;
   // Values are added in buffer order, first to last
   for(i = 0; i < num_points; i++){
     total += inputBuffer[i];
   }
   *result = total;
 }
 #endif /* LV_HAVE_GENERIC */
 #endif /* INCLUDED_volk_gnsssdr_32f_accumulator_s32f_a_H */
--- a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_index_max_16u.h
+++ b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_index_max_16u.h
@@ -1,149 +0,0 @@
 #ifndef INCLUDED_volk_gnsssdr_32f_index_max_16u_a_H
 #define INCLUDED_volk_gnsssdr_32f_index_max_16u_a_H
 #include <volk_gnsssdr/volk_gnsssdr_common.h>
 #include <volk_gnsssdr/volk_gnsssdr_common.h>
 #include <inttypes.h>
 #include <stdio.h>
 #ifdef LV_HAVE_SSE4_1
 #include <smmintrin.h>
 static inline void volk_gnsssdr_32f_index_max_16u_a_sse4_1(unsigned int* target, const float* src0, unsigned int num_points) {
  if(num_points > 0){
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;
    float* inputPtr = (float*)src0;
    __m128 indexIncrementValues = _mm_set1_ps(4);
    __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);
    float max = src0[0];
    float index = 0;
    __m128 maxValues = _mm_set1_ps(max);
    __m128 maxValuesIndex = _mm_setzero_ps();
    __m128 compareResults;
    __m128 currentValues;
    __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
    __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
    for(;number < quarterPoints; number++){
      currentValues  = _mm_load_ps(inputPtr); inputPtr += 4;
      currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
      compareResults = _mm_cmpgt_ps(maxValues, currentValues);
      maxValuesIndex = _mm_blendv_ps(currentIndexes, maxValuesIndex, compareResults);
      maxValues      = _mm_blendv_ps(currentValues, maxValues, compareResults);
    }
    // Calculate the largest value from the remaining 4 points
    _mm_store_ps(maxValuesBuffer, maxValues);
    _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
    for(number = 0; number < 4; number++){
      if(maxValuesBuffer[number] > max){
 	index = maxIndexesBuffer[number];
 	max = maxValuesBuffer[number];
      }
    }
    number = quarterPoints * 4;
    for(;number < num_points; number++){
      if(src0[number] > max){
 	index = number;
 	max = src0[number];
      }
    }
    target[0] = (unsigned int)index;
  }
 }
 #endif /*LV_HAVE_SSE4_1*/
 #ifdef LV_HAVE_SSE
 #include<xmmintrin.h>
 static inline void volk_gnsssdr_32f_index_max_16u_a_sse(unsigned int* target, const float* src0, unsigned int num_points) {
  if(num_points > 0){
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;
    float* inputPtr = (float*)src0;
    __m128 indexIncrementValues = _mm_set1_ps(4);
    __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);
    float max = src0[0];
    float index = 0;
    __m128 maxValues = _mm_set1_ps(max);
    __m128 maxValuesIndex = _mm_setzero_ps();
    __m128 compareResults;
    __m128 currentValues;
    __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
    __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
    for(;number < quarterPoints; number++){
      currentValues  = _mm_load_ps(inputPtr); inputPtr += 4;
      currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
      compareResults = _mm_cmpgt_ps(maxValues, currentValues);
      maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, maxValuesIndex) , _mm_andnot_ps(compareResults, currentIndexes));
      maxValues      = _mm_or_ps(_mm_and_ps(compareResults, maxValues) , _mm_andnot_ps(compareResults, currentValues));
    }
    // Calculate the largest value from the remaining 4 points
    _mm_store_ps(maxValuesBuffer, maxValues);
    _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
    for(number = 0; number < 4; number++){
      if(maxValuesBuffer[number] > max){
 	index = maxIndexesBuffer[number];
 	max = maxValuesBuffer[number];
      }
    }
    number = quarterPoints * 4;
    for(;number < num_points; number++){
      if(src0[number] > max){
 	index = number;
 	max = src0[number];
      }
    }
    target[0] = (unsigned int)index;
  }
 }
 #endif /*LV_HAVE_SSE*/
 #ifdef LV_HAVE_GENERIC
 /*!
  \brief Finds the index of the largest value in a float buffer
  \param target Receives the index of the first occurrence of the maximum; left untouched when num_points is 0
  \param src0 Buffer to search
  \param num_points Number of values in src0
 */
 static inline void volk_gnsssdr_32f_index_max_16u_generic(unsigned int* target, const float* src0, unsigned int num_points) {
   unsigned int bestIdx = 0;
   unsigned int i;
   float best;
   if(num_points == 0){
     return;
   }
   best = src0[0];
   // Strict comparison: on ties the earliest index is kept
   for(i = 1; i < num_points; ++i){
     if(src0[i] > best){
       bestIdx = i;
       best = src0[i];
     }
   }
   target[0] = bestIdx;
 }
 #endif /*LV_HAVE_GENERIC*/
 #endif /*INCLUDED_volk_gnsssdr_32f_index_max_16u_a_H*/
--- a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_s32f_convert_16i.h
+++ b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_s32f_convert_16i.h
@@ -1,302 +0,0 @@
 #ifndef INCLUDED_volk_gnsssdr_32f_s32f_convert_16i_u_H
 #define INCLUDED_volk_gnsssdr_32f_s32f_convert_16i_u_H
 #include <inttypes.h>
 #include <stdio.h>
 #include <math.h>
 #ifdef LV_HAVE_SSE2
 #include <emmintrin.h>
  /*!
    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value
    \param inputVector The floating point input data buffer
    \param outputVector The 16 bit output data buffer
    \param scalar The value multiplied against each point in the input buffer
    \param num_points The number of data values to be converted
    \note Input buffer does NOT need to be properly aligned
  */
 static inline void volk_gnsssdr_32f_s32f_convert_16i_u_sse2(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
  unsigned int number = 0;
  const unsigned int eighthPoints = num_points / 8;
  const float* inputVectorPtr = (const float*)inputVector;
  int16_t* outputVectorPtr = outputVector;
  float min_val = -32768;
  float max_val = 32767;
  float r;
  __m128 vScalar = _mm_set_ps1(scalar);
  __m128 inputVal1, inputVal2;
  __m128i intInputVal1, intInputVal2;
  __m128 ret1, ret2;
  __m128 vmin_val = _mm_set_ps1(min_val);
  __m128 vmax_val = _mm_set_ps1(max_val);
  for(;number < eighthPoints; number++){
    inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
    inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
    // Scale and clip
    ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
    ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
    intInputVal1 = _mm_cvtps_epi32(ret1);
    intInputVal2 = _mm_cvtps_epi32(ret2);
    intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
    _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
    outputVectorPtr += 8;
  }
  number = eighthPoints * 8;
  for(; number < num_points; number++){
    r = inputVector[number] * scalar;
    if(r > max_val)
      r = max_val;
    else if(r < min_val)
      r = min_val;
    outputVector[number] = (int16_t)rintf(r);
  }
 }
 #endif /* LV_HAVE_SSE2 */
 #ifdef LV_HAVE_SSE
 #include <xmmintrin.h>
  /*!
    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value
    \param inputVector The floating point input data buffer
    \param outputVector The 16 bit output data buffer
    \param scalar The value multiplied against each point in the input buffer
    \param num_points The number of data values to be converted
    \note Input buffer does NOT need to be properly aligned
  */
 static inline void volk_gnsssdr_32f_s32f_convert_16i_u_sse(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
  unsigned int number = 0;
  const unsigned int quarterPoints = num_points / 4;
  const float* inputVectorPtr = (const float*)inputVector;
  int16_t* outputVectorPtr = outputVector;
  float min_val = -32768;
  float max_val = 32767;
  float r;
  __m128 vScalar = _mm_set_ps1(scalar);
  __m128 ret;
  __m128 vmin_val = _mm_set_ps1(min_val);
  __m128 vmax_val = _mm_set_ps1(max_val);
  __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
  for(;number < quarterPoints; number++){
    ret = _mm_loadu_ps(inputVectorPtr);
    inputVectorPtr += 4;
    // Scale and clip
    ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
    _mm_store_ps(outputFloatBuffer, ret);
    *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]);
    *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]);
    *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]);
    *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]);
  }
  number = quarterPoints * 4;
  for(; number < num_points; number++){
    r = inputVector[number] * scalar;
    if(r > max_val)
      r = max_val;
    else if(r < min_val)
      r = min_val;
    outputVector[number] = (int16_t)rintf(r);
  }
 }
 #endif /* LV_HAVE_SSE */
 #ifdef LV_HAVE_GENERIC
  /*!
    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value
    \param inputVector The floating point input data buffer
    \param outputVector The 16 bit output data buffer
    \param scalar The value multiplied against each point in the input buffer
    \param num_points The number of data values to be converted
    \note Input buffer does NOT need to be properly aligned
  */
 static inline void volk_gnsssdr_32f_s32f_convert_16i_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
  int16_t* outputVectorPtr = outputVector;
  const float* inputVectorPtr = inputVector;
  unsigned int number = 0;
  float min_val = -32768;
  float max_val = 32767;
  float r;
  for(number = 0; number < num_points; number++){
    r = *inputVectorPtr++  * scalar;
    if(r > max_val)
      r = max_val;
    else if(r < min_val)
      r = min_val;
    *outputVectorPtr++ = (int16_t)rintf(r);
  }
 }
 #endif /* LV_HAVE_GENERIC */
 #endif /* INCLUDED_volk_gnsssdr_32f_s32f_convert_16i_u_H */
 #ifndef INCLUDED_volk_gnsssdr_32f_s32f_convert_16i_a_H
 #define INCLUDED_volk_gnsssdr_32f_s32f_convert_16i_a_H
 #include <volk/volk_common.h>
 #include <inttypes.h>
 #include <stdio.h>
 #include <math.h>
 #ifdef LV_HAVE_SSE2
 #include <emmintrin.h>
  /*!
    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value
    \param inputVector The floating point input data buffer
    \param outputVector The 16 bit output data buffer
    \param scalar The value multiplied against each point in the input buffer
    \param num_points The number of data values to be converted
  */
 static inline void volk_gnsssdr_32f_s32f_convert_16i_a_sse2(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
  unsigned int number = 0;
  const unsigned int eighthPoints = num_points / 8;
  const float* inputVectorPtr = (const float*)inputVector;
  int16_t* outputVectorPtr = outputVector;
  float min_val = -32768;
  float max_val = 32767;
  float r;
  __m128 vScalar = _mm_set_ps1(scalar);
  __m128 inputVal1, inputVal2;
  __m128i intInputVal1, intInputVal2;
  __m128 ret1, ret2;
  __m128 vmin_val = _mm_set_ps1(min_val);
  __m128 vmax_val = _mm_set_ps1(max_val);
  for(;number < eighthPoints; number++){
    inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
    inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
    // Scale and clip
    ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
    ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
    intInputVal1 = _mm_cvtps_epi32(ret1);
    intInputVal2 = _mm_cvtps_epi32(ret2);
    intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
    _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
    outputVectorPtr += 8;
  }
  number = eighthPoints * 8;
  for(; number < num_points; number++){
    r = inputVector[number] * scalar;
    if(r > max_val)
      r = max_val;
    else if(r < min_val)
      r = min_val;
    outputVector[number] = (int16_t)rintf(r);
  }
 }
 #endif /* LV_HAVE_SSE2 */
 #ifdef LV_HAVE_SSE
 #include <xmmintrin.h>
  /*!
    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value
    \param inputVector The floating point input data buffer
    \param outputVector The 16 bit output data buffer
    \param scalar The value multiplied against each point in the input buffer
    \param num_points The number of data values to be converted
  */
 static inline void volk_gnsssdr_32f_s32f_convert_16i_a_sse(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
  unsigned int number = 0;
  const unsigned int quarterPoints = num_points / 4;
  const float* inputVectorPtr = (const float*)inputVector;
  int16_t* outputVectorPtr = outputVector;
  float min_val = -32768;
  float max_val = 32767;
  float r;
  __m128 vScalar = _mm_set_ps1(scalar);
  __m128 ret;
  __m128 vmin_val = _mm_set_ps1(min_val);
  __m128 vmax_val = _mm_set_ps1(max_val);
  __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
  for(;number < quarterPoints; number++){
    ret = _mm_load_ps(inputVectorPtr);
    inputVectorPtr += 4;
    // Scale and clip
    ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
    _mm_store_ps(outputFloatBuffer, ret);
    *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]);
    *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]);
    *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]);
    *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]);
  }
  number = quarterPoints * 4;
  for(; number < num_points; number++){
    r = inputVector[number] * scalar;
    if(r > max_val)
      r = max_val;
    else if(r < min_val)
      r = min_val;
    outputVector[number] = (int16_t)rintf(r);
  }
 }
 #endif /* LV_HAVE_SSE */
 #ifdef LV_HAVE_GENERIC
  /*!
    \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value
    \param inputVector The floating point input data buffer
    \param outputVector The 16 bit output data buffer
    \param scalar The value multiplied against each point in the input buffer
    \param num_points The number of data values to be converted
  */
 static inline void volk_gnsssdr_32f_s32f_convert_16i_a_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
  int16_t* outputVectorPtr = outputVector;
  const float* inputVectorPtr = inputVector;
  unsigned int number = 0;
  float min_val = -32768;
  float max_val = 32767;
  float r;
  for(number = 0; number < num_points; number++){
    r  = *inputVectorPtr++ * scalar;
    if(r < min_val)
      r = min_val;
    else if(r > max_val)
      r = max_val;
    *outputVectorPtr++ = (int16_t)rintf(r);
  }
 }
 #endif /* LV_HAVE_GENERIC */
 #endif /* INCLUDED_volk_gnsssdr_32f_s32f_convert_16i_a_H */
--- a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_x2_add_32f.h
+++ b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_x2_add_32f.h
@@ -1,147 +0,0 @@
 #ifndef INCLUDED_volk_gnsssdr_32f_x2_add_32f_u_H
 #define INCLUDED_volk_gnsssdr_32f_x2_add_32f_u_H
 #include <inttypes.h>
 #include <stdio.h>
 #ifdef LV_HAVE_SSE
 #include <xmmintrin.h>
 /*!
  \brief Element-wise sum of two float vectors using SSE with unaligned memory access
  \param cVector Destination vector receiving aVector[i] + bVector[i]
  \param aVector First addend vector
  \param bVector Second addend vector
  \param num_points The number of values in aVector and bVector to be added together and stored into cVector
 */
 static inline void volk_gnsssdr_32f_x2_add_32f_u_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
    // Largest multiple of four not exceeding num_points
    const unsigned int vectorizedPoints = (num_points / 4) * 4;
    unsigned int idx;
    // Four floats per iteration; loadu/storeu place no alignment demand on the buffers
    for(idx = 0; idx < vectorizedPoints; idx += 4){
      const __m128 lhs = _mm_loadu_ps(aVector + idx);
      const __m128 rhs = _mm_loadu_ps(bVector + idx);
      _mm_storeu_ps(cVector + idx, _mm_add_ps(lhs, rhs));
    }
    // Scalar tail for the remaining 0..3 values
    for(; idx < num_points; idx++){
      cVector[idx] = aVector[idx] + bVector[idx];
    }
 }
 #endif /* LV_HAVE_SSE */
 #ifdef LV_HAVE_GENERIC
 /*!
  \brief Element-wise sum of two float vectors, portable C implementation
  \param cVector Destination vector receiving aVector[i] + bVector[i]
  \param aVector First addend vector
  \param bVector Second addend vector
  \param num_points The number of values in aVector and bVector to be added together and stored into cVector
 */
 static inline void volk_gnsssdr_32f_x2_add_32f_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
    unsigned int idx;
    for(idx = 0; idx < num_points; idx++){
      cVector[idx] = aVector[idx] + bVector[idx];
    }
 }
 #endif /* LV_HAVE_GENERIC */
 #endif /* INCLUDED_volk_gnsssdr_32f_x2_add_32f_u_H */
 #ifndef INCLUDED_volk_gnsssdr_32f_x2_add_32f_a_H
 #define INCLUDED_volk_gnsssdr_32f_x2_add_32f_a_H
 #include <inttypes.h>
 #include <stdio.h>
 #ifdef LV_HAVE_SSE
 #include <xmmintrin.h>
 /*!
  \brief Element-wise sum of two float vectors using SSE with aligned memory access
  \param cVector Destination vector receiving aVector[i] + bVector[i]
  \param aVector First addend vector
  \param bVector Second addend vector
  \param num_points The number of values in aVector and bVector to be added together and stored into cVector
 */
 static inline void volk_gnsssdr_32f_x2_add_32f_a_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
    // Largest multiple of four not exceeding num_points
    const unsigned int vectorizedPoints = (num_points / 4) * 4;
    unsigned int idx;
    // Four floats per iteration; _mm_load_ps/_mm_store_ps need 16-byte aligned addresses
    for(idx = 0; idx < vectorizedPoints; idx += 4){
      const __m128 lhs = _mm_load_ps(aVector + idx);
      const __m128 rhs = _mm_load_ps(bVector + idx);
      _mm_store_ps(cVector + idx, _mm_add_ps(lhs, rhs));
    }
    // Scalar tail for the remaining 0..3 values
    for(; idx < num_points; idx++){
      cVector[idx] = aVector[idx] + bVector[idx];
    }
 }
 #endif /* LV_HAVE_SSE */
 #ifdef LV_HAVE_GENERIC
 /*!
  \brief Element-wise sum of two float vectors, portable C implementation
  \param cVector Destination vector receiving aVector[i] + bVector[i]
  \param aVector First addend vector
  \param bVector Second addend vector
  \param num_points The number of values in aVector and bVector to be added together and stored into cVector
 */
 static inline void volk_gnsssdr_32f_x2_add_32f_a_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
    unsigned int idx;
    for(idx = 0; idx < num_points; idx++){
      cVector[idx] = aVector[idx] + bVector[idx];
    }
 }
 #endif /* LV_HAVE_GENERIC */
 #ifdef LV_HAVE_ORC
 /*!
  \brief Adds the two input vectors and stores their results in the third vector by delegating to the externally defined ORC routine
  \param cVector The vector where the results will be stored
  \param aVector One of the vectors to be added
  \param bVector One of the vectors to be added
  \param num_points The number of values in aVector and bVector to be added together and stored into cVector

  NOTE(review): the wrapper carries the _u_ infix although it lives inside the
  INCLUDED_..._a_H section and forwards to the _a_orc_impl symbol; confirm that
  the naming is intentional before relying on it.
 */
 extern void volk_gnsssdr_32f_x2_add_32f_a_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
 static inline void volk_gnsssdr_32f_x2_add_32f_u_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
    // Forward all arguments unchanged to the ORC implementation declared above
    volk_gnsssdr_32f_x2_add_32f_a_orc_impl(cVector, aVector, bVector, num_points);
 }
 #endif /* LV_HAVE_ORC */
 #endif /* INCLUDED_volk_gnsssdr_32f_x2_add_32f_a_H */
--- a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_conjugate_32fc.h
+++ b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_conjugate_32fc.h
@@ -1,127 +0,0 @@
 #ifndef INCLUDED_volk_gnsssdr_32fc_conjugate_32fc_u_H
 #define INCLUDED_volk_gnsssdr_32fc_conjugate_32fc_u_H
 #include <inttypes.h>
 #include <stdio.h>
 #include <volk_gnsssdr/volk_gnsssdr_complex.h>
 #include <float.h>
 #ifdef LV_HAVE_SSE3
 #include <pmmintrin.h>
  /*!
    \brief Writes the complex conjugate of every element of aVector into cVector (unaligned memory access)
    \param cVector The vector where the results will be stored
    \param aVector Vector to be conjugated
    \param num_points The number of complex values in aVector to be conjugated and stored into cVector
  */
 static inline void volk_gnsssdr_32fc_conjugate_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){
    const unsigned int pairCount = num_points / 2;
    // Only the sign bit is set in lanes 1 and 3, so XOR negates the imaginary parts
    const __m128 imagSignMask = _mm_setr_ps(0, -0.f, 0, -0.f);
    const lv_32fc_t* src = aVector;
    lv_32fc_t* dst = cVector;
    unsigned int pair;
    // Two complex values (ar,ai,br,bi) per iteration
    for(pair = 0; pair < pairCount; pair++){
      const __m128 packed = _mm_loadu_ps((const float*)src);
      _mm_storeu_ps((float*)dst, _mm_xor_ps(packed, imagSignMask));
      src += 2;
      dst += 2;
    }
    // One complex value is left over when num_points is odd
    if(num_points & 1){
      *dst = lv_conj(*src);
    }
 }
 #endif /* LV_HAVE_SSE3 */
 #ifdef LV_HAVE_GENERIC
  /*!
    \brief Writes the complex conjugate of every element of aVector into cVector, portable C implementation
    \param cVector The vector where the results will be stored
    \param aVector Vector to be conjugated
    \param num_points The number of complex values in aVector to be conjugated and stored into cVector
  */
 static inline void volk_gnsssdr_32fc_conjugate_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){
    unsigned int idx;
    for(idx = 0; idx < num_points; idx++){
      cVector[idx] = lv_conj(aVector[idx]);
    }
 }
 #endif /* LV_HAVE_GENERIC */
 #endif /* INCLUDED_volk_gnsssdr_32fc_conjugate_32fc_u_H */
 #ifndef INCLUDED_volk_gnsssdr_32fc_conjugate_32fc_a_H
 #define INCLUDED_volk_gnsssdr_32fc_conjugate_32fc_a_H
 #include <inttypes.h>
 #include <stdio.h>
 #include <volk_gnsssdr/volk_gnsssdr_complex.h>
 #include <float.h>
 #ifdef LV_HAVE_SSE3
 #include <pmmintrin.h>
  /*!
    \brief Writes the complex conjugate of every element of aVector into cVector (aligned memory access)
    \param cVector The vector where the results will be stored
    \param aVector Vector to be conjugated
    \param num_points The number of complex values in aVector to be conjugated and stored into cVector
  */
 static inline void volk_gnsssdr_32fc_conjugate_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){
    const unsigned int pairCount = num_points / 2;
    // Only the sign bit is set in lanes 1 and 3, so XOR negates the imaginary parts
    const __m128 imagSignMask = _mm_setr_ps(0, -0.f, 0, -0.f);
    const lv_32fc_t* src = aVector;
    lv_32fc_t* dst = cVector;
    unsigned int pair;
    // Two complex values (ar,ai,br,bi) per iteration; aligned load/store need 16-byte aligned addresses
    for(pair = 0; pair < pairCount; pair++){
      const __m128 packed = _mm_load_ps((const float*)src);
      _mm_store_ps((float*)dst, _mm_xor_ps(packed, imagSignMask));
      src += 2;
      dst += 2;
    }
    // One complex value is left over when num_points is odd
    if(num_points & 1){
      *dst = lv_conj(*src);
    }
 }
 #endif /* LV_HAVE_SSE3 */
 #ifdef LV_HAVE_GENERIC
  /*!
    \brief Writes the complex conjugate of every element of aVector into cVector, portable C implementation
    \param cVector The vector where the results will be stored
    \param aVector Vector to be conjugated
    \param num_points The number of complex values in aVector to be conjugated and stored into cVector
  */
 static inline void volk_gnsssdr_32fc_conjugate_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){
    unsigned int idx;
    for(idx = 0; idx < num_points; idx++){
      cVector[idx] = lv_conj(aVector[idx]);
    }
 }
 #endif /* LV_HAVE_GENERIC */
 #endif /* INCLUDED_volk_gnsssdr_32fc_conjugate_32fc_a_H */
--- a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_magnitude_squared_32f.h
+++ b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_magnitude_squared_32f.h
@@ -1,228 +0,0 @@
 #ifndef INCLUDED_volk_gnsssdr_32fc_magnitude_squared_32f_u_H
 #define INCLUDED_volk_gnsssdr_32fc_magnitude_squared_32f_u_H
 #include <inttypes.h>
 #include <stdio.h>
 #include <math.h>
 #ifdef LV_HAVE_SSE3
 #include <pmmintrin.h>
  /*!
    \brief Computes re*re + im*im for every complex input value (SSE3, unaligned memory access)
    \param magnitudeVector The vector receiving the real output values
    \param complexVector The vector containing the complex input values
    \param num_points The number of complex values in complexVector to be processed and stored into magnitudeVector
  */
 static inline void volk_gnsssdr_32fc_magnitude_squared_32f_u_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
    const unsigned int quadCount = num_points / 4;
    // The complex buffer is walked as interleaved floats: re0,im0,re1,im1,...
    const float* src = (const float*)complexVector;
    float* dst = magnitudeVector;
    unsigned int idx;
    // Four complex values (eight floats) per iteration
    for(idx = 0; idx < quadCount; idx++){
      __m128 firstPair = _mm_loadu_ps(src);
      __m128 secondPair = _mm_loadu_ps(src + 4);
      firstPair = _mm_mul_ps(firstPair, firstPair);    // square every component
      secondPair = _mm_mul_ps(secondPair, secondPair); // square every component
      // Horizontal add sums each adjacent (re^2, im^2) pair
      _mm_storeu_ps(dst, _mm_hadd_ps(firstPair, secondPair));
      src += 8;
      dst += 4;
    }
    // Scalar tail for the remaining 0..3 complex values
    for(idx = quadCount * 4; idx < num_points; idx++){
      const float re = src[0];
      const float im = src[1];
      src += 2;
      *dst++ = (re * re) + (im * im);
    }
 }
 #endif /* LV_HAVE_SSE3 */
 #ifdef LV_HAVE_SSE
 #include <xmmintrin.h>
  /*!
    \brief Computes re*re + im*im for every complex input value (SSE, unaligned memory access)
    \param magnitudeVector The vector receiving the real output values
    \param complexVector The vector containing the complex input values
    \param num_points The number of complex values in complexVector to be processed and stored into magnitudeVector
  */
 static inline void volk_gnsssdr_32fc_magnitude_squared_32f_u_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
    const unsigned int quadCount = num_points / 4;
    // The complex buffer is walked as interleaved floats: re0,im0,re1,im1,...
    const float* src = (const float*)complexVector;
    float* dst = magnitudeVector;
    unsigned int idx;
    // Four complex values (eight floats) per iteration
    for(idx = 0; idx < quadCount; idx++){
      const __m128 firstPair = _mm_loadu_ps(src);
      const __m128 secondPair = _mm_loadu_ps(src + 4);
      // De-interleave: gather the four real parts and the four imaginary parts
      __m128 reals = _mm_shuffle_ps(firstPair, secondPair, _MM_SHUFFLE(2,0,2,0));
      __m128 imags = _mm_shuffle_ps(firstPair, secondPair, _MM_SHUFFLE(3,1,3,1));
      reals = _mm_mul_ps(reals, reals);
      imags = _mm_mul_ps(imags, imags);
      _mm_storeu_ps(dst, _mm_add_ps(reals, imags));
      src += 8;
      dst += 4;
    }
    // Scalar tail for the remaining 0..3 complex values
    for(idx = quadCount * 4; idx < num_points; idx++){
      const float re = src[0];
      const float im = src[1];
      src += 2;
      *dst++ = (re * re) + (im * im);
    }
 }
 #endif /* LV_HAVE_SSE */
 #ifdef LV_HAVE_GENERIC
  /*!
    \brief Computes re*re + im*im for every complex input value, portable C implementation
    \param magnitudeVector The vector receiving the real output values
    \param complexVector The vector containing the complex input values
    \param num_points The number of complex values in complexVector to be processed and stored into magnitudeVector
  */
 static inline void volk_gnsssdr_32fc_magnitude_squared_32f_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
   // The complex buffer is walked as interleaved floats: re0,im0,re1,im1,...
   const float* src = (const float*)complexVector;
   float* dst = magnitudeVector;
   unsigned int idx;
   for(idx = 0; idx < num_points; idx++){
     const float re = src[0];
     const float im = src[1];
     src += 2;
     *dst++ = (re*re) + (im*im);
   }
 }
 #endif /* LV_HAVE_GENERIC */
 #endif /* INCLUDED_volk_gnsssdr_32fc_magnitude_squared_32f_u_H */
 #ifndef INCLUDED_volk_gnsssdr_32fc_magnitude_squared_32f_a_H
 #define INCLUDED_volk_gnsssdr_32fc_magnitude_squared_32f_a_H
 #include <inttypes.h>
 #include <stdio.h>
 #include <math.h>
 #ifdef LV_HAVE_SSE3
 #include <pmmintrin.h>
  /*!
    \brief Computes re*re + im*im for every complex input value (SSE3, aligned memory access)
    \param magnitudeVector The vector receiving the real output values
    \param complexVector The vector containing the complex input values
    \param num_points The number of complex values in complexVector to be processed and stored into magnitudeVector
  */
 static inline void volk_gnsssdr_32fc_magnitude_squared_32f_a_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
    const unsigned int quadCount = num_points / 4;
    // The complex buffer is walked as interleaved floats: re0,im0,re1,im1,...
    const float* src = (const float*)complexVector;
    float* dst = magnitudeVector;
    unsigned int idx;
    // Four complex values (eight floats) per iteration; aligned load/store need 16-byte aligned addresses
    for(idx = 0; idx < quadCount; idx++){
      __m128 firstPair = _mm_load_ps(src);
      __m128 secondPair = _mm_load_ps(src + 4);
      firstPair = _mm_mul_ps(firstPair, firstPair);    // square every component
      secondPair = _mm_mul_ps(secondPair, secondPair); // square every component
      // Horizontal add sums each adjacent (re^2, im^2) pair
      _mm_store_ps(dst, _mm_hadd_ps(firstPair, secondPair));
      src += 8;
      dst += 4;
    }
    // Scalar tail for the remaining 0..3 complex values
    for(idx = quadCount * 4; idx < num_points; idx++){
      const float re = src[0];
      const float im = src[1];
      src += 2;
      *dst++ = (re * re) + (im * im);
    }
 }
 #endif /* LV_HAVE_SSE3 */
 #ifdef LV_HAVE_SSE
 #include <xmmintrin.h>
  /*!
    \brief Computes re*re + im*im for every complex input value (SSE, aligned memory access)
    \param magnitudeVector The vector receiving the real output values
    \param complexVector The vector containing the complex input values
    \param num_points The number of complex values in complexVector to be processed and stored into magnitudeVector
  */
 static inline void volk_gnsssdr_32fc_magnitude_squared_32f_a_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
    const unsigned int quadCount = num_points / 4;
    // The complex buffer is walked as interleaved floats: re0,im0,re1,im1,...
    const float* src = (const float*)complexVector;
    float* dst = magnitudeVector;
    unsigned int idx;
    // Four complex values (eight floats) per iteration; aligned load/store need 16-byte aligned addresses
    for(idx = 0; idx < quadCount; idx++){
      const __m128 firstPair = _mm_load_ps(src);
      const __m128 secondPair = _mm_load_ps(src + 4);
      // De-interleave: gather the four real parts and the four imaginary parts
      __m128 reals = _mm_shuffle_ps(firstPair, secondPair, _MM_SHUFFLE(2,0,2,0));
      __m128 imags = _mm_shuffle_ps(firstPair, secondPair, _MM_SHUFFLE(3,1,3,1));
      reals = _mm_mul_ps(reals, reals);
      imags = _mm_mul_ps(imags, imags);
      _mm_store_ps(dst, _mm_add_ps(reals, imags));
      src += 8;
      dst += 4;
    }
    // Scalar tail for the remaining 0..3 complex values
    for(idx = quadCount * 4; idx < num_points; idx++){
      const float re = src[0];
      const float im = src[1];
      src += 2;
      *dst++ = (re * re) + (im * im);
    }
 }
 #endif /* LV_HAVE_SSE */
 #ifdef LV_HAVE_GENERIC
  /*!
    \brief Computes re*re + im*im for every complex input value, portable C implementation
    \param magnitudeVector The vector receiving the real output values
    \param complexVector The vector containing the complex input values
    \param num_points The number of complex values in complexVector to be processed and stored into magnitudeVector
  */
 static inline void volk_gnsssdr_32fc_magnitude_squared_32f_a_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
   // The complex buffer is walked as interleaved floats: re0,im0,re1,im1,...
   const float* src = (const float*)complexVector;
   float* dst = magnitudeVector;
   unsigned int idx;
   for(idx = 0; idx < num_points; idx++){
     const float re = src[0];
     const float im = src[1];
     src += 2;
     *dst++ = (re*re) + (im*im);
   }
 }
 #endif /* LV_HAVE_GENERIC */
 #endif /* INCLUDED_volk_gnsssdr_32fc_magnitude_squared_32f_a_H */
--- a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc.h
+++ b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc.h
@@ -99,7 +99,7 @@ static inline void volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_u_sse4_1(lv_
    if (num_points%4!=0)
    {
        __VOLK_ATTR_ALIGNED(16) float tcode_half_chips_stored[4];
-        _mm_storeu_si128 ((__m128i*)tcode_half_chips_stored, tcode_half_chips_array);
+        _mm_storeu_ps ((float*)tcode_half_chips_stored, tcode_half_chips_array);
        int associated_chip_index;
        float tcode_half_chips = tcode_half_chips_stored[0];
@@ -215,7 +215,7 @@ static inline void volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_a_sse4_1(lv_
    if (num_points%4!=0)
    {
        __VOLK_ATTR_ALIGNED(16) float tcode_half_chips_stored[4];
-        _mm_store_si128 ((__m128i*)tcode_half_chips_stored, tcode_half_chips_array);
+        _mm_storeu_ps ((float*)tcode_half_chips_stored, tcode_half_chips_array);
        int associated_chip_index;
        float tcode_half_chips = tcode_half_chips_stored[0];
--- a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32fc_multiply_32fc.h
+++ b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32fc_multiply_32fc.h
@@ -1,178 +0,0 @@
 #ifndef INCLUDED_volk_gnsssdr_32fc_s32fc_multiply_32fc_u_H
 #define INCLUDED_volk_gnsssdr_32fc_s32fc_multiply_32fc_u_H
 #include <inttypes.h>
 #include <stdio.h>
 #include <volk_gnsssdr/volk_gnsssdr_complex.h>
 #include <float.h>
 #ifdef LV_HAVE_SSE3
 #include <pmmintrin.h>
 /*!
  \brief Multiplies every complex element of aVector by a complex scalar (SSE3, unaligned memory access)
  \param cVector The vector where the results will be stored
  \param aVector The vector to be multiplied
  \param scalar The complex scalar to multiply aVector
  \param num_points The number of complex values in aVector to be multiplied by scalar and stored into cVector
 */
 static inline void volk_gnsssdr_32fc_s32fc_multiply_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
    const unsigned int pairCount = num_points / 2;
    // Broadcast the real and the imaginary part of the scalar (sr, si) to all four lanes
    const __m128 scalarReal = _mm_set_ps1(lv_creal(scalar));
    const __m128 scalarImag = _mm_set_ps1(lv_cimag(scalar));
    const lv_32fc_t* src = aVector;
    lv_32fc_t* dst = cVector;
    unsigned int pair;
    // Two complex values (ar,ai,br,bi) per iteration
    for(pair = 0; pair < pairCount; pair++){
      const __m128 samples = _mm_loadu_ps((const float*)src);
      const __m128 swapped = _mm_shuffle_ps(samples, samples, 0xB1); // ai,ar,bi,br
      const __m128 realTerms = _mm_mul_ps(samples, scalarReal);      // ar*sr,ai*sr,br*sr,bi*sr
      const __m128 imagTerms = _mm_mul_ps(swapped, scalarImag);      // ai*si,ar*si,bi*si,br*si
      // addsub: subtract in even lanes, add in odd lanes -> ar*sr-ai*si, ai*sr+ar*si, ...
      _mm_storeu_ps((float*)dst, _mm_addsub_ps(realTerms, imagTerms));
      src += 2;
      dst += 2;
    }
    // One complex value is left over when num_points is odd
    if((num_points % 2) != 0) {
      *dst = (*src) * scalar;
    }
 }
 #endif /* LV_HAVE_SSE3 */
 #ifdef LV_HAVE_GENERIC
 /*!
  \brief Multiplies every complex element of aVector by a complex scalar, portable C implementation
  \param cVector The vector where the results will be stored
  \param aVector The vector to be multiplied
  \param scalar The complex scalar to multiply aVector
  \param num_points The number of complex values in aVector to be multiplied by scalar and stored into cVector
 */
 static inline void volk_gnsssdr_32fc_s32fc_multiply_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
    // Number of points covered by the eight-way unrolled loop
    const unsigned int unrolledPoints = num_points - (num_points % 8);
    unsigned int idx = 0;
    // Main body, manually unrolled by eight
    for(; idx < unrolledPoints; idx += 8){
      cVector[idx]     = aVector[idx]     * scalar;
      cVector[idx + 1] = aVector[idx + 1] * scalar;
      cVector[idx + 2] = aVector[idx + 2] * scalar;
      cVector[idx + 3] = aVector[idx + 3] * scalar;
      cVector[idx + 4] = aVector[idx + 4] * scalar;
      cVector[idx + 5] = aVector[idx + 5] * scalar;
      cVector[idx + 6] = aVector[idx + 6] * scalar;
      cVector[idx + 7] = aVector[idx + 7] * scalar;
    }
    // Remaining 0..7 points
    for(; idx < num_points; idx++){
      cVector[idx] = aVector[idx] * scalar;
    }
 }
 #endif /* LV_HAVE_GENERIC */
 #endif /* INCLUDED_volk_gnsssdr_32fc_s32fc_multiply_32fc_u_H */
 #ifndef INCLUDED_volk_gnsssdr_32fc_s32fc_multiply_32fc_a_H
 #define INCLUDED_volk_gnsssdr_32fc_s32fc_multiply_32fc_a_H
 #include <inttypes.h>
 #include <stdio.h>
 #include <volk_gnsssdr/volk_gnsssdr_complex.h>
 #include <float.h>
 #ifdef LV_HAVE_SSE3
 #include <pmmintrin.h>
  /*!
    \brief Multiplies every complex element of aVector by a complex scalar (SSE3, aligned memory access)
    \param cVector The vector where the results will be stored
    \param aVector The vector to be multiplied
    \param scalar The complex scalar to multiply aVector
    \param num_points The number of complex values in aVector to be multiplied by scalar and stored into cVector
  */
 static inline void volk_gnsssdr_32fc_s32fc_multiply_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
    const unsigned int pairCount = num_points / 2;
    // Broadcast the real and the imaginary part of the scalar (sr, si) to all four lanes
    const __m128 scalarReal = _mm_set_ps1(lv_creal(scalar));
    const __m128 scalarImag = _mm_set_ps1(lv_cimag(scalar));
    const lv_32fc_t* src = aVector;
    lv_32fc_t* dst = cVector;
    unsigned int pair;
    // Two complex values (ar,ai,br,bi) per iteration; aligned load/store need 16-byte aligned addresses
    for(pair = 0; pair < pairCount; pair++){
      const __m128 samples = _mm_load_ps((const float*)src);
      const __m128 swapped = _mm_shuffle_ps(samples, samples, 0xB1); // ai,ar,bi,br
      const __m128 realTerms = _mm_mul_ps(samples, scalarReal);      // ar*sr,ai*sr,br*sr,bi*sr
      const __m128 imagTerms = _mm_mul_ps(swapped, scalarImag);      // ai*si,ar*si,bi*si,br*si
      // addsub: subtract in even lanes, add in odd lanes -> ar*sr-ai*si, ai*sr+ar*si, ...
      _mm_store_ps((float*)dst, _mm_addsub_ps(realTerms, imagTerms));
      src += 2;
      dst += 2;
    }
    // One complex value is left over when num_points is odd
    if((num_points % 2) != 0) {
      *dst = (*src) * scalar;
    }
 }
 #endif /* LV_HAVE_SSE3 */
 #ifdef LV_HAVE_GENERIC
  /*!
    \brief Multiplies every complex element of aVector by a complex scalar, portable C implementation
    \param cVector The vector where the results will be stored
    \param aVector The vector to be multiplied
    \param scalar The complex scalar to multiply aVector
    \param num_points The number of complex values in aVector to be multiplied by scalar and stored into cVector
  */
 static inline void volk_gnsssdr_32fc_s32fc_multiply_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
    // Number of points covered by the eight-way unrolled loop
    const unsigned int unrolledPoints = num_points - (num_points % 8);
    unsigned int idx = 0;
    // Main body, manually unrolled by eight
    for(; idx < unrolledPoints; idx += 8){
      cVector[idx]     = aVector[idx]     * scalar;
      cVector[idx + 1] = aVector[idx + 1] * scalar;
      cVector[idx + 2] = aVector[idx + 2] * scalar;
      cVector[idx + 3] = aVector[idx + 3] * scalar;
      cVector[idx + 4] = aVector[idx + 4] * scalar;
      cVector[idx + 5] = aVector[idx + 5] * scalar;
      cVector[idx + 6] = aVector[idx + 6] * scalar;
      cVector[idx + 7] = aVector[idx + 7] * scalar;
    }
    // Remaining 0..7 points
    for(; idx < num_points; idx++){
      cVector[idx] = aVector[idx] * scalar;
    }
 }
 #endif /* LV_HAVE_GENERIC */
 #endif /* INCLUDED_volk_gnsssdr_32fc_s32fc_multiply_32fc_a_H */
--- a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_dot_prod_32fc.h
+++ b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_dot_prod_32fc.h
@@ -1,759 +0,0 @@
 #ifndef INCLUDED_volk_gnsssdr_32fc_x2_dot_prod_32fc_u_H
 #define INCLUDED_volk_gnsssdr_32fc_x2_dot_prod_32fc_u_H
 #include <volk_gnsssdr/volk_gnsssdr_common.h>
 #include <volk_gnsssdr/volk_gnsssdr_complex.h>
 #include <stdio.h>
 #include <string.h>
 #ifdef LV_HAVE_GENERIC
 /*!
  \brief Complex dot product sum(input[i] * taps[i]), portable C implementation
  \param result Location receiving the single complex result
  \param input First complex input vector
  \param taps Second complex input vector
  \param num_points The number of complex values in input and taps
 */
 static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
  // Both buffers are walked as interleaved floats: re0,im0,re1,im1,...
  const float* x = (const float*) input;
  const float* t = (const float*) taps;
  float* out = (float*) result;
  const unsigned int pairCount = num_points/2;
  // Two independent accumulators: one for even-indexed, one for odd-indexed points
  float evenRe = 0, evenIm = 0;
  float oddRe = 0, oddIm = 0;
  unsigned int pair;
  for(pair = 0; pair < pairCount; ++pair) {
    evenRe += x[0] * t[0] - x[1] * t[1];
    evenIm += x[0] * t[1] + x[1] * t[0];
    oddRe += x[2] * t[2] - x[3] * t[3];
    oddIm += x[2] * t[3] + x[3] * t[2];
    x += 4;
    t += 4;
  }
  out[0] = evenRe + oddRe;
  out[1] = evenIm + oddIm;
  // The last point is not covered by the pair loop when num_points is odd
  if(num_points & 1) {
    *result += input[num_points - 1] * taps[num_points - 1];
  }
 }
 #endif /*LV_HAVE_GENERIC*/
 #if LV_HAVE_SSE && LV_HAVE_64
 /*!
  \brief Complex dot product sum(input[i] * taps[i]) using hand-written x86-64 SSE assembly with unaligned loads
  \param result Location receiving the single complex result
  \param input First complex input vector
  \param taps Second complex input vector
  \param num_points The number of complex values in input and taps

  The assembly consumes the data in blocks of two complex values; a trailing
  odd point is added in C after the asm statement.

  NOTE(review): the first movups pair runs before the loop counter is tested and
  the loop preloads the next block, so up to 16 bytes at/after the end of each
  buffer are read even when they are not used - confirm callers tolerate this.
 */
 static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_u_sse_64(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
  // 64-bit byte count: the asm reads the full %rcx register, and a 32-bit
  // num_points*8 could wrap for very large inputs.
  const unsigned long long num_bytes = (unsigned long long)num_points*8;
  unsigned int isodd = num_points & 1;
  asm
    (
     "#  ccomplex_dotprod_generic (float* result, const float *input,\n\t"
     "#                         const float *taps, unsigned num_bytes)\n\t"
     "#    float sum0 = 0;\n\t"
     "#    float sum1 = 0;\n\t"
     "#    float sum2 = 0;\n\t"
     "#    float sum3 = 0;\n\t"
     "#    do {\n\t"
     "#      sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t"
     "#      sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t"
     "#      sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t"
     "#      sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t"
     "#      input += 4;\n\t"
     "#      taps += 4;  \n\t"
     "#    } while (--n_2_ccomplex_blocks != 0);\n\t"
     "#    result[0] = sum0 + sum2;\n\t"
     "#    result[1] = sum1 + sum3;\n\t"
     "# TODO: prefetch and better scheduling\n\t"
     "  xor    %%r9,  %%r9\n\t"
     "  xor    %%r10, %%r10\n\t"
     "  movq   %%rcx, %%rax\n\t"
     "  movq   %%rcx, %%r8\n\t"
     "  movq   %[rsi],  %%r9\n\t"
     "  movq   %[rdx], %%r10\n\t"
     "	xorps	%%xmm6, %%xmm6		# zero accumulators\n\t"
     "	movups	0(%%r9), %%xmm0\n\t"
     "	xorps	%%xmm7, %%xmm7		# zero accumulators\n\t"
     "	movups	0(%%r10), %%xmm2\n\t"
     "	shr	$5, %%rax		# rax = n_2_ccomplex_blocks / 2\n\t"
     "  shr     $4, %%r8\n\t"
     "	jmp	.%=L1_test\n\t"
     "	# 4 taps / loop\n\t"
     "	# something like ?? cycles / loop\n\t"
     ".%=Loop1:	\n\t"
     "# complex prod: C += A * B,  w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
     "#	movups	(%%r9), %%xmmA\n\t"
     "#	movups	(%%r10), %%xmmB\n\t"
     "#	movups	%%xmmA, %%xmmZ\n\t"
     "#	shufps	$0xb1, %%xmmZ, %%xmmZ	# swap internals\n\t"
     "#	mulps	%%xmmB, %%xmmA\n\t"
     "#	mulps	%%xmmZ, %%xmmB\n\t"
     "#	# SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
     "#	xorps	%%xmmPN, %%xmmA\n\t"
     "#	movups	%%xmmA, %%xmmZ\n\t"
     "#	unpcklps %%xmmB, %%xmmA\n\t"
     "#	unpckhps %%xmmB, %%xmmZ\n\t"
     "#	movups	%%xmmZ, %%xmmY\n\t"
     "#	shufps	$0x44, %%xmmA, %%xmmZ	# b01000100\n\t"
     "#	shufps	$0xee, %%xmmY, %%xmmA	# b11101110\n\t"
     "#	addps	%%xmmZ, %%xmmA\n\t"
     "#	addps	%%xmmA, %%xmmC\n\t"
     "# A=xmm0, B=xmm2, Z=xmm4\n\t"
     "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
     "	movups	16(%%r9), %%xmm1\n\t"
     "	movups	%%xmm0, %%xmm4\n\t"
     "	mulps	%%xmm2, %%xmm0\n\t"
     "	shufps	$0xb1, %%xmm4, %%xmm4	# swap internals\n\t"
     "	movups	16(%%r10), %%xmm3\n\t"
     "	movups	%%xmm1, %%xmm5\n\t"
     "	addps	%%xmm0, %%xmm6\n\t"
     "	mulps	%%xmm3, %%xmm1\n\t"
     "	shufps	$0xb1, %%xmm5, %%xmm5	# swap internals\n\t"
     "	addps	%%xmm1, %%xmm6\n\t"
     "	mulps	%%xmm4, %%xmm2\n\t"
     "	movups	32(%%r9), %%xmm0\n\t"
     "	addps	%%xmm2, %%xmm7\n\t"
     "	mulps	%%xmm5, %%xmm3\n\t"
     "	add	$32, %%r9\n\t"
     "	movups	32(%%r10), %%xmm2\n\t"
     "	addps	%%xmm3, %%xmm7\n\t"
     "	add	$32, %%r10\n\t"
     ".%=L1_test:\n\t"
     "	dec	%%rax\n\t"
     "	jge	.%=Loop1\n\t"
     "	# We've handled the bulk of multiplies up to here.\n\t"
     "	# Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
     "	# If so, we've got 2 more taps to do.\n\t"
     "	and	$1, %%r8\n\t"
     "	je	.%=Leven\n\t"
     "	# The count was odd, do 2 more taps.\n\t"
     "	# Note that we've already got mm0/mm2 preloaded\n\t"
     "	# from the main loop.\n\t"
     "	movups	%%xmm0, %%xmm4\n\t"
     "	mulps	%%xmm2, %%xmm0\n\t"
     "	shufps	$0xb1, %%xmm4, %%xmm4	# swap internals\n\t"
     "	addps	%%xmm0, %%xmm6\n\t"
     "	mulps	%%xmm4, %%xmm2\n\t"
     "	addps	%%xmm2, %%xmm7\n\t"
     ".%=Leven:\n\t"
     "	# neg inversor\n\t"
     "	xorps	%%xmm1, %%xmm1\n\t"
     "	mov	$0x80000000, %%r9\n\t"
     "	movd	%%r9, %%xmm1\n\t"
     "	shufps	$0x11, %%xmm1, %%xmm1	# b00010001 # 0 -0 0 -0\n\t"
     "	# pfpnacc\n\t"
     "	xorps	%%xmm1, %%xmm6\n\t"
     "	movups	%%xmm6, %%xmm2\n\t"
     "	unpcklps %%xmm7, %%xmm6\n\t"
     "	unpckhps %%xmm7, %%xmm2\n\t"
     "	movups	%%xmm2, %%xmm3\n\t"
     "	shufps	$0x44, %%xmm6, %%xmm2	# b01000100\n\t"
     "	shufps	$0xee, %%xmm3, %%xmm6	# b11101110\n\t"
     "	addps	%%xmm2, %%xmm6\n\t"
     "					# xmm6 = r1 i2 r3 i4\n\t"
     "	movhlps	%%xmm6, %%xmm4		# xmm4 = r3 i4 ?? ??\n\t"
     "	addps	%%xmm4, %%xmm6		# xmm6 = r1+r3 i2+i4 ?? ??\n\t"
     "	movlps	%%xmm6, (%[rdi])		# store low 2x32 bits (complex) to memory\n\t"
     :
     :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result)
     // Every register the asm overwrites must be listed; "memory" because the
     // asm reads input/taps and stores to *result through pointers; "cc"
     // because xor/shr/dec/and modify the flags.
     :"rax", "r8", "r9", "r10",
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
      "memory", "cc"
     );
  // The asm only handles whole blocks of two complex points
  if(isodd) {
    *result += input[num_points - 1] * taps[num_points - 1];
  }
  return;
 }
 #endif /* LV_HAVE_SSE && LV_HAVE_64 */
 #ifdef LV_HAVE_SSE3
 #include <pmmintrin.h>
 /*!
   \brief Complex dot product of two vectors using SSE3 with unaligned loads
   \param result Location that receives the accumulated complex sum
   \param input First complex vector
   \param taps Second complex vector
   \param num_points Number of complex samples in input and taps
 */
 static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_u_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
  const unsigned int pair_count = num_points >> 1; // two complex samples fit in one __m128
  const unsigned int has_tail = num_points & 1;
  const lv_32fc_t* in_ptr = input;
  const lv_32fc_t* tap_ptr = taps;
  __m128 acc = _mm_setzero_ps();
  lv_32fc_t total;
  unsigned int k;

  memset(&total, 0x0, 2*sizeof(float));

  for(k = 0; k < pair_count; ++k) {
    const __m128 in_vec = _mm_loadu_ps((float*)in_ptr);   // ar,ai,br,bi
    const __m128 tap_vec = _mm_loadu_ps((float*)tap_ptr); // cr,ci,dr,di
    const __m128 tap_re = _mm_moveldup_ps(tap_vec);       // cr,cr,dr,dr
    const __m128 tap_im = _mm_movehdup_ps(tap_vec);       // ci,ci,di,di
    const __m128 direct = _mm_mul_ps(in_vec, tap_re);     // ar*cr,ai*cr,br*dr,bi*dr
    const __m128 in_swapped = _mm_shuffle_ps(in_vec, in_vec, 0xB1); // ai,ar,bi,br
    const __m128 cross = _mm_mul_ps(in_swapped, tap_im);  // ai*ci,ar*ci,bi*di,br*di

    // addsub yields the two complex products; fold them into the running sums
    acc = _mm_add_ps(acc, _mm_addsub_ps(direct, cross));

    in_ptr += 2;
    tap_ptr += 2;
  }

  // Spill the two partial complex sums and combine them in scalar code
  __VOLK_ATTR_ALIGNED(16) lv_32fc_t lanes[2];
  _mm_storeu_ps((float*)lanes, acc);
  total += ( lanes[0] + lanes[1] );

  // An odd sample count leaves one product that the vector loop did not cover
  if(has_tail != 0) {
    total += input[num_points - 1] * taps[num_points - 1];
  }

  *result = total;
 }
 #endif /*LV_HAVE_SSE3*/
 #ifdef LV_HAVE_SSE4_1
 #include <smmintrin.h>
 /*!
   \brief Complex dot product of two vectors using SSE4.1 dot-product instructions with unaligned loads
   \param result Location that receives the accumulated complex sum
   \param input First complex vector
   \param taps Second complex vector
   \param num_points Number of complex samples in input and taps
 */
 static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_u_sse4_1(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
  // Holds 0x80000000 in its first element; XORed below to flip the sign of the lowest float lane
  static const __m128i neg = {0x000000000000000080000000};
  const unsigned int quad_count = num_points >> 2; // four complex samples per iteration
  const unsigned int leftover = num_points & 3;    // samples handled by the scalar tail
  float* in_ptr = (float*)input;
  float* tap_ptr = (float*)taps;
  __m128 sum_rr = _mm_setzero_ps(); // sum of re(input)*re(taps), kept in lane 0
  __m128 sum_ii = _mm_setzero_ps(); // sum of im(input)*im(taps), kept in lane 0
  __m128 sum_ri = _mm_setzero_ps(); // sum of re(input)*im(taps), kept in lane 1
  __m128 sum_ir = _mm_setzero_ps(); // sum of im(input)*re(taps), kept in lane 1
  unsigned int n;

  for(n = 0; n < quad_count; n++) {
    // Fetch four interleaved complex samples from each stream
    const __m128 in_a = _mm_loadu_ps(in_ptr);
    const __m128 tap_a = _mm_loadu_ps(tap_ptr);
    const __m128 in_b = _mm_loadu_ps(in_ptr + 4);
    const __m128 tap_b = _mm_loadu_ps(tap_ptr + 4);

    // First de-interleave stage
    const __m128 in_hi = _mm_unpackhi_ps(in_a, in_b);
    const __m128 tap_hi = _mm_unpackhi_ps(tap_a, tap_b);
    const __m128 in_lo = _mm_unpacklo_ps(in_a, in_b);
    const __m128 tap_lo = _mm_unpacklo_ps(tap_a, tap_b);

    // Second stage: separate real and imaginary vectors for both streams
    const __m128 in_im = _mm_unpackhi_ps(in_lo, in_hi);
    const __m128 in_re = _mm_unpacklo_ps(in_lo, in_hi);
    const __m128 tap_im = _mm_unpackhi_ps(tap_lo, tap_hi);
    const __m128 tap_re = _mm_unpacklo_ps(tap_lo, tap_hi);

    // Mask 0xf1 deposits the 4-element dot product in lane 0, mask 0xf2 in lane 1
    sum_rr = _mm_add_ps(_mm_dp_ps(in_re, tap_re, 0xf1), sum_rr);
    sum_ii = _mm_add_ps(_mm_dp_ps(in_im, tap_im, 0xf1), sum_ii);
    sum_ri = _mm_add_ps(_mm_dp_ps(in_re, tap_im, 0xf2), sum_ri);
    sum_ir = _mm_add_ps(_mm_dp_ps(in_im, tap_re, 0xf2), sum_ir);

    in_ptr += 8;
    tap_ptr += 8;
  }

  // real = rr - ii (via sign flip), imag = ri + ir; merge both into one register
  sum_ii = _mm_xor_ps(sum_ii, bit128_p(&neg)->float_vec);
  sum_ri = _mm_add_ps(sum_ri, sum_ir);
  sum_rr = _mm_add_ps(sum_rr, sum_ii);
  sum_ri = _mm_add_ps(sum_ri, sum_rr);

  // The low two floats are written out as the complex result
  _mm_storel_pi((__m64*)result, sum_ri);

  // Scalar clean-up for the samples that did not fill a group of four
  for(n = num_points - leftover; n < num_points; n++) {
    *result += input[n] * taps[n];
  }
 }
 #endif /*LV_HAVE_SSE4_1*/
 #endif /*INCLUDED_volk_gnsssdr_32fc_x2_dot_prod_32fc_u_H*/
 #ifndef INCLUDED_volk_gnsssdr_32fc_x2_dot_prod_32fc_a_H
 #define INCLUDED_volk_gnsssdr_32fc_x2_dot_prod_32fc_a_H
 #include <volk_gnsssdr/volk_gnsssdr_common.h>
 #include <volk_gnsssdr/volk_gnsssdr_complex.h>
 #include <stdio.h>
 #include <string.h>
 #ifdef LV_HAVE_GENERIC
 /*!
   \brief Complex dot product of two vectors, portable scalar implementation
   \param result Location that receives the accumulated complex sum
   \param input First complex vector
   \param taps Second complex vector
   \param num_points Number of complex samples in input and taps
 */
 static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_a_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
  float * res = (float*) result;
  // The vectors are walked as interleaved floats: re0, im0, re1, im1, ...
  const float * in = (const float*) input;
  const float * tp = (const float*) taps;
  // Number of 2-sample blocks. Computed from num_points directly: the former
  // (num_points*8) >> 4 wrapped in 32-bit arithmetic once num_points reached 2^29.
  const unsigned int n_2_ccomplex_blocks = num_points / 2;
  const unsigned int isodd = num_points & 1;
  // Two independent accumulators, one per sample of each block
  float sum0[2] = {0,0};
  float sum1[2] = {0,0};
  unsigned int i = 0;
  for(i = 0; i < n_2_ccomplex_blocks; ++i) {
    sum0[0] += in[0] * tp[0] - in[1] * tp[1];
    sum0[1] += in[0] * tp[1] + in[1] * tp[0];
    sum1[0] += in[2] * tp[2] - in[3] * tp[3];
    sum1[1] += in[2] * tp[3] + in[3] * tp[2];
    in += 4;
    tp += 4;
  }
  res[0] = sum0[0] + sum1[0];
  res[1] = sum0[1] + sum1[1];
  // An odd sample count leaves one final product outside the blocked loop
  if(isodd) {
    *result += input[num_points - 1] * taps[num_points - 1];
  }
 }
 #endif /*LV_HAVE_GENERIC*/
 #if LV_HAVE_SSE && LV_HAVE_64
 /*!
   \brief Complex dot product of two vectors, hand-written x86-64 SSE assembly using aligned loads (movaps)
   \param result Location that receives the accumulated complex sum
   \param input First complex vector
   \param taps Second complex vector
   \param num_points Number of complex samples in input and taps

   NOTE(review): the assembly preloads 16 bytes from input and taps before testing the
   loop count and fetches 32(%r9)/32(%r10) ahead at the end of every iteration, so it
   appears to read up to 16 bytes past the data it actually uses -- confirm that callers
   provide readable padding.
 */
 static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_a_sse_64(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
  // 64-bit on purpose: the assembly reads the whole of %rcx, so the operand must fill
  // the register; the wide multiply also cannot wrap for large num_points.
  const unsigned long long num_bytes = (unsigned long long)num_points * 8;
  const unsigned int isodd = num_points & 1;
  asm volatile
    (
     "#  ccomplex_dotprod_generic (float* result, const float *input,\n\t"
     "#                         const float *taps, unsigned num_bytes)\n\t"
     "#    float sum0 = 0;\n\t"
     "#    float sum1 = 0;\n\t"
     "#    float sum2 = 0;\n\t"
     "#    float sum3 = 0;\n\t"
     "#    do {\n\t"
     "#      sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t"
     "#      sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t"
     "#      sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t"
     "#      sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t"
     "#      input += 4;\n\t"
     "#      taps += 4;  \n\t"
     "#    } while (--n_2_ccomplex_blocks != 0);\n\t"
     "#    result[0] = sum0 + sum2;\n\t"
     "#    result[1] = sum1 + sum3;\n\t"
     "# TODO: prefetch and better scheduling\n\t"
     "  xor    %%r9,  %%r9\n\t"
     "  xor    %%r10, %%r10\n\t"
     "  movq   %%rcx, %%rax\n\t"
     "  movq   %%rcx, %%r8\n\t"
     "  movq   %[rsi],  %%r9\n\t"
     "  movq   %[rdx], %%r10\n\t"
     "	xorps	%%xmm6, %%xmm6		# zero accumulators\n\t"
     "	movaps	0(%%r9), %%xmm0\n\t"
     "	xorps	%%xmm7, %%xmm7		# zero accumulators\n\t"
     "	movaps	0(%%r10), %%xmm2\n\t"
     "	shr	$5, %%rax		# rax = n_2_ccomplex_blocks / 2\n\t"
     "  shr     $4, %%r8\n\t"
     "	jmp	.%=L1_test\n\t"
     "	# 4 taps / loop\n\t"
     "	# something like ?? cycles / loop\n\t"
     ".%=Loop1:	\n\t"
     "# complex prod: C += A * B,  w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
     "#	movaps	(%%r9), %%xmmA\n\t"
     "#	movaps	(%%r10), %%xmmB\n\t"
     "#	movaps	%%xmmA, %%xmmZ\n\t"
     "#	shufps	$0xb1, %%xmmZ, %%xmmZ	# swap internals\n\t"
     "#	mulps	%%xmmB, %%xmmA\n\t"
     "#	mulps	%%xmmZ, %%xmmB\n\t"
     "#	# SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
     "#	xorps	%%xmmPN, %%xmmA\n\t"
     "#	movaps	%%xmmA, %%xmmZ\n\t"
     "#	unpcklps %%xmmB, %%xmmA\n\t"
     "#	unpckhps %%xmmB, %%xmmZ\n\t"
     "#	movaps	%%xmmZ, %%xmmY\n\t"
     "#	shufps	$0x44, %%xmmA, %%xmmZ	# b01000100\n\t"
     "#	shufps	$0xee, %%xmmY, %%xmmA	# b11101110\n\t"
     "#	addps	%%xmmZ, %%xmmA\n\t"
     "#	addps	%%xmmA, %%xmmC\n\t"
     "# A=xmm0, B=xmm2, Z=xmm4\n\t"
     "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
     "	movaps	16(%%r9), %%xmm1\n\t"
     "	movaps	%%xmm0, %%xmm4\n\t"
     "	mulps	%%xmm2, %%xmm0\n\t"
     "	shufps	$0xb1, %%xmm4, %%xmm4	# swap internals\n\t"
     "	movaps	16(%%r10), %%xmm3\n\t"
     "	movaps	%%xmm1, %%xmm5\n\t"
     "	addps	%%xmm0, %%xmm6\n\t"
     "	mulps	%%xmm3, %%xmm1\n\t"
     "	shufps	$0xb1, %%xmm5, %%xmm5	# swap internals\n\t"
     "	addps	%%xmm1, %%xmm6\n\t"
     "	mulps	%%xmm4, %%xmm2\n\t"
     "	movaps	32(%%r9), %%xmm0\n\t"
     "	addps	%%xmm2, %%xmm7\n\t"
     "	mulps	%%xmm5, %%xmm3\n\t"
     "	add	$32, %%r9\n\t"
     "	movaps	32(%%r10), %%xmm2\n\t"
     "	addps	%%xmm3, %%xmm7\n\t"
     "	add	$32, %%r10\n\t"
     ".%=L1_test:\n\t"
     "	dec	%%rax\n\t"
     "	jge	.%=Loop1\n\t"
     "	# We've handled the bulk of multiplies up to here.\n\t"
     "	# Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
     "	# If so, we've got 2 more taps to do.\n\t"
     "	and	$1, %%r8\n\t"
     "	je	.%=Leven\n\t"
     "	# The count was odd, do 2 more taps.\n\t"
     "	# Note that we've already got mm0/mm2 preloaded\n\t"
     "	# from the main loop.\n\t"
     "	movaps	%%xmm0, %%xmm4\n\t"
     "	mulps	%%xmm2, %%xmm0\n\t"
     "	shufps	$0xb1, %%xmm4, %%xmm4	# swap internals\n\t"
     "	addps	%%xmm0, %%xmm6\n\t"
     "	mulps	%%xmm4, %%xmm2\n\t"
     "	addps	%%xmm2, %%xmm7\n\t"
     ".%=Leven:\n\t"
     "	# neg inversor\n\t"
     "	xorps	%%xmm1, %%xmm1\n\t"
     "	mov	$0x80000000, %%r9\n\t"
     "	movd	%%r9, %%xmm1\n\t"
     "	shufps	$0x11, %%xmm1, %%xmm1	# b00010001 # 0 -0 0 -0\n\t"
     "	# pfpnacc\n\t"
     "	xorps	%%xmm1, %%xmm6\n\t"
     "	movaps	%%xmm6, %%xmm2\n\t"
     "	unpcklps %%xmm7, %%xmm6\n\t"
     "	unpckhps %%xmm7, %%xmm2\n\t"
     "	movaps	%%xmm2, %%xmm3\n\t"
     "	shufps	$0x44, %%xmm6, %%xmm2	# b01000100\n\t"
     "	shufps	$0xee, %%xmm3, %%xmm6	# b11101110\n\t"
     "	addps	%%xmm2, %%xmm6\n\t"
     "					# xmm6 = r1 i2 r3 i4\n\t"
     "	movhlps	%%xmm6, %%xmm4		# xmm4 = r3 i4 ?? ??\n\t"
     "	addps	%%xmm4, %%xmm6		# xmm6 = r1+r3 i2+i4 ?? ??\n\t"
     "	movlps	%%xmm6, (%[rdi])		# store low 2x32 bits (complex) to memory\n\t"
     :
     :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result)
     // Everything the assembly overwrites must be declared: the scratch GPRs, all eight
     // SSE registers it uses, the flags (xor/shr/dec/and), and memory (store via result).
     :"rax", "r8", "r9", "r10",
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
      "cc", "memory"
     );
  // The assembly handles samples in pairs; an odd count leaves one product to add here
  if(isodd) {
    *result += input[num_points - 1] * taps[num_points - 1];
  }
  return;
 }
 #endif
 #if LV_HAVE_SSE && LV_HAVE_32
 /*!
   \brief Complex dot product of two vectors for 32-bit SSE builds
   \param result Location that receives the accumulated complex sum
   \param input First complex vector
   \param taps Second complex vector
   \param num_points Number of complex samples in input and taps

   The work is forwarded to the generic implementation. The hand-written 32-bit
   assembly that follows is compiled out by the #if 0 guard.
 */
 static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_a_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
  volk_gnsssdr_32fc_x2_dot_prod_32fc_a_generic(result, input, taps, num_points);
  // Disabled code below: it declares no asm operands and instead fetches result, input,
  // taps and the byte count from fixed offsets relative to %ebp (8, 12, 16, 20).
 #if 0
  const unsigned int num_bytes = num_points*8;
  unsigned int isodd = num_points & 1;
  asm volatile
    (
     "	#pushl	%%ebp\n\t"
     "	#movl	%%esp, %%ebp\n\t"
     "	movl	12(%%ebp), %%eax		# input\n\t"
     "	movl	16(%%ebp), %%edx		# taps\n\t"
     "	movl	20(%%ebp), %%ecx                # n_bytes\n\t"
     "	xorps	%%xmm6, %%xmm6		# zero accumulators\n\t"
     "	movaps	0(%%eax), %%xmm0\n\t"
     "	xorps	%%xmm7, %%xmm7		# zero accumulators\n\t"
     "	movaps	0(%%edx), %%xmm2\n\t"
     "	shrl	$5, %%ecx		# ecx = n_2_ccomplex_blocks / 2\n\t"
     "	jmp	.%=L1_test\n\t"
     "	# 4 taps / loop\n\t"
     "	# something like ?? cycles / loop\n\t"
     ".%=Loop1:	\n\t"
     "# complex prod: C += A * B,  w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
     "#	movaps	(%%eax), %%xmmA\n\t"
     "#	movaps	(%%edx), %%xmmB\n\t"
     "#	movaps	%%xmmA, %%xmmZ\n\t"
     "#	shufps	$0xb1, %%xmmZ, %%xmmZ	# swap internals\n\t"
     "#	mulps	%%xmmB, %%xmmA\n\t"
     "#	mulps	%%xmmZ, %%xmmB\n\t"
     "#	# SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
     "#	xorps	%%xmmPN, %%xmmA\n\t"
     "#	movaps	%%xmmA, %%xmmZ\n\t"
     "#	unpcklps %%xmmB, %%xmmA\n\t"
     "#	unpckhps %%xmmB, %%xmmZ\n\t"
     "#	movaps	%%xmmZ, %%xmmY\n\t"
     "#	shufps	$0x44, %%xmmA, %%xmmZ	# b01000100\n\t"
     "#	shufps	$0xee, %%xmmY, %%xmmA	# b11101110\n\t"
     "#	addps	%%xmmZ, %%xmmA\n\t"
     "#	addps	%%xmmA, %%xmmC\n\t"
     "# A=xmm0, B=xmm2, Z=xmm4\n\t"
     "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
     "	movaps	16(%%eax), %%xmm1\n\t"
     "	movaps	%%xmm0, %%xmm4\n\t"
     "	mulps	%%xmm2, %%xmm0\n\t"
     "	shufps	$0xb1, %%xmm4, %%xmm4	# swap internals\n\t"
     "	movaps	16(%%edx), %%xmm3\n\t"
     "	movaps	%%xmm1, %%xmm5\n\t"
     "	addps	%%xmm0, %%xmm6\n\t"
     "	mulps	%%xmm3, %%xmm1\n\t"
     "	shufps	$0xb1, %%xmm5, %%xmm5	# swap internals\n\t"
     "	addps	%%xmm1, %%xmm6\n\t"
     "	mulps	%%xmm4, %%xmm2\n\t"
     "	movaps	32(%%eax), %%xmm0\n\t"
     "	addps	%%xmm2, %%xmm7\n\t"
     "	mulps	%%xmm5, %%xmm3\n\t"
     "	addl	$32, %%eax\n\t"
     "	movaps	32(%%edx), %%xmm2\n\t"
     "	addps	%%xmm3, %%xmm7\n\t"
     "	addl	$32, %%edx\n\t"
     ".%=L1_test:\n\t"
     "	decl	%%ecx\n\t"
     "	jge	.%=Loop1\n\t"
     "	# We've handled the bulk of multiplies up to here.\n\t"
     "	# Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
     "	# If so, we've got 2 more taps to do.\n\t"
     "	movl	20(%%ebp), %%ecx		# n_2_ccomplex_blocks\n\t"
     "  shrl    $4, %%ecx\n\t"
     "	andl	$1, %%ecx\n\t"
     "	je	.%=Leven\n\t"
     "	# The count was odd, do 2 more taps.\n\t"
     "	# Note that we've already got mm0/mm2 preloaded\n\t"
     "	# from the main loop.\n\t"
     "	movaps	%%xmm0, %%xmm4\n\t"
     "	mulps	%%xmm2, %%xmm0\n\t"
     "	shufps	$0xb1, %%xmm4, %%xmm4	# swap internals\n\t"
     "	addps	%%xmm0, %%xmm6\n\t"
     "	mulps	%%xmm4, %%xmm2\n\t"
     "	addps	%%xmm2, %%xmm7\n\t"
     ".%=Leven:\n\t"
     "	# neg inversor\n\t"
     "  movl 8(%%ebp), %%eax \n\t"
     "	xorps	%%xmm1, %%xmm1\n\t"
     "  movl	$0x80000000, (%%eax)\n\t"
     "	movss	(%%eax), %%xmm1\n\t"
     "	shufps	$0x11, %%xmm1, %%xmm1	# b00010001 # 0 -0 0 -0\n\t"
     "	# pfpnacc\n\t"
     "	xorps	%%xmm1, %%xmm6\n\t"
     "	movaps	%%xmm6, %%xmm2\n\t"
     "	unpcklps %%xmm7, %%xmm6\n\t"
     "	unpckhps %%xmm7, %%xmm2\n\t"
     "	movaps	%%xmm2, %%xmm3\n\t"
     "	shufps	$0x44, %%xmm6, %%xmm2	# b01000100\n\t"
     "	shufps	$0xee, %%xmm3, %%xmm6	# b11101110\n\t"
     "	addps	%%xmm2, %%xmm6\n\t"
     "					# xmm6 = r1 i2 r3 i4\n\t"
     "	#movl	8(%%ebp), %%eax		# @result\n\t"
     "	movhlps	%%xmm6, %%xmm4		# xmm4 = r3 i4 ?? ??\n\t"
     "	addps	%%xmm4, %%xmm6		# xmm6 = r1+r3 i2+i4 ?? ??\n\t"
     "	movlps	%%xmm6, (%%eax)		# store low 2x32 bits (complex) to memory\n\t"
     "	#popl	%%ebp\n\t"
     :
     :
     : "eax", "ecx", "edx"
     );
  int getem = num_bytes % 16;
  if(isodd) {
    *result += (input[num_points - 1] * taps[num_points - 1]);
  }
  return;
 #endif
 }
 #endif /*LV_HAVE_SSE*/
 #ifdef LV_HAVE_SSE3
 #include <pmmintrin.h>
 /*!
   \brief Complex dot product of two vectors using SSE3 with aligned loads
   \param result Location that receives the accumulated complex sum
   \param input First complex vector; read with _mm_load_ps, so it must be 16-byte aligned
   \param taps Second complex vector; read with _mm_load_ps, so it must be 16-byte aligned
   \param num_points Number of complex samples in input and taps
 */
 static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_a_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
  const unsigned int isodd = num_points & 1;
  lv_32fc_t dotProduct;
  memset(&dotProduct, 0x0, 2*sizeof(float));
  unsigned int number = 0;
  // Two complex samples per vector iteration. Derived from num_points directly: the
  // former (num_points*8) >> 4 wrapped in 32-bit arithmetic once num_points reached 2^29.
  const unsigned int halfPoints = num_points / 2;
  __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
  const lv_32fc_t* a = input;
  const lv_32fc_t* b = taps;
  dotProdVal = _mm_setzero_ps();
  for(;number < halfPoints; number++){
    x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
    y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
    yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
    yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
    tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
    x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
    tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
    z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
    dotProdVal = _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together
    a += 2;
    b += 2;
  }
  __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2];
  _mm_store_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector
  dotProduct += ( dotProductVector[0] + dotProductVector[1] );
  // An odd sample count leaves one product that the vector loop did not cover
  if(isodd) {
    dotProduct += input[num_points - 1] * taps[num_points - 1];
  }
  *result = dotProduct;
 }
 #endif /*LV_HAVE_SSE3*/
 #ifdef LV_HAVE_SSE4_1
 #include <smmintrin.h>
 /*!
   \brief Complex dot product of two vectors using SSE4.1 dot-product instructions with aligned loads
   \param result Location that receives the accumulated complex sum
   \param input First complex vector; read with _mm_load_ps, so it must be 16-byte aligned
   \param taps Second complex vector; read with _mm_load_ps, so it must be 16-byte aligned
   \param num_points Number of complex samples in input and taps
 */
 static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_a_sse4_1(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
  // Holds 0x80000000 in its first element; XORed below to flip the sign of the lowest float lane
  static const __m128i neg = {0x000000000000000080000000};
  const unsigned int quad_count = num_points >> 2; // four complex samples per iteration
  const unsigned int leftover = num_points & 3;    // samples handled by the scalar tail
  float* in_ptr = (float*)input;
  float* tap_ptr = (float*)taps;
  __m128 sum_rr = _mm_setzero_ps(); // sum of re(input)*re(taps), kept in lane 0
  __m128 sum_ii = _mm_setzero_ps(); // sum of im(input)*im(taps), kept in lane 0
  __m128 sum_ri = _mm_setzero_ps(); // sum of re(input)*im(taps), kept in lane 1
  __m128 sum_ir = _mm_setzero_ps(); // sum of im(input)*re(taps), kept in lane 1
  unsigned int n;

  for(n = 0; n < quad_count; n++) {
    // Fetch four interleaved complex samples from each stream
    const __m128 in_a = _mm_load_ps(in_ptr);
    const __m128 tap_a = _mm_load_ps(tap_ptr);
    const __m128 in_b = _mm_load_ps(in_ptr + 4);
    const __m128 tap_b = _mm_load_ps(tap_ptr + 4);

    // First de-interleave stage
    const __m128 in_hi = _mm_unpackhi_ps(in_a, in_b);
    const __m128 tap_hi = _mm_unpackhi_ps(tap_a, tap_b);
    const __m128 in_lo = _mm_unpacklo_ps(in_a, in_b);
    const __m128 tap_lo = _mm_unpacklo_ps(tap_a, tap_b);

    // Second stage: separate real and imaginary vectors for both streams
    const __m128 in_im = _mm_unpackhi_ps(in_lo, in_hi);
    const __m128 in_re = _mm_unpacklo_ps(in_lo, in_hi);
    const __m128 tap_im = _mm_unpackhi_ps(tap_lo, tap_hi);
    const __m128 tap_re = _mm_unpacklo_ps(tap_lo, tap_hi);

    // Mask 0xf1 deposits the 4-element dot product in lane 0, mask 0xf2 in lane 1
    sum_rr = _mm_add_ps(_mm_dp_ps(in_re, tap_re, 0xf1), sum_rr);
    sum_ii = _mm_add_ps(_mm_dp_ps(in_im, tap_im, 0xf1), sum_ii);
    sum_ri = _mm_add_ps(_mm_dp_ps(in_re, tap_im, 0xf2), sum_ri);
    sum_ir = _mm_add_ps(_mm_dp_ps(in_im, tap_re, 0xf2), sum_ir);

    in_ptr += 8;
    tap_ptr += 8;
  }

  // real = rr - ii (via sign flip), imag = ri + ir; merge both into one register
  sum_ii = _mm_xor_ps(sum_ii, bit128_p(&neg)->float_vec);
  sum_ri = _mm_add_ps(sum_ri, sum_ir);
  sum_rr = _mm_add_ps(sum_rr, sum_ii);
  sum_ri = _mm_add_ps(sum_ri, sum_rr);

  // The low two floats are written out as the complex result
  _mm_storel_pi((__m64*)result, sum_ri);

  // Scalar clean-up for the samples that did not fill a group of four
  for(n = num_points - leftover; n < num_points; n++) {
    *result += input[n] * taps[n];
  }
 }
 #endif /*LV_HAVE_SSE4_1*/
 #endif /*INCLUDED_volk_gnsssdr_32fc_x2_dot_prod_32fc_a_H*/
--- a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_multiply_32fc.h
+++ b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_multiply_32fc.h
@@ -1,170 +0,0 @@
 #ifndef INCLUDED_volk_gnsssdr_32fc_x2_multiply_32fc_u_H
 #define INCLUDED_volk_gnsssdr_32fc_x2_multiply_32fc_u_H
 #include <inttypes.h>
 #include <stdio.h>
 #include <volk_gnsssdr/volk_gnsssdr_complex.h>
 #include <float.h>
 #ifdef LV_HAVE_SSE3
 #include <pmmintrin.h>
  /*!
    \brief Element-wise complex product of two vectors (SSE3, unaligned loads and stores)
    \param cVector Output vector; element i receives aVector[i] * bVector[i]
    \param aVector First factor vector
    \param bVector Second factor vector
    \param num_points Number of complex values to multiply and store
  */
 static inline void volk_gnsssdr_32fc_x2_multiply_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
    const unsigned int pair_count = num_points >> 1; // two complex values per __m128
    unsigned int k;

    for(k = 0; k < pair_count; k++){
      const unsigned int idx = 2 * k;
      const __m128 lhs = _mm_loadu_ps((float*)(aVector + idx)); // ar,ai,br,bi
      const __m128 rhs = _mm_loadu_ps((float*)(bVector + idx)); // cr,ci,dr,di
      const __m128 rhs_re = _mm_moveldup_ps(rhs);               // cr,cr,dr,dr
      const __m128 rhs_im = _mm_movehdup_ps(rhs);               // ci,ci,di,di
      const __m128 direct = _mm_mul_ps(lhs, rhs_re);            // ar*cr,ai*cr,br*dr,bi*dr
      const __m128 lhs_swapped = _mm_shuffle_ps(lhs, lhs, 0xB1); // ai,ar,bi,br
      const __m128 cross = _mm_mul_ps(lhs_swapped, rhs_im);     // ai*ci,ar*ci,bi*di,br*di

      // addsub produces ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
      _mm_storeu_ps((float*)(cVector + idx), _mm_addsub_ps(direct, cross));
    }

    // An odd count leaves a single trailing element for scalar code
    if(num_points & 1) {
      const unsigned int last = num_points - 1;
      cVector[last] = aVector[last] * bVector[last];
    }
 }
 #endif /* LV_HAVE_SSE3 */
 #ifdef LV_HAVE_GENERIC
  /*!
    \brief Element-wise complex product of two vectors (portable scalar version)
    \param cVector Output vector; element i receives aVector[i] * bVector[i]
    \param aVector First factor vector
    \param bVector Second factor vector
    \param num_points Number of complex values to multiply and store
  */
 static inline void volk_gnsssdr_32fc_x2_multiply_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
    unsigned int idx;
    // Each input pair is read before the matching output element is written
    for(idx = 0; idx < num_points; ++idx){
      cVector[idx] = aVector[idx] * bVector[idx];
    }
 }
 #endif /* LV_HAVE_GENERIC */
 #endif /* INCLUDED_volk_gnsssdr_32fc_x2_multiply_32fc_u_H */
 #ifndef INCLUDED_volk_gnsssdr_32fc_x2_multiply_32fc_a_H
 #define INCLUDED_volk_gnsssdr_32fc_x2_multiply_32fc_a_H
 #include <inttypes.h>
 #include <stdio.h>
 #include <volk_gnsssdr/volk_gnsssdr_complex.h>
 #include <float.h>
 #ifdef LV_HAVE_SSE3
 #include <pmmintrin.h>
  /*!
    \brief Element-wise complex product of two vectors (SSE3, aligned loads and stores)
    \param cVector Output vector; element i receives aVector[i] * bVector[i]. Written with _mm_store_ps, so it must be 16-byte aligned
    \param aVector First factor vector; read with _mm_load_ps, so it must be 16-byte aligned
    \param bVector Second factor vector; read with _mm_load_ps, so it must be 16-byte aligned
    \param num_points Number of complex values to multiply and store
  */
 static inline void volk_gnsssdr_32fc_x2_multiply_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
    const unsigned int pair_count = num_points >> 1; // two complex values per __m128
    unsigned int k;

    for(k = 0; k < pair_count; k++){
      const unsigned int idx = 2 * k;
      const __m128 lhs = _mm_load_ps((float*)(aVector + idx));  // ar,ai,br,bi
      const __m128 rhs = _mm_load_ps((float*)(bVector + idx));  // cr,ci,dr,di
      const __m128 rhs_re = _mm_moveldup_ps(rhs);               // cr,cr,dr,dr
      const __m128 rhs_im = _mm_movehdup_ps(rhs);               // ci,ci,di,di
      const __m128 direct = _mm_mul_ps(lhs, rhs_re);            // ar*cr,ai*cr,br*dr,bi*dr
      const __m128 lhs_swapped = _mm_shuffle_ps(lhs, lhs, 0xB1); // ai,ar,bi,br
      const __m128 cross = _mm_mul_ps(lhs_swapped, rhs_im);     // ai*ci,ar*ci,bi*di,br*di

      // addsub produces ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
      _mm_store_ps((float*)(cVector + idx), _mm_addsub_ps(direct, cross));
    }

    // An odd count leaves a single trailing element for scalar code
    if(num_points & 1) {
      const unsigned int last = num_points - 1;
      cVector[last] = aVector[last] * bVector[last];
    }
 }
 #endif /* LV_HAVE_SSE3 */
 #ifdef LV_HAVE_GENERIC
  /*!
    \brief Element-wise complex product of two vectors (portable scalar version)
    \param cVector Output vector; element i receives aVector[i] * bVector[i]
    \param aVector First factor vector
    \param bVector Second factor vector
    \param num_points Number of complex values to multiply and store
  */
 static inline void volk_gnsssdr_32fc_x2_multiply_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
    unsigned int idx;
    // Each input pair is read before the matching output element is written
    for(idx = 0; idx < num_points; ++idx){
      cVector[idx] = aVector[idx] * bVector[idx];
    }
 }
 #endif /* LV_HAVE_GENERIC */
 #ifdef LV_HAVE_ORC
  /*!
    \brief Multiplies the two input complex vectors and stores their results in the third vector
    \param cVector The vector where the results will be stored
    \param aVector One of the vectors to be multiplied
    \param bVector One of the vectors to be multiplied
    \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector

    Thin wrapper: all arguments are passed unchanged to the externally defined
    volk_gnsssdr_32fc_x2_multiply_32fc_a_orc_impl.
    NOTE(review): the wrapper is named "_u_orc" while the implementation it calls is
    "_a_orc_impl" -- confirm the naming is intentional.
  */
 extern void volk_gnsssdr_32fc_x2_multiply_32fc_a_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points);
 static inline void volk_gnsssdr_32fc_x2_multiply_32fc_u_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
    volk_gnsssdr_32fc_x2_multiply_32fc_a_orc_impl(cVector, aVector, bVector, num_points);
 }
 #endif /* LV_HAVE_ORC */
 #endif /* INCLUDED_volk_gnsssdr_32fc_x2_multiply_32fc_a_H */
--- a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3.h
+++ b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3.h
@@ -1,3 +1,49 @@
 /*!
 * \file volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3.h
 * \brief Volk protokernel: performs the carrier wipe-off mixing and the Early, Prompt and Late correlation with 64 bits vectors
 * \authors <ul>
 *          <li>Javier Arribas, 2011. jarribas(at)cttc.es
 *          <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
 *          </ul>
 *
 * Volk protokernel that performs the carrier wipe-off mixing and the
 * Early, Prompt and Late correlation with 64 bits vectors (32 bits the
 * real part and 32 bits the imaginary part):
 * - The carrier wipe-off is done by multiplying the input signal by the
 * carrier (multiplication of 64 bits vectors) It returns the input
 * signal in base band (BB)
 * - Early values are calculated by multiplying the input signal in BB by the
 * early code (multiplication of 64 bits vectors), accumulating the results
 * - Prompt values are calculated by multiplying the input signal in BB by the
 * prompt code (multiplication of 64 bits vectors), accumulating the results
 * - Late values are calculated by multiplying the input signal in BB by the
 * late code (multiplication of 64 bits vectors), accumulating the results
 *
 * -------------------------------------------------------------------------
 *
 * Copyright (C) 2010-2014  (see AUTHORS file for a list of contributors)
 *
 * GNSS-SDR is a software defined Global Navigation
 *          Satellite Systems receiver
 *
 * This file is part of GNSS-SDR.
 *
 * GNSS-SDR is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * GNSS-SDR is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
 *
 * -------------------------------------------------------------------------
 */
 #ifndef INCLUDED_gnsssdr_volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_u_H
 #define INCLUDED_gnsssdr_volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_u_H
--- a/src/algorithms/libs/volk_gnsssdr/lib/testqa.cc
+++ b/src/algorithms/libs/volk_gnsssdr/lib/testqa.cc
@@ -24,17 +24,6 @@
 #include <volk_gnsssdr/volk_gnsssdr.h>
 #include <boost/test/unit_test.hpp>
 //VOLK PROTOKERNELS OBTAINED FROM THE GNURADIO BASE
 VOLK_RUN_TESTS(volk_gnsssdr_32fc_x2_multiply_32fc, 1e-4, 0, 20462, 1);
 VOLK_RUN_TESTS(volk_gnsssdr_32fc_x2_dot_prod_32fc, 1e-4, 0, 204603, 1);
 VOLK_RUN_TESTS(volk_gnsssdr_32fc_s32fc_multiply_32fc, 1e-4, 0, 20462, 1);
 VOLK_RUN_TESTS(volk_gnsssdr_32fc_conjugate_32fc, 1e-4, 0, 20462, 1);
 VOLK_RUN_TESTS(volk_gnsssdr_32f_x2_add_32f, 1e-4, 0, 20462, 1);
 VOLK_RUN_TESTS(volk_gnsssdr_32f_index_max_16u, 3, 0, 20462, 1);
 VOLK_RUN_TESTS(volk_gnsssdr_32f_accumulator_s32f, 1e-4, 0, 20462, 1);
 VOLK_RUN_TESTS(volk_gnsssdr_32fc_magnitude_squared_32f, 1e-4, 0, 20462, 1);
 VOLK_RUN_TESTS(volk_gnsssdr_32f_s32f_convert_16i, 3, 0, 20462, 1);
 //GNSS-SDR PROTO-KERNELS
 VOLK_RUN_TESTS(volk_gnsssdr_8ic_x2_multiply_8ic, 1e-4, 0, 20462, 1);
 VOLK_RUN_TESTS(volk_gnsssdr_8u_x2_multiply_8u, 1e-4, 0, 20462, 1);
@@ -52,7 +41,6 @@ VOLK_RUN_TESTS(volk_gnsssdr_64f_accumulator_64f, 3, 0, 20462, 1);
 VOLK_RUN_TESTS(volk_gnsssdr_32fc_convert_16ic, 3, 0, 20462, 1);
 VOLK_RUN_TESTS(volk_gnsssdr_32fc_s32f_convert_8ic, 3, 0, 20462, 1);
 VOLK_RUN_TESTS(volk_gnsssdr_32fc_convert_8ic, 3, 0, 20462, 1);
 VOLK_RUN_TESTS(volk_gnsssdr_16i_s32f_convert_32f, 3, 0, 20462, 1);
 VOLK_RUN_TESTS(volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3, 1e-4, 0, 20462, 1);
 VOLK_RUN_TESTS(volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3, 1e-4, 0, 20462, 1);
--- a/src/algorithms/tracking/gnuradio_blocks/galileo_volk_e1_dll_pll_veml_tracking_cc.cc
+++ b/src/algorithms/tracking/gnuradio_blocks/galileo_volk_e1_dll_pll_veml_tracking_cc.cc
@@ -244,7 +244,6 @@ void galileo_volk_e1_dll_pll_veml_tracking_cc::update_local_code()
 {
    double tcode_half_chips;
    float rem_code_phase_half_chips;
    int associated_chip_index;
    int code_length_half_chips = static_cast<int>(Galileo_E1_B_CODE_LENGTH_CHIPS) * 2;
    double code_phase_step_chips;
    double code_phase_step_half_chips;