From db738c7ea9215deebb61311b3c4f5892953375d4 Mon Sep 17 00:00:00 2001 From: andres Date: Mon, 27 Oct 2014 05:19:26 +0100 Subject: [PATCH] Added cmake version check. Deleted original volk Added cmake version checks. Deleted original volk protokernels. Fixes. --- CMakeLists.txt | 8 + .../volk_gnsssdr/apps/volk_gnsssdr_profile.cc | 9 - .../volk_gnsssdr_16i_s32f_convert_32f.h | 241 ------ .../volk_gnsssdr_32f_accumulator_s32f.h | 68 -- .../volk_gnsssdr_32f_index_max_16u.h | 149 ---- .../volk_gnsssdr_32f_s32f_convert_16i.h | 302 ------- .../volk_gnsssdr_32f_x2_add_32f.h | 147 ---- .../volk_gnsssdr_32fc_conjugate_32fc.h | 127 --- .../volk_gnsssdr_32fc_magnitude_squared_32f.h | 228 ------ ...ssdr_32fc_s32f_x4_update_local_code_32fc.h | 4 +- .../volk_gnsssdr_32fc_s32fc_multiply_32fc.h | 178 ---- .../volk_gnsssdr_32fc_x2_dot_prod_32fc.h | 759 ------------------ .../volk_gnsssdr_32fc_x2_multiply_32fc.h | 170 ---- ...volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3.h | 46 ++ .../libs/volk_gnsssdr/lib/testqa.cc | 12 - ...alileo_volk_e1_dll_pll_veml_tracking_cc.cc | 1 - 16 files changed, 56 insertions(+), 2393 deletions(-) delete mode 100644 src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16i_s32f_convert_32f.h delete mode 100644 src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_accumulator_s32f.h delete mode 100644 src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_index_max_16u.h delete mode 100644 src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_s32f_convert_16i.h delete mode 100644 src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_x2_add_32f.h delete mode 100644 src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_conjugate_32fc.h delete mode 100644 src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_magnitude_squared_32f.h delete mode 100644 src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32fc_multiply_32fc.h delete 
mode 100644 src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_dot_prod_32fc.h delete mode 100644 src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_multiply_32fc.h diff --git a/CMakeLists.txt b/CMakeLists.txt index df5885282..a23f9a02c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -205,6 +205,14 @@ set(CMAKE_BUILD_TYPE ${CMAKE_BUILD_TYPE} CACHE STRING "") # Append -O2 optimization flag for Debug builds set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O2") +################################################################################ +# Checkout cmake version +################################################################################ +if(CMAKE_VERSION VERSION_LESS 2.8.8) + message(STATUS "Your CMake version is too old and does not support some features required by GNSS-SDR. CMake version must be at least 2.8.8. For more information check https://github.com/joakimkarlsson/bandit/issues/40") + message(FATAL_ERROR "Fatal error: CMake >= 2.8.8 required.") +endif(CMAKE_VERSION VERSION_LESS 2.8.8) + ################################################################################ # Checkout compiler version ################################################################################ diff --git a/src/algorithms/libs/volk_gnsssdr/apps/volk_gnsssdr_profile.cc b/src/algorithms/libs/volk_gnsssdr/apps/volk_gnsssdr_profile.cc index 7980b25ad..2e3ab4b23 100644 --- a/src/algorithms/libs/volk_gnsssdr/apps/volk_gnsssdr_profile.cc +++ b/src/algorithms/libs/volk_gnsssdr/apps/volk_gnsssdr_profile.cc @@ -179,27 +179,18 @@ int main(int argc, char *argv[]) { VOLK_PROFILE(volk_gnsssdr_32fc_convert_8ic, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_gnsssdr_32fc_s32f_convert_8ic, 1e-4, 5, 16000, 250, &results, benchmark_mode, kernel_regex); - VOLK_PROFILE(volk_gnsssdr_32f_accumulator_s32f, 1e-4, 0, 204602, 10000, &results, benchmark_mode, kernel_regex); 
VOLK_PROFILE(volk_gnsssdr_8i_accumulator_s8i, 1e-4, 0, 204602, 10000, &results, benchmark_mode, kernel_regex); - VOLK_PROFILE(volk_gnsssdr_32f_index_max_16u, 3, 0, 204602, 5000, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_gnsssdr_8i_index_max_16u, 3, 0, 204602, 5000, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_gnsssdr_8i_max_s8i, 3, 0, 204602, 5000, &results, benchmark_mode, kernel_regex); - VOLK_PROFILE(volk_gnsssdr_32f_x2_add_32f, 1e-4, 0, 204602, 10000, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_gnsssdr_8i_x2_add_8i, 1e-4, 0, 204602, 10000, &results, benchmark_mode, kernel_regex); - VOLK_PROFILE(volk_gnsssdr_32fc_conjugate_32fc, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_gnsssdr_8ic_conjugate_8ic, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); - VOLK_PROFILE(volk_gnsssdr_32fc_magnitude_squared_32f, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_gnsssdr_8ic_magnitude_squared_8i, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); - VOLK_PROFILE(volk_gnsssdr_32fc_s32fc_multiply_32fc, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_gnsssdr_8ic_s8ic_multiply_8ic, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); - VOLK_PROFILE(volk_gnsssdr_32fc_x2_dot_prod_32fc, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_gnsssdr_8ic_x2_dot_prod_8ic, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); - VOLK_PROFILE(volk_gnsssdr_32fc_x2_multiply_32fc, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_gnsssdr_8ic_x2_multiply_8ic, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_gnsssdr_8u_x2_multiply_8u, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); VOLK_PROFILE(volk_gnsssdr_64f_accumulator_64f, 1e-4, 0, 16000, 1000, &results, benchmark_mode, kernel_regex); 
VOLK_PROFILE(volk_gnsssdr_32f_s32f_convert_16i, 1e-4, 1, 204602, 250, &results, benchmark_mode, kernel_regex); - VOLK_PROFILE(volk_gnsssdr_16i_s32f_convert_32f, 1e-4, 1, 204602, 250, &results, benchmark_mode, kernel_regex); // Until we can update the config on a kernel by kernel basis // do not overwrite volk_config when using a regex. diff --git a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16i_s32f_convert_32f.h b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16i_s32f_convert_32f.h deleted file mode 100644 index ccb13171c..000000000 --- a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16i_s32f_convert_32f.h +++ /dev/null @@ -1,241 +0,0 @@ -#ifndef INCLUDED_volk_gnsssdr_16i_s32f_convert_32f_u_H -#define INCLUDED_volk_gnsssdr_16i_s32f_convert_32f_u_H - -#include -#include - -#ifdef LV_HAVE_SSE4_1 -#include - - /*! - \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 16 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - \note Output buffer does NOT need to be properly aligned - */ -static inline void volk_gnsssdr_16i_s32f_convert_32f_u_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; - - float* outputVectorPtr = outputVector; - __m128 invScalar = _mm_set_ps1(1.0/scalar); - int16_t* inputPtr = (int16_t*)inputVector; - __m128i inputVal; - __m128i inputVal2; - __m128 ret; - - for(;number < eighthPoints; number++){ - - // Load the 8 values - inputVal = _mm_loadu_si128((__m128i*)inputPtr); - - // Shift the input data to the right by 64 bits ( 8 bytes ) - inputVal2 = _mm_srli_si128(inputVal, 8); 
- - // Convert the lower 4 values into 32 bit words - inputVal = _mm_cvtepi16_epi32(inputVal); - inputVal2 = _mm_cvtepi16_epi32(inputVal2); - - ret = _mm_cvtepi32_ps(inputVal); - ret = _mm_mul_ps(ret, invScalar); - _mm_storeu_ps(outputVectorPtr, ret); - outputVectorPtr += 4; - - ret = _mm_cvtepi32_ps(inputVal2); - ret = _mm_mul_ps(ret, invScalar); - _mm_storeu_ps(outputVectorPtr, ret); - - outputVectorPtr += 4; - - inputPtr += 8; - } - - number = eighthPoints * 8; - for(; number < num_points; number++){ - outputVector[number] =((float)(inputVector[number])) / scalar; - } -} -#endif /* LV_HAVE_SSE4_1 */ - -#ifdef LV_HAVE_SSE -#include - - /*! - \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 16 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - \note Output buffer does NOT need to be properly aligned - */ -static inline void volk_gnsssdr_16i_s32f_convert_32f_u_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* outputVectorPtr = outputVector; - __m128 invScalar = _mm_set_ps1(1.0/scalar); - int16_t* inputPtr = (int16_t*)inputVector; - __m128 ret; - - for(;number < quarterPoints; number++){ - ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0])); - - ret = _mm_mul_ps(ret, invScalar); - _mm_storeu_ps(outputVectorPtr, ret); - - inputPtr += 4; - outputVectorPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] = (float)(inputVector[number]) / scalar; - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC - /*! 
- \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 16 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - \note Output buffer does NOT need to be properly aligned - */ -static inline void volk_gnsssdr_16i_s32f_convert_32f_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ - float* outputVectorPtr = outputVector; - const int16_t* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_gnsssdr_16i_s32f_convert_32f_u_H */ -#ifndef INCLUDED_volk_gnsssdr_16i_s32f_convert_32f_a_H -#define INCLUDED_volk_gnsssdr_16i_s32f_convert_32f_a_H - -#include -#include - -#ifdef LV_HAVE_SSE4_1 -#include - - /*! 
- \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 16 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - */ -static inline void volk_gnsssdr_16i_s32f_convert_32f_a_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int eighthPoints = num_points / 8; - - float* outputVectorPtr = outputVector; - __m128 invScalar = _mm_set_ps1(1.0/scalar); - int16_t* inputPtr = (int16_t*)inputVector; - __m128i inputVal; - __m128i inputVal2; - __m128 ret; - - for(;number < eighthPoints; number++){ - - // Load the 8 values - inputVal = _mm_loadu_si128((__m128i*)inputPtr); - - // Shift the input data to the right by 64 bits ( 8 bytes ) - inputVal2 = _mm_srli_si128(inputVal, 8); - - // Convert the lower 4 values into 32 bit words - inputVal = _mm_cvtepi16_epi32(inputVal); - inputVal2 = _mm_cvtepi16_epi32(inputVal2); - - ret = _mm_cvtepi32_ps(inputVal); - ret = _mm_mul_ps(ret, invScalar); - _mm_storeu_ps(outputVectorPtr, ret); - outputVectorPtr += 4; - - ret = _mm_cvtepi32_ps(inputVal2); - ret = _mm_mul_ps(ret, invScalar); - _mm_storeu_ps(outputVectorPtr, ret); - - outputVectorPtr += 4; - - inputPtr += 8; - } - - number = eighthPoints * 8; - for(; number < num_points; number++){ - outputVector[number] =((float)(inputVector[number])) / scalar; - } -} -#endif /* LV_HAVE_SSE4_1 */ - -#ifdef LV_HAVE_SSE -#include - - /*! 
- \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 16 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - */ -static inline void volk_gnsssdr_16i_s32f_convert_32f_a_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* outputVectorPtr = outputVector; - __m128 invScalar = _mm_set_ps1(1.0/scalar); - int16_t* inputPtr = (int16_t*)inputVector; - __m128 ret; - - for(;number < quarterPoints; number++){ - ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0])); - - ret = _mm_mul_ps(ret, invScalar); - _mm_storeu_ps(outputVectorPtr, ret); - - inputPtr += 4; - outputVectorPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - outputVector[number] = (float)(inputVector[number]) / scalar; - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC - /*! 
- \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value - \param inputVector The 16 bit input data buffer - \param outputVector The floating point output data buffer - \param scalar The value divided against each point in the output buffer - \param num_points The number of data values to be converted - */ -static inline void volk_gnsssdr_16i_s32f_convert_32f_a_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ - float* outputVectorPtr = outputVector; - const int16_t* inputVectorPtr = inputVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar; - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_gnsssdr_16i_s32f_convert_32f_a_H */ diff --git a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_accumulator_s32f.h b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_accumulator_s32f.h deleted file mode 100644 index 82f1b3efd..000000000 --- a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_accumulator_s32f.h +++ /dev/null @@ -1,68 +0,0 @@ -#ifndef INCLUDED_volk_gnsssdr_32f_accumulator_s32f_a_H -#define INCLUDED_volk_gnsssdr_32f_accumulator_s32f_a_H - -#include -#include -#include - -#ifdef LV_HAVE_SSE -#include -/*! 
- \brief Accumulates the values in the input buffer - \param result The accumulated result - \param inputBuffer The buffer of data to be accumulated - \param num_points The number of values in inputBuffer to be accumulated -*/ -static inline void volk_gnsssdr_32f_accumulator_s32f_a_sse(float* result, const float* inputBuffer, unsigned int num_points){ - float returnValue = 0; - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const float* aPtr = inputBuffer; - __VOLK_ATTR_ALIGNED(16) float tempBuffer[4]; - - __m128 accumulator = _mm_setzero_ps(); - __m128 aVal = _mm_setzero_ps(); - - for(;number < quarterPoints; number++){ - aVal = _mm_load_ps(aPtr); - accumulator = _mm_add_ps(accumulator, aVal); - aPtr += 4; - } - _mm_store_ps(tempBuffer,accumulator); // Store the results back into the C container - returnValue = tempBuffer[0]; - returnValue += tempBuffer[1]; - returnValue += tempBuffer[2]; - returnValue += tempBuffer[3]; - - number = quarterPoints * 4; - for(;number < num_points; number++){ - returnValue += (*aPtr++); - } - *result = returnValue; -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC -/*! 
- \brief Accumulates the values in the input buffer - \param result The accumulated result - \param inputBuffer The buffer of data to be accumulated - \param num_points The number of values in inputBuffer to be accumulated -*/ -static inline void volk_gnsssdr_32f_accumulator_s32f_generic(float* result, const float* inputBuffer, unsigned int num_points){ - const float* aPtr = inputBuffer; - unsigned int number = 0; - float returnValue = 0; - - for(;number < num_points; number++){ - returnValue += (*aPtr++); - } - *result = returnValue; -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_gnsssdr_32f_accumulator_s32f_a_H */ diff --git a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_index_max_16u.h b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_index_max_16u.h deleted file mode 100644 index eaaf6d1de..000000000 --- a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_index_max_16u.h +++ /dev/null @@ -1,149 +0,0 @@ -#ifndef INCLUDED_volk_gnsssdr_32f_index_max_16u_a_H -#define INCLUDED_volk_gnsssdr_32f_index_max_16u_a_H - -#include -#include -#include -#include - -#ifdef LV_HAVE_SSE4_1 -#include - -static inline void volk_gnsssdr_32f_index_max_16u_a_sse4_1(unsigned int* target, const float* src0, unsigned int num_points) { - if(num_points > 0){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* inputPtr = (float*)src0; - - __m128 indexIncrementValues = _mm_set1_ps(4); - __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4); - - float max = src0[0]; - float index = 0; - __m128 maxValues = _mm_set1_ps(max); - __m128 maxValuesIndex = _mm_setzero_ps(); - __m128 compareResults; - __m128 currentValues; - - __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; - __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; - - for(;number < quarterPoints; number++){ - - currentValues = _mm_load_ps(inputPtr); inputPtr += 4; - currentIndexes = _mm_add_ps(currentIndexes, 
indexIncrementValues); - - compareResults = _mm_cmpgt_ps(maxValues, currentValues); - - maxValuesIndex = _mm_blendv_ps(currentIndexes, maxValuesIndex, compareResults); - maxValues = _mm_blendv_ps(currentValues, maxValues, compareResults); - } - - // Calculate the largest value from the remaining 4 points - _mm_store_ps(maxValuesBuffer, maxValues); - _mm_store_ps(maxIndexesBuffer, maxValuesIndex); - - for(number = 0; number < 4; number++){ - if(maxValuesBuffer[number] > max){ - index = maxIndexesBuffer[number]; - max = maxValuesBuffer[number]; - } - } - - number = quarterPoints * 4; - for(;number < num_points; number++){ - if(src0[number] > max){ - index = number; - max = src0[number]; - } - } - target[0] = (unsigned int)index; - } -} - -#endif /*LV_HAVE_SSE4_1*/ - -#ifdef LV_HAVE_SSE -#include - -static inline void volk_gnsssdr_32f_index_max_16u_a_sse(unsigned int* target, const float* src0, unsigned int num_points) { - if(num_points > 0){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* inputPtr = (float*)src0; - - __m128 indexIncrementValues = _mm_set1_ps(4); - __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4); - - float max = src0[0]; - float index = 0; - __m128 maxValues = _mm_set1_ps(max); - __m128 maxValuesIndex = _mm_setzero_ps(); - __m128 compareResults; - __m128 currentValues; - - __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; - __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; - - for(;number < quarterPoints; number++){ - - currentValues = _mm_load_ps(inputPtr); inputPtr += 4; - currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); - - compareResults = _mm_cmpgt_ps(maxValues, currentValues); - - maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, maxValuesIndex) , _mm_andnot_ps(compareResults, currentIndexes)); - - maxValues = _mm_or_ps(_mm_and_ps(compareResults, maxValues) , _mm_andnot_ps(compareResults, currentValues)); - } - - // Calculate the largest value from the remaining 4 points - 
_mm_store_ps(maxValuesBuffer, maxValues); - _mm_store_ps(maxIndexesBuffer, maxValuesIndex); - - for(number = 0; number < 4; number++){ - if(maxValuesBuffer[number] > max){ - index = maxIndexesBuffer[number]; - max = maxValuesBuffer[number]; - } - } - - number = quarterPoints * 4; - for(;number < num_points; number++){ - if(src0[number] > max){ - index = number; - max = src0[number]; - } - } - target[0] = (unsigned int)index; - } -} - -#endif /*LV_HAVE_SSE*/ - -#ifdef LV_HAVE_GENERIC -static inline void volk_gnsssdr_32f_index_max_16u_generic(unsigned int* target, const float* src0, unsigned int num_points) { - if(num_points > 0){ - float max = src0[0]; - unsigned int index = 0; - - unsigned int i = 1; - - for(; i < num_points; ++i) { - - if(src0[i] > max){ - index = i; - max = src0[i]; - } - - } - target[0] = index; - } -} - -#endif /*LV_HAVE_GENERIC*/ - - -#endif /*INCLUDED_volk_gnsssdr_32f_index_max_16u_a_H*/ diff --git a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_s32f_convert_16i.h b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_s32f_convert_16i.h deleted file mode 100644 index cd83cef9d..000000000 --- a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_s32f_convert_16i.h +++ /dev/null @@ -1,302 +0,0 @@ -#ifndef INCLUDED_volk_gnsssdr_32f_s32f_convert_16i_u_H -#define INCLUDED_volk_gnsssdr_32f_s32f_convert_16i_u_H - -#include -#include -#include - -#ifdef LV_HAVE_SSE2 -#include - /*! 
- \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 16 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - \note Input buffer does NOT need to be properly aligned - */ -static inline void volk_gnsssdr_32f_s32f_convert_16i_u_sse2(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int eighthPoints = num_points / 8; - - const float* inputVectorPtr = (const float*)inputVector; - int16_t* outputVectorPtr = outputVector; - - float min_val = -32768; - float max_val = 32767; - float r; - - __m128 vScalar = _mm_set_ps1(scalar); - __m128 inputVal1, inputVal2; - __m128i intInputVal1, intInputVal2; - __m128 ret1, ret2; - __m128 vmin_val = _mm_set_ps1(min_val); - __m128 vmax_val = _mm_set_ps1(max_val); - - for(;number < eighthPoints; number++){ - inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; - inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; - - // Scale and clip - ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); - ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val); - - intInputVal1 = _mm_cvtps_epi32(ret1); - intInputVal2 = _mm_cvtps_epi32(ret2); - - intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); - - _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); - outputVectorPtr += 8; - } - - number = eighthPoints * 8; - for(; number < num_points; number++){ - r = inputVector[number] * scalar; - if(r > max_val) - r = max_val; - else if(r < min_val) - r = min_val; - outputVector[number] = (int16_t)rintf(r); - } -} -#endif /* LV_HAVE_SSE2 */ - -#ifdef LV_HAVE_SSE -#include - /*! 
- \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 16 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - \note Input buffer does NOT need to be properly aligned - */ -static inline void volk_gnsssdr_32f_s32f_convert_16i_u_sse(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int quarterPoints = num_points / 4; - - const float* inputVectorPtr = (const float*)inputVector; - int16_t* outputVectorPtr = outputVector; - - float min_val = -32768; - float max_val = 32767; - float r; - - __m128 vScalar = _mm_set_ps1(scalar); - __m128 ret; - __m128 vmin_val = _mm_set_ps1(min_val); - __m128 vmax_val = _mm_set_ps1(max_val); - - __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; - - for(;number < quarterPoints; number++){ - ret = _mm_loadu_ps(inputVectorPtr); - inputVectorPtr += 4; - - // Scale and clip - ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); - - _mm_store_ps(outputFloatBuffer, ret); - *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]); - *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]); - *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]); - *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]); - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - r = inputVector[number] * scalar; - if(r > max_val) - r = max_val; - else if(r < min_val) - r = min_val; - outputVector[number] = (int16_t)rintf(r); - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC - /*! 
- \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 16 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - \note Input buffer does NOT need to be properly aligned - */ -static inline void volk_gnsssdr_32f_s32f_convert_16i_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - int16_t* outputVectorPtr = outputVector; - const float* inputVectorPtr = inputVector; - unsigned int number = 0; - float min_val = -32768; - float max_val = 32767; - float r; - - for(number = 0; number < num_points; number++){ - r = *inputVectorPtr++ * scalar; - if(r > max_val) - r = max_val; - else if(r < min_val) - r = min_val; - *outputVectorPtr++ = (int16_t)rintf(r); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_gnsssdr_32f_s32f_convert_16i_u_H */ -#ifndef INCLUDED_volk_gnsssdr_32f_s32f_convert_16i_a_H -#define INCLUDED_volk_gnsssdr_32f_s32f_convert_16i_a_H - -#include -#include -#include -#include - -#ifdef LV_HAVE_SSE2 -#include - /*! 
- \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 16 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - */ -static inline void volk_gnsssdr_32f_s32f_convert_16i_a_sse2(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int eighthPoints = num_points / 8; - - const float* inputVectorPtr = (const float*)inputVector; - int16_t* outputVectorPtr = outputVector; - - float min_val = -32768; - float max_val = 32767; - float r; - - __m128 vScalar = _mm_set_ps1(scalar); - __m128 inputVal1, inputVal2; - __m128i intInputVal1, intInputVal2; - __m128 ret1, ret2; - __m128 vmin_val = _mm_set_ps1(min_val); - __m128 vmax_val = _mm_set_ps1(max_val); - - for(;number < eighthPoints; number++){ - inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; - - // Scale and clip - ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); - ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val); - - intInputVal1 = _mm_cvtps_epi32(ret1); - intInputVal2 = _mm_cvtps_epi32(ret2); - - intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); - - _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); - outputVectorPtr += 8; - } - - number = eighthPoints * 8; - for(; number < num_points; number++){ - r = inputVector[number] * scalar; - if(r > max_val) - r = max_val; - else if(r < min_val) - r = min_val; - outputVector[number] = (int16_t)rintf(r); - } -} -#endif /* LV_HAVE_SSE2 */ - -#ifdef LV_HAVE_SSE -#include - /*! 
- \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 16 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - */ -static inline void volk_gnsssdr_32f_s32f_convert_16i_a_sse(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - unsigned int number = 0; - - const unsigned int quarterPoints = num_points / 4; - - const float* inputVectorPtr = (const float*)inputVector; - int16_t* outputVectorPtr = outputVector; - - float min_val = -32768; - float max_val = 32767; - float r; - - __m128 vScalar = _mm_set_ps1(scalar); - __m128 ret; - __m128 vmin_val = _mm_set_ps1(min_val); - __m128 vmax_val = _mm_set_ps1(max_val); - - __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; - - for(;number < quarterPoints; number++){ - ret = _mm_load_ps(inputVectorPtr); - inputVectorPtr += 4; - - // Scale and clip - ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); - - _mm_store_ps(outputFloatBuffer, ret); - *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]); - *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]); - *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]); - *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]); - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - r = inputVector[number] * scalar; - if(r > max_val) - r = max_val; - else if(r < min_val) - r = min_val; - outputVector[number] = (int16_t)rintf(r); - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC - /*! 
- \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value - \param inputVector The floating point input data buffer - \param outputVector The 16 bit output data buffer - \param scalar The value multiplied against each point in the input buffer - \param num_points The number of data values to be converted - */ -static inline void volk_gnsssdr_32f_s32f_convert_16i_a_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ - int16_t* outputVectorPtr = outputVector; - const float* inputVectorPtr = inputVector; - unsigned int number = 0; - float min_val = -32768; - float max_val = 32767; - float r; - - for(number = 0; number < num_points; number++){ - r = *inputVectorPtr++ * scalar; - if(r < min_val) - r = min_val; - else if(r > max_val) - r = max_val; - *outputVectorPtr++ = (int16_t)rintf(r); - } -} -#endif /* LV_HAVE_GENERIC */ - - - - -#endif /* INCLUDED_volk_gnsssdr_32f_s32f_convert_16i_a_H */ diff --git a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_x2_add_32f.h b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_x2_add_32f.h deleted file mode 100644 index ee647b2d7..000000000 --- a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_x2_add_32f.h +++ /dev/null @@ -1,147 +0,0 @@ -#ifndef INCLUDED_volk_gnsssdr_32f_x2_add_32f_u_H -#define INCLUDED_volk_gnsssdr_32f_x2_add_32f_u_H - -#include -#include - -#ifdef LV_HAVE_SSE -#include -/*! 
- \brief Adds the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be added - \param bVector One of the vectors to be added - \param num_points The number of values in aVector and bVector to be added together and stored into cVector -*/ -static inline void volk_gnsssdr_32f_x2_add_32f_u_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - - __m128 aVal, bVal, cVal; - for(;number < quarterPoints; number++){ - - aVal = _mm_loadu_ps(aPtr); - bVal = _mm_loadu_ps(bPtr); - - cVal = _mm_add_ps(aVal, bVal); - - _mm_storeu_ps(cPtr,cVal); // Store the results back into the C container - - aPtr += 4; - bPtr += 4; - cPtr += 4; - } - - number = quarterPoints * 4; - for(;number < num_points; number++){ - *cPtr++ = (*aPtr++) + (*bPtr++); - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC -/*! 
- \brief Adds the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be added - \param bVector One of the vectors to be added - \param num_points The number of values in aVector and bVector to be added together and stored into cVector -*/ -static inline void volk_gnsssdr_32f_x2_add_32f_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = (*aPtr++) + (*bPtr++); - } -} -#endif /* LV_HAVE_GENERIC */ - -#endif /* INCLUDED_volk_gnsssdr_32f_x2_add_32f_u_H */ -#ifndef INCLUDED_volk_gnsssdr_32f_x2_add_32f_a_H -#define INCLUDED_volk_gnsssdr_32f_x2_add_32f_a_H - -#include -#include - -#ifdef LV_HAVE_SSE -#include -/*! - \brief Adds the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be added - \param bVector One of the vectors to be added - \param num_points The number of values in aVector and bVector to be added together and stored into cVector -*/ -static inline void volk_gnsssdr_32f_x2_add_32f_a_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - - __m128 aVal, bVal, cVal; - for(;number < quarterPoints; number++){ - - aVal = _mm_load_ps(aPtr); - bVal = _mm_load_ps(bPtr); - - cVal = _mm_add_ps(aVal, bVal); - - _mm_store_ps(cPtr,cVal); // Store the results back into the C container - - aPtr += 4; - bPtr += 4; - cPtr += 4; - } - - number = quarterPoints * 4; - for(;number < num_points; number++){ - *cPtr++ = (*aPtr++) + (*bPtr++); - } 
-} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC -/*! - \brief Adds the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be added - \param bVector One of the vectors to be added - \param num_points The number of values in aVector and bVector to be added together and stored into cVector -*/ -static inline void volk_gnsssdr_32f_x2_add_32f_a_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - float* cPtr = cVector; - const float* aPtr = aVector; - const float* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = (*aPtr++) + (*bPtr++); - } -} -#endif /* LV_HAVE_GENERIC */ - -#ifdef LV_HAVE_ORC -/*! - \brief Adds the two input vectors and store their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be added - \param bVector One of the vectors to be added - \param num_points The number of values in aVector and bVector to be added together and stored into cVector -*/ -extern void volk_gnsssdr_32f_x2_add_32f_a_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); -static inline void volk_gnsssdr_32f_x2_add_32f_u_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ - volk_gnsssdr_32f_x2_add_32f_a_orc_impl(cVector, aVector, bVector, num_points); -} -#endif /* LV_HAVE_ORC */ - - -#endif /* INCLUDED_volk_gnsssdr_32f_x2_add_32f_a_H */ diff --git a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_conjugate_32fc.h b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_conjugate_32fc.h deleted file mode 100644 index a3b8848aa..000000000 --- a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_conjugate_32fc.h +++ /dev/null @@ -1,127 +0,0 @@ -#ifndef 
INCLUDED_volk_gnsssdr_32fc_conjugate_32fc_u_H -#define INCLUDED_volk_gnsssdr_32fc_conjugate_32fc_u_H - -#include -#include -#include -#include - -#ifdef LV_HAVE_SSE3 -#include - /*! - \brief Takes the conjugate of a complex vector. - \param cVector The vector where the results will be stored - \param aVector Vector to be conjugated - \param num_points The number of complex values in aVector to be conjugated and stored into cVector - */ -static inline void volk_gnsssdr_32fc_conjugate_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int halfPoints = num_points / 2; - - __m128 x; - lv_32fc_t* c = cVector; - const lv_32fc_t* a = aVector; - - __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f); - - for(;number < halfPoints; number++){ - - x = _mm_loadu_ps((float*)a); // Load the complex data as ar,ai,br,bi - - x = _mm_xor_ps(x, conjugator); // conjugate register - - _mm_storeu_ps((float*)c,x); // Store the results back into the C container - - a += 2; - c += 2; - } - - if((num_points % 2) != 0) { - *c = lv_conj(*a); - } -} -#endif /* LV_HAVE_SSE3 */ - -#ifdef LV_HAVE_GENERIC - /*! - \brief Takes the conjugate of a complex vector. 
- \param cVector The vector where the results will be stored - \param aVector Vector to be conjugated - \param num_points The number of complex values in aVector to be conjugated and stored into cVector - */ -static inline void volk_gnsssdr_32fc_conjugate_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){ - lv_32fc_t* cPtr = cVector; - const lv_32fc_t* aPtr = aVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = lv_conj(*aPtr++); - } -} -#endif /* LV_HAVE_GENERIC */ - - -#endif /* INCLUDED_volk_gnsssdr_32fc_conjugate_32fc_u_H */ -#ifndef INCLUDED_volk_gnsssdr_32fc_conjugate_32fc_a_H -#define INCLUDED_volk_gnsssdr_32fc_conjugate_32fc_a_H - -#include -#include -#include -#include - -#ifdef LV_HAVE_SSE3 -#include - /*! - \brief Takes the conjugate of a complex vector. - \param cVector The vector where the results will be stored - \param aVector Vector to be conjugated - \param num_points The number of complex values in aVector to be conjugated and stored into cVector - */ -static inline void volk_gnsssdr_32fc_conjugate_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int halfPoints = num_points / 2; - - __m128 x; - lv_32fc_t* c = cVector; - const lv_32fc_t* a = aVector; - - __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f); - - for(;number < halfPoints; number++){ - - x = _mm_load_ps((float*)a); // Load the complex data as ar,ai,br,bi - - x = _mm_xor_ps(x, conjugator); // conjugate register - - _mm_store_ps((float*)c,x); // Store the results back into the C container - - a += 2; - c += 2; - } - - if((num_points % 2) != 0) { - *c = lv_conj(*a); - } -} -#endif /* LV_HAVE_SSE3 */ - -#ifdef LV_HAVE_GENERIC - /*! - \brief Takes the conjugate of a complex vector. 
- \param cVector The vector where the results will be stored - \param aVector Vector to be conjugated - \param num_points The number of complex values in aVector to be conjugated and stored into cVector - */ -static inline void volk_gnsssdr_32fc_conjugate_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){ - lv_32fc_t* cPtr = cVector; - const lv_32fc_t* aPtr = aVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = lv_conj(*aPtr++); - } -} -#endif /* LV_HAVE_GENERIC */ - -#endif /* INCLUDED_volk_gnsssdr_32fc_conjugate_32fc_a_H */ diff --git a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_magnitude_squared_32f.h b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_magnitude_squared_32f.h deleted file mode 100644 index ce28f866e..000000000 --- a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_magnitude_squared_32f.h +++ /dev/null @@ -1,228 +0,0 @@ -#ifndef INCLUDED_volk_gnsssdr_32fc_magnitude_squared_32f_u_H -#define INCLUDED_volk_gnsssdr_32fc_magnitude_squared_32f_u_H - -#include -#include -#include - -#ifdef LV_HAVE_SSE3 -#include - /*! 
- \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param num_points The number of complex values in complexVector to be calculated and stored into cVector - */ -static inline void volk_gnsssdr_32fc_magnitude_squared_32f_u_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const float* complexVectorPtr = (float*)complexVector; - float* magnitudeVectorPtr = magnitudeVector; - - __m128 cplxValue1, cplxValue2, result; - for(;number < quarterPoints; number++){ - cplxValue1 = _mm_loadu_ps(complexVectorPtr); - complexVectorPtr += 4; - - cplxValue2 = _mm_loadu_ps(complexVectorPtr); - complexVectorPtr += 4; - - cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values - cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values - - result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values - - _mm_storeu_ps(magnitudeVectorPtr, result); - magnitudeVectorPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - float val1Real = *complexVectorPtr++; - float val1Imag = *complexVectorPtr++; - *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); - } -} -#endif /* LV_HAVE_SSE3 */ - -#ifdef LV_HAVE_SSE -#include - /*! 
- \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param num_points The number of complex values in complexVector to be calculated and stored into cVector - */ -static inline void volk_gnsssdr_32fc_magnitude_squared_32f_u_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const float* complexVectorPtr = (float*)complexVector; - float* magnitudeVectorPtr = magnitudeVector; - - __m128 cplxValue1, cplxValue2, iValue, qValue, result; - for(;number < quarterPoints; number++){ - cplxValue1 = _mm_loadu_ps(complexVectorPtr); - complexVectorPtr += 4; - - cplxValue2 = _mm_loadu_ps(complexVectorPtr); - complexVectorPtr += 4; - - // Arrange in i1i2i3i4 format - iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); - // Arrange in q1q2q3q4 format - qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); - - iValue = _mm_mul_ps(iValue, iValue); // Square the I values - qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values - - result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values - - _mm_storeu_ps(magnitudeVectorPtr, result); - magnitudeVectorPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - float val1Real = *complexVectorPtr++; - float val1Imag = *complexVectorPtr++; - *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC - /*! 
- \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param num_points The number of complex values in complexVector to be calculated and stored into cVector - */ -static inline void volk_gnsssdr_32fc_magnitude_squared_32f_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ - const float* complexVectorPtr = (float*)complexVector; - float* magnitudeVectorPtr = magnitudeVector; - unsigned int number = 0; - for(number = 0; number < num_points; number++){ - const float real = *complexVectorPtr++; - const float imag = *complexVectorPtr++; - *magnitudeVectorPtr++ = (real*real) + (imag*imag); - } -} -#endif /* LV_HAVE_GENERIC */ - -#endif /* INCLUDED_volk_gnsssdr_32fc_magnitude_32f_u_H */ -#ifndef INCLUDED_volk_gnsssdr_32fc_magnitude_squared_32f_a_H -#define INCLUDED_volk_gnsssdr_32fc_magnitude_squared_32f_a_H - -#include -#include -#include - -#ifdef LV_HAVE_SSE3 -#include - /*! 
- \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param num_points The number of complex values in complexVector to be calculated and stored into cVector - */ -static inline void volk_gnsssdr_32fc_magnitude_squared_32f_a_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const float* complexVectorPtr = (float*)complexVector; - float* magnitudeVectorPtr = magnitudeVector; - - __m128 cplxValue1, cplxValue2, result; - for(;number < quarterPoints; number++){ - cplxValue1 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - cplxValue2 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values - cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values - - result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values - - _mm_store_ps(magnitudeVectorPtr, result); - magnitudeVectorPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - float val1Real = *complexVectorPtr++; - float val1Imag = *complexVectorPtr++; - *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); - } -} -#endif /* LV_HAVE_SSE3 */ - -#ifdef LV_HAVE_SSE -#include - /*! 
- \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param num_points The number of complex values in complexVector to be calculated and stored into cVector - */ -static inline void volk_gnsssdr_32fc_magnitude_squared_32f_a_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int quarterPoints = num_points / 4; - - const float* complexVectorPtr = (float*)complexVector; - float* magnitudeVectorPtr = magnitudeVector; - - __m128 cplxValue1, cplxValue2, iValue, qValue, result; - for(;number < quarterPoints; number++){ - cplxValue1 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - cplxValue2 = _mm_load_ps(complexVectorPtr); - complexVectorPtr += 4; - - // Arrange in i1i2i3i4 format - iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); - // Arrange in q1q2q3q4 format - qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); - - iValue = _mm_mul_ps(iValue, iValue); // Square the I values - qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values - - result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values - - _mm_store_ps(magnitudeVectorPtr, result); - magnitudeVectorPtr += 4; - } - - number = quarterPoints * 4; - for(; number < num_points; number++){ - float val1Real = *complexVectorPtr++; - float val1Imag = *complexVectorPtr++; - *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC - /*! 
- \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector - \param complexVector The vector containing the complex input values - \param magnitudeVector The vector containing the real output values - \param num_points The number of complex values in complexVector to be calculated and stored into cVector - */ -static inline void volk_gnsssdr_32fc_magnitude_squared_32f_a_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ - const float* complexVectorPtr = (float*)complexVector; - float* magnitudeVectorPtr = magnitudeVector; - unsigned int number = 0; - for(number = 0; number < num_points; number++){ - const float real = *complexVectorPtr++; - const float imag = *complexVectorPtr++; - *magnitudeVectorPtr++ = (real*real) + (imag*imag); - } -} -#endif /* LV_HAVE_GENERIC */ - -#endif /* INCLUDED_volk_gnsssdr_32fc_magnitude_32f_a_H */ diff --git a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc.h b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc.h index 0b5761176..de01b0d06 100644 --- a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc.h +++ b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc.h @@ -99,7 +99,7 @@ static inline void volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_u_sse4_1(lv_ if (num_points%4!=0) { __VOLK_ATTR_ALIGNED(16) float tcode_half_chips_stored[4]; - _mm_storeu_si128 ((__m128i*)tcode_half_chips_stored, tcode_half_chips_array); + _mm_storeu_ps ((float*)tcode_half_chips_stored, tcode_half_chips_array); int associated_chip_index; float tcode_half_chips = tcode_half_chips_stored[0]; @@ -215,7 +215,7 @@ static inline void volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_a_sse4_1(lv_ if (num_points%4!=0) { __VOLK_ATTR_ALIGNED(16) float 
tcode_half_chips_stored[4]; - _mm_store_si128 ((__m128i*)tcode_half_chips_stored, tcode_half_chips_array); + _mm_storeu_ps ((float*)tcode_half_chips_stored, tcode_half_chips_array); int associated_chip_index; float tcode_half_chips = tcode_half_chips_stored[0]; diff --git a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32fc_multiply_32fc.h b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32fc_multiply_32fc.h deleted file mode 100644 index d5135d89f..000000000 --- a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32fc_multiply_32fc.h +++ /dev/null @@ -1,178 +0,0 @@ -#ifndef INCLUDED_volk_gnsssdr_32fc_s32fc_multiply_32fc_u_H -#define INCLUDED_volk_gnsssdr_32fc_s32fc_multiply_32fc_u_H - -#include -#include -#include -#include - -#ifdef LV_HAVE_SSE3 -#include -/*! - \brief Multiplies the input vector by a scalar and stores the results in the third vector - \param cVector The vector where the results will be stored - \param aVector The vector to be multiplied - \param scalar The complex scalar to multiply aVector - \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector -*/ -static inline void volk_gnsssdr_32fc_s32fc_multiply_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int halfPoints = num_points / 2; - - __m128 x, yl, yh, z, tmp1, tmp2; - lv_32fc_t* c = cVector; - const lv_32fc_t* a = aVector; - - // Set up constant scalar vector - yl = _mm_set_ps1(lv_creal(scalar)); - yh = _mm_set_ps1(lv_cimag(scalar)); - - for(;number < halfPoints; number++){ - - x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi - - tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - - x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br - - tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - - z = 
_mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - - _mm_storeu_ps((float*)c,z); // Store the results back into the C container - - a += 2; - c += 2; - } - - if((num_points % 2) != 0) { - *c = (*a) * scalar; - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC -/*! - \brief Multiplies the input vector by a scalar and stores the results in the third vector - \param cVector The vector where the results will be stored - \param aVector The vector to be multiplied - \param scalar The complex scalar to multiply aVector - \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector -*/ -static inline void volk_gnsssdr_32fc_s32fc_multiply_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){ - lv_32fc_t* cPtr = cVector; - const lv_32fc_t* aPtr = aVector; - unsigned int number = num_points; - - // unwrap loop - while (number >= 8){ - *cPtr++ = (*aPtr++) * scalar; - *cPtr++ = (*aPtr++) * scalar; - *cPtr++ = (*aPtr++) * scalar; - *cPtr++ = (*aPtr++) * scalar; - *cPtr++ = (*aPtr++) * scalar; - *cPtr++ = (*aPtr++) * scalar; - *cPtr++ = (*aPtr++) * scalar; - *cPtr++ = (*aPtr++) * scalar; - number -= 8; - } - - // clean up any remaining - while (number-- > 0) - *cPtr++ = *aPtr++ * scalar; -} -#endif /* LV_HAVE_GENERIC */ - - -#endif /* INCLUDED_volk_gnsssdr_32fc_x2_multiply_32fc_u_H */ -#ifndef INCLUDED_volk_gnsssdr_32fc_s32fc_multiply_32fc_a_H -#define INCLUDED_volk_gnsssdr_32fc_s32fc_multiply_32fc_a_H - -#include -#include -#include -#include - -#ifdef LV_HAVE_SSE3 -#include - /*! 
- \brief Multiplies the two input complex vectors and stores their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be multiplied - \param bVector One of the vectors to be multiplied - \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector - */ -static inline void volk_gnsssdr_32fc_s32fc_multiply_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){ - unsigned int number = 0; - const unsigned int halfPoints = num_points / 2; - - __m128 x, yl, yh, z, tmp1, tmp2; - lv_32fc_t* c = cVector; - const lv_32fc_t* a = aVector; - - // Set up constant scalar vector - yl = _mm_set_ps1(lv_creal(scalar)); - yh = _mm_set_ps1(lv_cimag(scalar)); - - for(;number < halfPoints; number++){ - - x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi - - tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - - x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br - - tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - - z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - - _mm_store_ps((float*)c,z); // Store the results back into the C container - - a += 2; - c += 2; - } - - if((num_points % 2) != 0) { - *c = (*a) * scalar; - } -} -#endif /* LV_HAVE_SSE */ - - -#ifdef LV_HAVE_GENERIC - /*! 
- \brief Multiplies the two input complex vectors and stores their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be multiplied - \param bVector One of the vectors to be multiplied - \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector - */ -static inline void volk_gnsssdr_32fc_s32fc_multiply_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){ - lv_32fc_t* cPtr = cVector; - const lv_32fc_t* aPtr = aVector; - unsigned int number = num_points; - - // unwrap loop - while (number >= 8){ - *cPtr++ = (*aPtr++) * scalar; - *cPtr++ = (*aPtr++) * scalar; - *cPtr++ = (*aPtr++) * scalar; - *cPtr++ = (*aPtr++) * scalar; - *cPtr++ = (*aPtr++) * scalar; - *cPtr++ = (*aPtr++) * scalar; - *cPtr++ = (*aPtr++) * scalar; - *cPtr++ = (*aPtr++) * scalar; - number -= 8; - } - - // clean up any remaining - while (number-- > 0) - *cPtr++ = *aPtr++ * scalar; -} -#endif /* LV_HAVE_GENERIC */ - - - - - -#endif /* INCLUDED_volk_gnsssdr_32fc_x2_multiply_32fc_a_H */ diff --git a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_dot_prod_32fc.h b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_dot_prod_32fc.h deleted file mode 100644 index 521719699..000000000 --- a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_dot_prod_32fc.h +++ /dev/null @@ -1,759 +0,0 @@ -#ifndef INCLUDED_volk_gnsssdr_32fc_x2_dot_prod_32fc_u_H -#define INCLUDED_volk_gnsssdr_32fc_x2_dot_prod_32fc_u_H - -#include -#include -#include -#include - - -#ifdef LV_HAVE_GENERIC - - -static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { - - float * res = (float*) result; - float * in = (float*) input; - float * tp = (float*) taps; - unsigned int 
n_2_ccomplex_blocks = num_points/2; - unsigned int isodd = num_points & 1; - - float sum0[2] = {0,0}; - float sum1[2] = {0,0}; - unsigned int i = 0; - - for(i = 0; i < n_2_ccomplex_blocks; ++i) { - sum0[0] += in[0] * tp[0] - in[1] * tp[1]; - sum0[1] += in[0] * tp[1] + in[1] * tp[0]; - sum1[0] += in[2] * tp[2] - in[3] * tp[3]; - sum1[1] += in[2] * tp[3] + in[3] * tp[2]; - - in += 4; - tp += 4; - } - - res[0] = sum0[0] + sum1[0]; - res[1] = sum0[1] + sum1[1]; - - // Cleanup if we had an odd number of points - for(i = 0; i < isodd; ++i) { - *result += input[num_points - 1] * taps[num_points - 1]; - } -} - -#endif /*LV_HAVE_GENERIC*/ - - - -#if LV_HAVE_SSE && LV_HAVE_64 - -static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_u_sse_64(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { - - const unsigned int num_bytes = num_points*8; - unsigned int isodd = num_points & 1; - - asm - ( - "# ccomplex_dotprod_generic (float* result, const float *input,\n\t" - "# const float *taps, unsigned num_bytes)\n\t" - "# float sum0 = 0;\n\t" - "# float sum1 = 0;\n\t" - "# float sum2 = 0;\n\t" - "# float sum3 = 0;\n\t" - "# do {\n\t" - "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t" - "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t" - "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t" - "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t" - "# input += 4;\n\t" - "# taps += 4; \n\t" - "# } while (--n_2_ccomplex_blocks != 0);\n\t" - "# result[0] = sum0 + sum2;\n\t" - "# result[1] = sum1 + sum3;\n\t" - "# TODO: prefetch and better scheduling\n\t" - " xor %%r9, %%r9\n\t" - " xor %%r10, %%r10\n\t" - " movq %%rcx, %%rax\n\t" - " movq %%rcx, %%r8\n\t" - " movq %[rsi], %%r9\n\t" - " movq %[rdx], %%r10\n\t" - " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" - " movups 0(%%r9), %%xmm0\n\t" - " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" - " movups 0(%%r10), %%xmm2\n\t" - " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t" - " 
shr $4, %%r8\n\t" - " jmp .%=L1_test\n\t" - " # 4 taps / loop\n\t" - " # something like ?? cycles / loop\n\t" - ".%=Loop1: \n\t" - "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" - "# movups (%%r9), %%xmmA\n\t" - "# movups (%%r10), %%xmmB\n\t" - "# movups %%xmmA, %%xmmZ\n\t" - "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" - "# mulps %%xmmB, %%xmmA\n\t" - "# mulps %%xmmZ, %%xmmB\n\t" - "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" - "# xorps %%xmmPN, %%xmmA\n\t" - "# movups %%xmmA, %%xmmZ\n\t" - "# unpcklps %%xmmB, %%xmmA\n\t" - "# unpckhps %%xmmB, %%xmmZ\n\t" - "# movups %%xmmZ, %%xmmY\n\t" - "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" - "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" - "# addps %%xmmZ, %%xmmA\n\t" - "# addps %%xmmA, %%xmmC\n\t" - "# A=xmm0, B=xmm2, Z=xmm4\n\t" - "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" - " movups 16(%%r9), %%xmm1\n\t" - " movups %%xmm0, %%xmm4\n\t" - " mulps %%xmm2, %%xmm0\n\t" - " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" - " movups 16(%%r10), %%xmm3\n\t" - " movups %%xmm1, %%xmm5\n\t" - " addps %%xmm0, %%xmm6\n\t" - " mulps %%xmm3, %%xmm1\n\t" - " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" - " addps %%xmm1, %%xmm6\n\t" - " mulps %%xmm4, %%xmm2\n\t" - " movups 32(%%r9), %%xmm0\n\t" - " addps %%xmm2, %%xmm7\n\t" - " mulps %%xmm5, %%xmm3\n\t" - " add $32, %%r9\n\t" - " movups 32(%%r10), %%xmm2\n\t" - " addps %%xmm3, %%xmm7\n\t" - " add $32, %%r10\n\t" - ".%=L1_test:\n\t" - " dec %%rax\n\t" - " jge .%=Loop1\n\t" - " # We've handled the bulk of multiplies up to here.\n\t" - " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" - " # If so, we've got 2 more taps to do.\n\t" - " and $1, %%r8\n\t" - " je .%=Leven\n\t" - " # The count was odd, do 2 more taps.\n\t" - " # Note that we've already got mm0/mm2 preloaded\n\t" - " # from the main loop.\n\t" - " movups %%xmm0, %%xmm4\n\t" - " mulps %%xmm2, %%xmm0\n\t" - " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" - 
" addps %%xmm0, %%xmm6\n\t" - " mulps %%xmm4, %%xmm2\n\t" - " addps %%xmm2, %%xmm7\n\t" - ".%=Leven:\n\t" - " # neg inversor\n\t" - " xorps %%xmm1, %%xmm1\n\t" - " mov $0x80000000, %%r9\n\t" - " movd %%r9, %%xmm1\n\t" - " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" - " # pfpnacc\n\t" - " xorps %%xmm1, %%xmm6\n\t" - " movups %%xmm6, %%xmm2\n\t" - " unpcklps %%xmm7, %%xmm6\n\t" - " unpckhps %%xmm7, %%xmm2\n\t" - " movups %%xmm2, %%xmm3\n\t" - " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" - " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" - " addps %%xmm2, %%xmm6\n\t" - " # xmm6 = r1 i2 r3 i4\n\t" - " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" - " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t" - " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) to memory\n\t" - : - :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result) - :"rax", "r8", "r9", "r10" - ); - - - if(isodd) { - *result += input[num_points - 1] * taps[num_points - 1]; - } - - return; - -} - -#endif /* LV_HAVE_SSE && LV_HAVE_64 */ - - - - -#ifdef LV_HAVE_SSE3 -#include - -static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_u_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { - - lv_32fc_t dotProduct; - memset(&dotProduct, 0x0, 2*sizeof(float)); - - unsigned int number = 0; - const unsigned int halfPoints = num_points/2; - unsigned int isodd = num_points & 1; - - __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal; - - const lv_32fc_t* a = input; - const lv_32fc_t* b = taps; - - dotProdVal = _mm_setzero_ps(); - - for(;number < halfPoints; number++){ - - x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi - y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di - - yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di - - tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - - x = _mm_shuffle_ps(x,x,0xB1); // 
Re-arrange x to be ai,ar,bi,br - - tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - - z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - - dotProdVal = _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together - - a += 2; - b += 2; - } - - __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2]; - - _mm_storeu_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector - - dotProduct += ( dotProductVector[0] + dotProductVector[1] ); - - if(isodd) { - dotProduct += input[num_points - 1] * taps[num_points - 1]; - } - - *result = dotProduct; -} - -#endif /*LV_HAVE_SSE3*/ - -#ifdef LV_HAVE_SSE4_1 -#include - -static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_u_sse4_1(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { - - unsigned int i = 0; - const unsigned int qtr_points = num_points/4; - const unsigned int isodd = num_points & 3; - - __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1; - float *p_input, *p_taps; - __m64 *p_result; - - p_result = (__m64*)result; - p_input = (float*)input; - p_taps = (float*)taps; - - static const __m128i neg = {0x000000000000000080000000}; - - real0 = _mm_setzero_ps(); - real1 = _mm_setzero_ps(); - im0 = _mm_setzero_ps(); - im1 = _mm_setzero_ps(); - - for(; i < qtr_points; ++i) { - xmm0 = _mm_loadu_ps(p_input); - xmm1 = _mm_loadu_ps(p_taps); - - p_input += 4; - p_taps += 4; - - xmm2 = _mm_loadu_ps(p_input); - xmm3 = _mm_loadu_ps(p_taps); - - p_input += 4; - p_taps += 4; - - xmm4 = _mm_unpackhi_ps(xmm0, xmm2); - xmm5 = _mm_unpackhi_ps(xmm1, xmm3); - xmm0 = _mm_unpacklo_ps(xmm0, xmm2); - xmm2 = _mm_unpacklo_ps(xmm1, xmm3); - - //imaginary vector from input - xmm1 = _mm_unpackhi_ps(xmm0, xmm4); - //real vector from input - xmm3 = _mm_unpacklo_ps(xmm0, xmm4); - //imaginary vector from taps - xmm0 = _mm_unpackhi_ps(xmm2, xmm5); - //real vector from taps - xmm2 = 
_mm_unpacklo_ps(xmm2, xmm5); - - xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1); - xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1); - - xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2); - xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2); - - real0 = _mm_add_ps(xmm4, real0); - real1 = _mm_add_ps(xmm5, real1); - im0 = _mm_add_ps(xmm6, im0); - im1 = _mm_add_ps(xmm7, im1); - } - - real1 = _mm_xor_ps(real1, bit128_p(&neg)->float_vec); - - im0 = _mm_add_ps(im0, im1); - real0 = _mm_add_ps(real0, real1); - - im0 = _mm_add_ps(im0, real0); - - _mm_storel_pi(p_result, im0); - - for(i = num_points-isodd; i < num_points; i++) { - *result += input[i] * taps[i]; - } -} - -#endif /*LV_HAVE_SSE4_1*/ - - - - -#endif /*INCLUDED_volk_gnsssdr_32fc_x2_dot_prod_32fc_u_H*/ -#ifndef INCLUDED_volk_gnsssdr_32fc_x2_dot_prod_32fc_a_H -#define INCLUDED_volk_gnsssdr_32fc_x2_dot_prod_32fc_a_H - -#include -#include -#include -#include - - -#ifdef LV_HAVE_GENERIC - - -static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_a_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { - - const unsigned int num_bytes = num_points*8; - - float * res = (float*) result; - float * in = (float*) input; - float * tp = (float*) taps; - unsigned int n_2_ccomplex_blocks = num_bytes >> 4; - unsigned int isodd = num_points & 1; - - float sum0[2] = {0,0}; - float sum1[2] = {0,0}; - unsigned int i = 0; - - for(i = 0; i < n_2_ccomplex_blocks; ++i) { - sum0[0] += in[0] * tp[0] - in[1] * tp[1]; - sum0[1] += in[0] * tp[1] + in[1] * tp[0]; - sum1[0] += in[2] * tp[2] - in[3] * tp[3]; - sum1[1] += in[2] * tp[3] + in[3] * tp[2]; - - in += 4; - tp += 4; - } - - res[0] = sum0[0] + sum1[0]; - res[1] = sum0[1] + sum1[1]; - - for(i = 0; i < isodd; ++i) { - *result += input[num_points - 1] * taps[num_points - 1]; - } -} - -#endif /*LV_HAVE_GENERIC*/ - - -#if LV_HAVE_SSE && LV_HAVE_64 - - -static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_a_sse_64(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) 
{ - - const unsigned int num_bytes = num_points*8; - unsigned int isodd = num_points & 1; - - asm - ( - "# ccomplex_dotprod_generic (float* result, const float *input,\n\t" - "# const float *taps, unsigned num_bytes)\n\t" - "# float sum0 = 0;\n\t" - "# float sum1 = 0;\n\t" - "# float sum2 = 0;\n\t" - "# float sum3 = 0;\n\t" - "# do {\n\t" - "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t" - "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t" - "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t" - "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t" - "# input += 4;\n\t" - "# taps += 4; \n\t" - "# } while (--n_2_ccomplex_blocks != 0);\n\t" - "# result[0] = sum0 + sum2;\n\t" - "# result[1] = sum1 + sum3;\n\t" - "# TODO: prefetch and better scheduling\n\t" - " xor %%r9, %%r9\n\t" - " xor %%r10, %%r10\n\t" - " movq %%rcx, %%rax\n\t" - " movq %%rcx, %%r8\n\t" - " movq %[rsi], %%r9\n\t" - " movq %[rdx], %%r10\n\t" - " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" - " movaps 0(%%r9), %%xmm0\n\t" - " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" - " movaps 0(%%r10), %%xmm2\n\t" - " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t" - " shr $4, %%r8\n\t" - " jmp .%=L1_test\n\t" - " # 4 taps / loop\n\t" - " # something like ?? 
cycles / loop\n\t" - ".%=Loop1: \n\t" - "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" - "# movaps (%%r9), %%xmmA\n\t" - "# movaps (%%r10), %%xmmB\n\t" - "# movaps %%xmmA, %%xmmZ\n\t" - "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" - "# mulps %%xmmB, %%xmmA\n\t" - "# mulps %%xmmZ, %%xmmB\n\t" - "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" - "# xorps %%xmmPN, %%xmmA\n\t" - "# movaps %%xmmA, %%xmmZ\n\t" - "# unpcklps %%xmmB, %%xmmA\n\t" - "# unpckhps %%xmmB, %%xmmZ\n\t" - "# movaps %%xmmZ, %%xmmY\n\t" - "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" - "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" - "# addps %%xmmZ, %%xmmA\n\t" - "# addps %%xmmA, %%xmmC\n\t" - "# A=xmm0, B=xmm2, Z=xmm4\n\t" - "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" - " movaps 16(%%r9), %%xmm1\n\t" - " movaps %%xmm0, %%xmm4\n\t" - " mulps %%xmm2, %%xmm0\n\t" - " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" - " movaps 16(%%r10), %%xmm3\n\t" - " movaps %%xmm1, %%xmm5\n\t" - " addps %%xmm0, %%xmm6\n\t" - " mulps %%xmm3, %%xmm1\n\t" - " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" - " addps %%xmm1, %%xmm6\n\t" - " mulps %%xmm4, %%xmm2\n\t" - " movaps 32(%%r9), %%xmm0\n\t" - " addps %%xmm2, %%xmm7\n\t" - " mulps %%xmm5, %%xmm3\n\t" - " add $32, %%r9\n\t" - " movaps 32(%%r10), %%xmm2\n\t" - " addps %%xmm3, %%xmm7\n\t" - " add $32, %%r10\n\t" - ".%=L1_test:\n\t" - " dec %%rax\n\t" - " jge .%=Loop1\n\t" - " # We've handled the bulk of multiplies up to here.\n\t" - " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" - " # If so, we've got 2 more taps to do.\n\t" - " and $1, %%r8\n\t" - " je .%=Leven\n\t" - " # The count was odd, do 2 more taps.\n\t" - " # Note that we've already got mm0/mm2 preloaded\n\t" - " # from the main loop.\n\t" - " movaps %%xmm0, %%xmm4\n\t" - " mulps %%xmm2, %%xmm0\n\t" - " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" - " addps %%xmm0, %%xmm6\n\t" - " mulps %%xmm4, %%xmm2\n\t" - " addps %%xmm2, %%xmm7\n\t" - 
".%=Leven:\n\t" - " # neg inversor\n\t" - " xorps %%xmm1, %%xmm1\n\t" - " mov $0x80000000, %%r9\n\t" - " movd %%r9, %%xmm1\n\t" - " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" - " # pfpnacc\n\t" - " xorps %%xmm1, %%xmm6\n\t" - " movaps %%xmm6, %%xmm2\n\t" - " unpcklps %%xmm7, %%xmm6\n\t" - " unpckhps %%xmm7, %%xmm2\n\t" - " movaps %%xmm2, %%xmm3\n\t" - " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" - " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" - " addps %%xmm2, %%xmm6\n\t" - " # xmm6 = r1 i2 r3 i4\n\t" - " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" - " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t" - " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) to memory\n\t" - : - :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result) - :"rax", "r8", "r9", "r10" - ); - - - if(isodd) { - *result += input[num_points - 1] * taps[num_points - 1]; - } - - return; - -} - -#endif - -#if LV_HAVE_SSE && LV_HAVE_32 - -static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_a_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { - - volk_gnsssdr_32fc_x2_dot_prod_32fc_a_generic(result, input, taps, num_points); - -#if 0 - const unsigned int num_bytes = num_points*8; - unsigned int isodd = num_points & 1; - - asm volatile - ( - " #pushl %%ebp\n\t" - " #movl %%esp, %%ebp\n\t" - " movl 12(%%ebp), %%eax # input\n\t" - " movl 16(%%ebp), %%edx # taps\n\t" - " movl 20(%%ebp), %%ecx # n_bytes\n\t" - " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" - " movaps 0(%%eax), %%xmm0\n\t" - " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" - " movaps 0(%%edx), %%xmm2\n\t" - " shrl $5, %%ecx # ecx = n_2_ccomplex_blocks / 2\n\t" - " jmp .%=L1_test\n\t" - " # 4 taps / loop\n\t" - " # something like ?? 
cycles / loop\n\t" - ".%=Loop1: \n\t" - "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" - "# movaps (%%eax), %%xmmA\n\t" - "# movaps (%%edx), %%xmmB\n\t" - "# movaps %%xmmA, %%xmmZ\n\t" - "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" - "# mulps %%xmmB, %%xmmA\n\t" - "# mulps %%xmmZ, %%xmmB\n\t" - "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" - "# xorps %%xmmPN, %%xmmA\n\t" - "# movaps %%xmmA, %%xmmZ\n\t" - "# unpcklps %%xmmB, %%xmmA\n\t" - "# unpckhps %%xmmB, %%xmmZ\n\t" - "# movaps %%xmmZ, %%xmmY\n\t" - "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" - "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" - "# addps %%xmmZ, %%xmmA\n\t" - "# addps %%xmmA, %%xmmC\n\t" - "# A=xmm0, B=xmm2, Z=xmm4\n\t" - "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" - " movaps 16(%%eax), %%xmm1\n\t" - " movaps %%xmm0, %%xmm4\n\t" - " mulps %%xmm2, %%xmm0\n\t" - " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" - " movaps 16(%%edx), %%xmm3\n\t" - " movaps %%xmm1, %%xmm5\n\t" - " addps %%xmm0, %%xmm6\n\t" - " mulps %%xmm3, %%xmm1\n\t" - " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" - " addps %%xmm1, %%xmm6\n\t" - " mulps %%xmm4, %%xmm2\n\t" - " movaps 32(%%eax), %%xmm0\n\t" - " addps %%xmm2, %%xmm7\n\t" - " mulps %%xmm5, %%xmm3\n\t" - " addl $32, %%eax\n\t" - " movaps 32(%%edx), %%xmm2\n\t" - " addps %%xmm3, %%xmm7\n\t" - " addl $32, %%edx\n\t" - ".%=L1_test:\n\t" - " decl %%ecx\n\t" - " jge .%=Loop1\n\t" - " # We've handled the bulk of multiplies up to here.\n\t" - " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" - " # If so, we've got 2 more taps to do.\n\t" - " movl 20(%%ebp), %%ecx # n_2_ccomplex_blocks\n\t" - " shrl $4, %%ecx\n\t" - " andl $1, %%ecx\n\t" - " je .%=Leven\n\t" - " # The count was odd, do 2 more taps.\n\t" - " # Note that we've already got mm0/mm2 preloaded\n\t" - " # from the main loop.\n\t" - " movaps %%xmm0, %%xmm4\n\t" - " mulps %%xmm2, %%xmm0\n\t" - " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" - " 
addps %%xmm0, %%xmm6\n\t" - " mulps %%xmm4, %%xmm2\n\t" - " addps %%xmm2, %%xmm7\n\t" - ".%=Leven:\n\t" - " # neg inversor\n\t" - " movl 8(%%ebp), %%eax \n\t" - " xorps %%xmm1, %%xmm1\n\t" - " movl $0x80000000, (%%eax)\n\t" - " movss (%%eax), %%xmm1\n\t" - " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" - " # pfpnacc\n\t" - " xorps %%xmm1, %%xmm6\n\t" - " movaps %%xmm6, %%xmm2\n\t" - " unpcklps %%xmm7, %%xmm6\n\t" - " unpckhps %%xmm7, %%xmm2\n\t" - " movaps %%xmm2, %%xmm3\n\t" - " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" - " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" - " addps %%xmm2, %%xmm6\n\t" - " # xmm6 = r1 i2 r3 i4\n\t" - " #movl 8(%%ebp), %%eax # @result\n\t" - " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" - " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t" - " movlps %%xmm6, (%%eax) # store low 2x32 bits (complex) to memory\n\t" - " #popl %%ebp\n\t" - : - : - : "eax", "ecx", "edx" - ); - - - int getem = num_bytes % 16; - - if(isodd) { - *result += (input[num_points - 1] * taps[num_points - 1]); - } - - return; -#endif -} - -#endif /*LV_HAVE_SSE*/ - -#ifdef LV_HAVE_SSE3 -#include - -static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_a_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { - - const unsigned int num_bytes = num_points*8; - unsigned int isodd = num_points & 1; - - lv_32fc_t dotProduct; - memset(&dotProduct, 0x0, 2*sizeof(float)); - - unsigned int number = 0; - const unsigned int halfPoints = num_bytes >> 4; - - __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal; - - const lv_32fc_t* a = input; - const lv_32fc_t* b = taps; - - dotProdVal = _mm_setzero_ps(); - - for(;number < halfPoints; number++){ - - x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi - y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di - - yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di - - tmp1 = 
_mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - - x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br - - tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - - z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - - dotProdVal = _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together - - a += 2; - b += 2; - } - - __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2]; - - _mm_store_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector - - dotProduct += ( dotProductVector[0] + dotProductVector[1] ); - - if(isodd) { - dotProduct += input[num_points - 1] * taps[num_points - 1]; - } - - *result = dotProduct; -} - -#endif /*LV_HAVE_SSE3*/ - -#ifdef LV_HAVE_SSE4_1 -#include - -static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_a_sse4_1(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { - - unsigned int i = 0; - const unsigned int qtr_points = num_points/4; - const unsigned int isodd = num_points & 3; - - __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1; - float *p_input, *p_taps; - __m64 *p_result; - - static const __m128i neg = {0x000000000000000080000000}; - - p_result = (__m64*)result; - p_input = (float*)input; - p_taps = (float*)taps; - - real0 = _mm_setzero_ps(); - real1 = _mm_setzero_ps(); - im0 = _mm_setzero_ps(); - im1 = _mm_setzero_ps(); - - for(; i < qtr_points; ++i) { - xmm0 = _mm_load_ps(p_input); - xmm1 = _mm_load_ps(p_taps); - - p_input += 4; - p_taps += 4; - - xmm2 = _mm_load_ps(p_input); - xmm3 = _mm_load_ps(p_taps); - - p_input += 4; - p_taps += 4; - - xmm4 = _mm_unpackhi_ps(xmm0, xmm2); - xmm5 = _mm_unpackhi_ps(xmm1, xmm3); - xmm0 = _mm_unpacklo_ps(xmm0, xmm2); - xmm2 = _mm_unpacklo_ps(xmm1, xmm3); - - //imaginary vector from input - xmm1 = _mm_unpackhi_ps(xmm0, xmm4); - //real vector from input - xmm3 = _mm_unpacklo_ps(xmm0, xmm4); - //imaginary vector from 
taps - xmm0 = _mm_unpackhi_ps(xmm2, xmm5); - //real vector from taps - xmm2 = _mm_unpacklo_ps(xmm2, xmm5); - - xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1); - xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1); - - xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2); - xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2); - - real0 = _mm_add_ps(xmm4, real0); - real1 = _mm_add_ps(xmm5, real1); - im0 = _mm_add_ps(xmm6, im0); - im1 = _mm_add_ps(xmm7, im1); - } - - real1 = _mm_xor_ps(real1, bit128_p(&neg)->float_vec); - - im0 = _mm_add_ps(im0, im1); - real0 = _mm_add_ps(real0, real1); - - im0 = _mm_add_ps(im0, real0); - - _mm_storel_pi(p_result, im0); - - for(i = num_points-isodd; i < num_points; i++) { - *result += input[i] * taps[i]; - } -} - -#endif /*LV_HAVE_SSE4_1*/ - -#endif /*INCLUDED_volk_gnsssdr_32fc_x2_dot_prod_32fc_a_H*/ diff --git a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_multiply_32fc.h b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_multiply_32fc.h deleted file mode 100644 index e2b17c401..000000000 --- a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_multiply_32fc.h +++ /dev/null @@ -1,170 +0,0 @@ -#ifndef INCLUDED_volk_gnsssdr_32fc_x2_multiply_32fc_u_H -#define INCLUDED_volk_gnsssdr_32fc_x2_multiply_32fc_u_H - -#include -#include -#include -#include - -#ifdef LV_HAVE_SSE3 -#include - /*! 
- \brief Multiplies the two input complex vectors and stores their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be multiplied - \param bVector One of the vectors to be multiplied - \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector - */ -static inline void volk_gnsssdr_32fc_x2_multiply_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int halfPoints = num_points / 2; - - __m128 x, y, yl, yh, z, tmp1, tmp2; - lv_32fc_t* c = cVector; - const lv_32fc_t* a = aVector; - const lv_32fc_t* b = bVector; - - for(;number < halfPoints; number++){ - - x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi - y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di - - yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di - - tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - - x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br - - tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - - z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - - _mm_storeu_ps((float*)c,z); // Store the results back into the C container - - a += 2; - b += 2; - c += 2; - } - - if((num_points % 2) != 0) { - *c = (*a) * (*b); - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC - /*! 
- \brief Multiplies the two input complex vectors and stores their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be multiplied - \param bVector One of the vectors to be multiplied - \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector - */ -static inline void volk_gnsssdr_32fc_x2_multiply_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ - lv_32fc_t* cPtr = cVector; - const lv_32fc_t* aPtr = aVector; - const lv_32fc_t* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = (*aPtr++) * (*bPtr++); - } -} -#endif /* LV_HAVE_GENERIC */ - - -#endif /* INCLUDED_volk_gnsssdr_32fc_x2_multiply_32fc_u_H */ -#ifndef INCLUDED_volk_gnsssdr_32fc_x2_multiply_32fc_a_H -#define INCLUDED_volk_gnsssdr_32fc_x2_multiply_32fc_a_H - -#include -#include -#include -#include - -#ifdef LV_HAVE_SSE3 -#include - /*! 
- \brief Multiplies the two input complex vectors and stores their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be multiplied - \param bVector One of the vectors to be multiplied - \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector - */ -static inline void volk_gnsssdr_32fc_x2_multiply_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ - unsigned int number = 0; - const unsigned int halfPoints = num_points / 2; - - __m128 x, y, yl, yh, z, tmp1, tmp2; - lv_32fc_t* c = cVector; - const lv_32fc_t* a = aVector; - const lv_32fc_t* b = bVector; - for(;number < halfPoints; number++){ - - x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi - y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di - - yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di - - tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - - x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br - - tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - - z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - - _mm_store_ps((float*)c,z); // Store the results back into the C container - - a += 2; - b += 2; - c += 2; - } - - if((num_points % 2) != 0) { - *c = (*a) * (*b); - } -} -#endif /* LV_HAVE_SSE */ - -#ifdef LV_HAVE_GENERIC - /*! 
- \brief Multiplies the two input complex vectors and stores their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be multiplied - \param bVector One of the vectors to be multiplied - \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector - */ -static inline void volk_gnsssdr_32fc_x2_multiply_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ - lv_32fc_t* cPtr = cVector; - const lv_32fc_t* aPtr = aVector; - const lv_32fc_t* bPtr= bVector; - unsigned int number = 0; - - for(number = 0; number < num_points; number++){ - *cPtr++ = (*aPtr++) * (*bPtr++); - } -} -#endif /* LV_HAVE_GENERIC */ - -#ifdef LV_HAVE_ORC - /*! - \brief Multiplies the two input complex vectors and stores their results in the third vector - \param cVector The vector where the results will be stored - \param aVector One of the vectors to be multiplied - \param bVector One of the vectors to be multiplied - \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector - */ -extern void volk_gnsssdr_32fc_x2_multiply_32fc_a_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points); -static inline void volk_gnsssdr_32fc_x2_multiply_32fc_u_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ - volk_gnsssdr_32fc_x2_multiply_32fc_a_orc_impl(cVector, aVector, bVector, num_points); -} -#endif /* LV_HAVE_ORC */ - - - - - -#endif /* INCLUDED_volk_gnsssdr_32fc_x2_multiply_32fc_a_H */ diff --git a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3.h b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3.h index 4a8ac96e9..bc1c1333a 100644 --- 
a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3.h +++ b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3.h @@ -1,3 +1,49 @@ +/*! + * \file volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3.h + * \brief Volk protokernel: performs the carrier wipe-off mixing and the Early, Prompt and Late correlation with 64 bits vectors + * \authors
    + *
  • Javier Arribas, 2011. jarribas(at)cttc.es + *
  • Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com + *
+ * + * Volk protokernel that performs the carrier wipe-off mixing and the + * Early, Prompt and Late correlation with 64 bits vectors (32 bits the + * real part and 32 bits the imaginary part): + * - The carrier wipe-off is done by multiplying the input signal by the + * carrier (multiplication of 64 bits vectors). It returns the input + * signal in base band (BB) + * - Early values are calculated by multiplying the input signal in BB by the + * early code (multiplication of 64 bits vectors), accumulating the results + * - Prompt values are calculated by multiplying the input signal in BB by the + * prompt code (multiplication of 64 bits vectors), accumulating the results + * - Late values are calculated by multiplying the input signal in BB by the + * late code (multiplication of 64 bits vectors), accumulating the results + * + * ------------------------------------------------------------------------- + * + * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) + * + * GNSS-SDR is a software defined Global Navigation + * Satellite Systems receiver + * + * This file is part of GNSS-SDR. + * + * GNSS-SDR is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * GNSS-SDR is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+ * + * ------------------------------------------------------------------------- + */ + #ifndef INCLUDED_gnsssdr_volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_u_H #define INCLUDED_gnsssdr_volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_u_H diff --git a/src/algorithms/libs/volk_gnsssdr/lib/testqa.cc b/src/algorithms/libs/volk_gnsssdr/lib/testqa.cc index 45397c87e..93b67ff57 100644 --- a/src/algorithms/libs/volk_gnsssdr/lib/testqa.cc +++ b/src/algorithms/libs/volk_gnsssdr/lib/testqa.cc @@ -24,17 +24,6 @@ #include #include -//VOLK PROTOKERNELS OBTAINED FROM THE GNURADIO BASE -VOLK_RUN_TESTS(volk_gnsssdr_32fc_x2_multiply_32fc, 1e-4, 0, 20462, 1); -VOLK_RUN_TESTS(volk_gnsssdr_32fc_x2_dot_prod_32fc, 1e-4, 0, 204603, 1); -VOLK_RUN_TESTS(volk_gnsssdr_32fc_s32fc_multiply_32fc, 1e-4, 0, 20462, 1); -VOLK_RUN_TESTS(volk_gnsssdr_32fc_conjugate_32fc, 1e-4, 0, 20462, 1); -VOLK_RUN_TESTS(volk_gnsssdr_32f_x2_add_32f, 1e-4, 0, 20462, 1); -VOLK_RUN_TESTS(volk_gnsssdr_32f_index_max_16u, 3, 0, 20462, 1); -VOLK_RUN_TESTS(volk_gnsssdr_32f_accumulator_s32f, 1e-4, 0, 20462, 1); -VOLK_RUN_TESTS(volk_gnsssdr_32fc_magnitude_squared_32f, 1e-4, 0, 20462, 1); -VOLK_RUN_TESTS(volk_gnsssdr_32f_s32f_convert_16i, 3, 0, 20462, 1); - //GNSS-SDR PROTO-KERNELS VOLK_RUN_TESTS(volk_gnsssdr_8ic_x2_multiply_8ic, 1e-4, 0, 20462, 1); VOLK_RUN_TESTS(volk_gnsssdr_8u_x2_multiply_8u, 1e-4, 0, 20462, 1); @@ -52,7 +41,6 @@ VOLK_RUN_TESTS(volk_gnsssdr_64f_accumulator_64f, 3, 0, 20462, 1); VOLK_RUN_TESTS(volk_gnsssdr_32fc_convert_16ic, 3, 0, 20462, 1); VOLK_RUN_TESTS(volk_gnsssdr_32fc_s32f_convert_8ic, 3, 0, 20462, 1); VOLK_RUN_TESTS(volk_gnsssdr_32fc_convert_8ic, 3, 0, 20462, 1); -VOLK_RUN_TESTS(volk_gnsssdr_16i_s32f_convert_32f, 3, 0, 20462, 1); VOLK_RUN_TESTS(volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3, 1e-4, 0, 20462, 1); VOLK_RUN_TESTS(volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3, 1e-4, 0, 20462, 1); diff --git a/src/algorithms/tracking/gnuradio_blocks/galileo_volk_e1_dll_pll_veml_tracking_cc.cc 
b/src/algorithms/tracking/gnuradio_blocks/galileo_volk_e1_dll_pll_veml_tracking_cc.cc index 585df0fda..468ec9a60 100644 --- a/src/algorithms/tracking/gnuradio_blocks/galileo_volk_e1_dll_pll_veml_tracking_cc.cc +++ b/src/algorithms/tracking/gnuradio_blocks/galileo_volk_e1_dll_pll_veml_tracking_cc.cc @@ -244,7 +244,6 @@ void galileo_volk_e1_dll_pll_veml_tracking_cc::update_local_code() { double tcode_half_chips; float rem_code_phase_half_chips; - int associated_chip_index; int code_length_half_chips = static_cast(Galileo_E1_B_CODE_LENGTH_CHIPS) * 2; double code_phase_step_chips; double code_phase_step_half_chips;