mirror of
				https://github.com/gnss-sdr/gnss-sdr
				synced 2025-10-31 15:23:04 +00:00 
			
		
		
		
	Added cmake version check. Deleted original volk
Added cmake version checks. Deleted original volk protokernels. Fixes.
This commit is contained in:
		| @@ -205,6 +205,14 @@ set(CMAKE_BUILD_TYPE ${CMAKE_BUILD_TYPE} CACHE STRING "") | |||||||
| # Append -O2 optimization flag for Debug builds | # Append -O2 optimization flag for Debug builds | ||||||
| set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O2") | set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O2") | ||||||
|  |  | ||||||
|  | ################################################################################ | ||||||
|  | # Checkout cmake version | ||||||
|  | ################################################################################ | ||||||
|  | if(CMAKE_VERSION VERSION_LESS 2.8.8) | ||||||
|  |       message(STATUS "Your CMake version is too old and does not support some features required by GNSS-SDR. CMake version must be at least 2.8.8. For more information check https://github.com/joakimkarlsson/bandit/issues/40") | ||||||
|  |       message(FATAL_ERROR "Fatal error: CMake >= 2.8.8 required.") | ||||||
|  | endif(CMAKE_VERSION VERSION_LESS 2.8.8) | ||||||
|  |  | ||||||
| ################################################################################ | ################################################################################ | ||||||
| # Checkout compiler version | # Checkout compiler version | ||||||
| ################################################################################ | ################################################################################ | ||||||
|   | |||||||
| @@ -179,27 +179,18 @@ int main(int argc, char *argv[]) { | |||||||
|      VOLK_PROFILE(volk_gnsssdr_32fc_convert_8ic, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex); |      VOLK_PROFILE(volk_gnsssdr_32fc_convert_8ic, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex); | ||||||
|      VOLK_PROFILE(volk_gnsssdr_32fc_s32f_convert_8ic, 1e-4, 5, 16000, 250, &results, benchmark_mode, kernel_regex); |      VOLK_PROFILE(volk_gnsssdr_32fc_s32f_convert_8ic, 1e-4, 5, 16000, 250, &results, benchmark_mode, kernel_regex); | ||||||
|      |      | ||||||
|      VOLK_PROFILE(volk_gnsssdr_32f_accumulator_s32f, 1e-4, 0, 204602, 10000, &results, benchmark_mode, kernel_regex); |  | ||||||
|      VOLK_PROFILE(volk_gnsssdr_8i_accumulator_s8i, 1e-4, 0, 204602, 10000, &results, benchmark_mode, kernel_regex); |      VOLK_PROFILE(volk_gnsssdr_8i_accumulator_s8i, 1e-4, 0, 204602, 10000, &results, benchmark_mode, kernel_regex); | ||||||
|      VOLK_PROFILE(volk_gnsssdr_32f_index_max_16u, 3, 0, 204602, 5000, &results, benchmark_mode, kernel_regex); |  | ||||||
|      VOLK_PROFILE(volk_gnsssdr_8i_index_max_16u, 3, 0, 204602, 5000, &results, benchmark_mode, kernel_regex); |      VOLK_PROFILE(volk_gnsssdr_8i_index_max_16u, 3, 0, 204602, 5000, &results, benchmark_mode, kernel_regex); | ||||||
|      VOLK_PROFILE(volk_gnsssdr_8i_max_s8i, 3, 0, 204602, 5000, &results, benchmark_mode, kernel_regex); |      VOLK_PROFILE(volk_gnsssdr_8i_max_s8i, 3, 0, 204602, 5000, &results, benchmark_mode, kernel_regex); | ||||||
|      VOLK_PROFILE(volk_gnsssdr_32f_x2_add_32f, 1e-4, 0, 204602, 10000, &results, benchmark_mode, kernel_regex); |  | ||||||
|      VOLK_PROFILE(volk_gnsssdr_8i_x2_add_8i, 1e-4, 0, 204602, 10000, &results, benchmark_mode, kernel_regex); |      VOLK_PROFILE(volk_gnsssdr_8i_x2_add_8i, 1e-4, 0, 204602, 10000, &results, benchmark_mode, kernel_regex); | ||||||
|      VOLK_PROFILE(volk_gnsssdr_32fc_conjugate_32fc, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); |  | ||||||
|      VOLK_PROFILE(volk_gnsssdr_8ic_conjugate_8ic, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); |      VOLK_PROFILE(volk_gnsssdr_8ic_conjugate_8ic, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); | ||||||
|      VOLK_PROFILE(volk_gnsssdr_32fc_magnitude_squared_32f, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); |  | ||||||
|      VOLK_PROFILE(volk_gnsssdr_8ic_magnitude_squared_8i, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); |      VOLK_PROFILE(volk_gnsssdr_8ic_magnitude_squared_8i, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); | ||||||
|      VOLK_PROFILE(volk_gnsssdr_32fc_s32fc_multiply_32fc, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); |  | ||||||
|      VOLK_PROFILE(volk_gnsssdr_8ic_s8ic_multiply_8ic, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); |      VOLK_PROFILE(volk_gnsssdr_8ic_s8ic_multiply_8ic, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); | ||||||
|      VOLK_PROFILE(volk_gnsssdr_32fc_x2_dot_prod_32fc, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); |  | ||||||
|      VOLK_PROFILE(volk_gnsssdr_8ic_x2_dot_prod_8ic, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); |      VOLK_PROFILE(volk_gnsssdr_8ic_x2_dot_prod_8ic, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); | ||||||
|      VOLK_PROFILE(volk_gnsssdr_32fc_x2_multiply_32fc, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); |  | ||||||
|      VOLK_PROFILE(volk_gnsssdr_8ic_x2_multiply_8ic, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); |      VOLK_PROFILE(volk_gnsssdr_8ic_x2_multiply_8ic, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); | ||||||
|      VOLK_PROFILE(volk_gnsssdr_8u_x2_multiply_8u, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); |      VOLK_PROFILE(volk_gnsssdr_8u_x2_multiply_8u, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); | ||||||
|      VOLK_PROFILE(volk_gnsssdr_64f_accumulator_64f, 1e-4, 0, 16000, 1000, &results, benchmark_mode, kernel_regex); |      VOLK_PROFILE(volk_gnsssdr_64f_accumulator_64f, 1e-4, 0, 16000, 1000, &results, benchmark_mode, kernel_regex); | ||||||
|      VOLK_PROFILE(volk_gnsssdr_32f_s32f_convert_16i, 1e-4, 1, 204602, 250, &results, benchmark_mode, kernel_regex); |      VOLK_PROFILE(volk_gnsssdr_32f_s32f_convert_16i, 1e-4, 1, 204602, 250, &results, benchmark_mode, kernel_regex); | ||||||
|      VOLK_PROFILE(volk_gnsssdr_16i_s32f_convert_32f, 1e-4, 1, 204602, 250, &results, benchmark_mode, kernel_regex); |  | ||||||
|      |      | ||||||
|     // Until we can update the config on a kernel by kernel basis |     // Until we can update the config on a kernel by kernel basis | ||||||
|     // do not overwrite volk_config when using a regex. |     // do not overwrite volk_config when using a regex. | ||||||
|   | |||||||
| @@ -1,241 +0,0 @@ | |||||||
| #ifndef INCLUDED_volk_gnsssdr_16i_s32f_convert_32f_u_H |  | ||||||
| #define INCLUDED_volk_gnsssdr_16i_s32f_convert_32f_u_H |  | ||||||
|  |  | ||||||
| #include <inttypes.h> |  | ||||||
| #include <stdio.h> |  | ||||||
|  |  | ||||||
| #ifdef LV_HAVE_SSE4_1 |  | ||||||
| #include <smmintrin.h> |  | ||||||
|  |  | ||||||
|   /*! |  | ||||||
|     \brief Converts the input 16 bit integer data into floating point data, and divides each floating point output data point by the scalar value |  | ||||||
|     \param inputVector The 16 bit input data buffer |  | ||||||
|     \param outputVector The floating point output data buffer |  | ||||||
|     \param scalar The value divided against each point in the output buffer |  | ||||||
|     \param num_points The number of data values to be converted |  | ||||||
|     \note Output buffer does NOT need to be properly aligned |  | ||||||
|   */ |  | ||||||
| static inline void volk_gnsssdr_16i_s32f_convert_32f_u_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ |  | ||||||
|     unsigned int number = 0; |  | ||||||
|     const unsigned int eighthPoints = num_points / 8; |  | ||||||
|  |  | ||||||
|      float* outputVectorPtr = outputVector; |  | ||||||
|     __m128 invScalar = _mm_set_ps1(1.0/scalar); |  | ||||||
|     int16_t* inputPtr = (int16_t*)inputVector; |  | ||||||
|     __m128i inputVal; |  | ||||||
|     __m128i inputVal2; |  | ||||||
|     __m128 ret; |  | ||||||
|  |  | ||||||
|     for(;number < eighthPoints; number++){ |  | ||||||
|  |  | ||||||
|       // Load the 8 values |  | ||||||
|       inputVal = _mm_loadu_si128((__m128i*)inputPtr); |  | ||||||
|  |  | ||||||
|       // Shift the input data to the right by 64 bits ( 8 bytes ) |  | ||||||
|       inputVal2 = _mm_srli_si128(inputVal, 8); |  | ||||||
|  |  | ||||||
|       // Convert the lower 4 values into 32 bit words |  | ||||||
|       inputVal = _mm_cvtepi16_epi32(inputVal); |  | ||||||
|       inputVal2 = _mm_cvtepi16_epi32(inputVal2); |  | ||||||
|  |  | ||||||
|       ret = _mm_cvtepi32_ps(inputVal); |  | ||||||
|       ret = _mm_mul_ps(ret, invScalar); |  | ||||||
|       _mm_storeu_ps(outputVectorPtr, ret); |  | ||||||
|       outputVectorPtr += 4; |  | ||||||
|  |  | ||||||
|       ret = _mm_cvtepi32_ps(inputVal2); |  | ||||||
|       ret = _mm_mul_ps(ret, invScalar); |  | ||||||
|       _mm_storeu_ps(outputVectorPtr, ret); |  | ||||||
|  |  | ||||||
|       outputVectorPtr += 4; |  | ||||||
|  |  | ||||||
|       inputPtr += 8; |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     number = eighthPoints * 8; |  | ||||||
|     for(; number < num_points; number++){ |  | ||||||
|       outputVector[number] =((float)(inputVector[number])) / scalar; |  | ||||||
|     } |  | ||||||
| } |  | ||||||
| #endif /* LV_HAVE_SSE4_1 */ |  | ||||||
|  |  | ||||||
| #ifdef LV_HAVE_SSE |  | ||||||
| #include <xmmintrin.h> |  | ||||||
|  |  | ||||||
|   /*! |  | ||||||
|     \brief Converts the input 16 bit integer data into floating point data, and divides each floating point output data point by the scalar value |  | ||||||
|     \param inputVector The 16 bit input data buffer |  | ||||||
|     \param outputVector The floating point output data buffer |  | ||||||
|     \param scalar The value divided against each point in the output buffer |  | ||||||
|     \param num_points The number of data values to be converted |  | ||||||
|     \note Output buffer does NOT need to be properly aligned |  | ||||||
|   */ |  | ||||||
| static inline void volk_gnsssdr_16i_s32f_convert_32f_u_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ |  | ||||||
|     unsigned int number = 0; |  | ||||||
|     const unsigned int quarterPoints = num_points / 4; |  | ||||||
|  |  | ||||||
|     float* outputVectorPtr = outputVector; |  | ||||||
|     __m128 invScalar = _mm_set_ps1(1.0/scalar); |  | ||||||
|     int16_t* inputPtr = (int16_t*)inputVector; |  | ||||||
|     __m128 ret; |  | ||||||
|  |  | ||||||
|     for(;number < quarterPoints; number++){ |  | ||||||
|       ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0])); |  | ||||||
|  |  | ||||||
|       ret = _mm_mul_ps(ret, invScalar); |  | ||||||
|       _mm_storeu_ps(outputVectorPtr, ret); |  | ||||||
|  |  | ||||||
|       inputPtr += 4; |  | ||||||
|       outputVectorPtr += 4; |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     number = quarterPoints * 4; |  | ||||||
|     for(; number < num_points; number++){ |  | ||||||
|       outputVector[number] = (float)(inputVector[number]) / scalar; |  | ||||||
|     } |  | ||||||
| } |  | ||||||
| #endif /* LV_HAVE_SSE */ |  | ||||||
|  |  | ||||||
| #ifdef LV_HAVE_GENERIC |  | ||||||
|   /*! |  | ||||||
|     \brief Converts the input 16 bit integer data into floating point data, and divides each floating point output data point by the scalar value |  | ||||||
|     \param inputVector The 16 bit input data buffer |  | ||||||
|     \param outputVector The floating point output data buffer |  | ||||||
|     \param scalar The value divided against each point in the output buffer |  | ||||||
|     \param num_points The number of data values to be converted |  | ||||||
|     \note Output buffer does NOT need to be properly aligned |  | ||||||
|   */ |  | ||||||
| static inline void volk_gnsssdr_16i_s32f_convert_32f_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ |  | ||||||
|   float* outputVectorPtr = outputVector; |  | ||||||
|   const int16_t* inputVectorPtr = inputVector; |  | ||||||
|   unsigned int number = 0; |  | ||||||
|  |  | ||||||
|   for(number = 0; number < num_points; number++){ |  | ||||||
|     *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar; |  | ||||||
|   } |  | ||||||
| } |  | ||||||
| #endif /* LV_HAVE_GENERIC */ |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| #endif /* INCLUDED_volk_gnsssdr_16i_s32f_convert_32f_u_H */ |  | ||||||
| #ifndef INCLUDED_volk_gnsssdr_16i_s32f_convert_32f_a_H |  | ||||||
| #define INCLUDED_volk_gnsssdr_16i_s32f_convert_32f_a_H |  | ||||||
|  |  | ||||||
| #include <inttypes.h> |  | ||||||
| #include <stdio.h> |  | ||||||
|  |  | ||||||
| #ifdef LV_HAVE_SSE4_1 |  | ||||||
| #include <smmintrin.h> |  | ||||||
|  |  | ||||||
|   /*! |  | ||||||
|     \brief Converts the input 16 bit integer data into floating point data, and divides each floating point output data point by the scalar value |  | ||||||
|     \param inputVector The 16 bit input data buffer |  | ||||||
|     \param outputVector The floating point output data buffer |  | ||||||
|     \param scalar The value divided against each point in the output buffer |  | ||||||
|     \param num_points The number of data values to be converted |  | ||||||
|   */ |  | ||||||
| static inline void volk_gnsssdr_16i_s32f_convert_32f_a_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ |  | ||||||
|     unsigned int number = 0; |  | ||||||
|     const unsigned int eighthPoints = num_points / 8; |  | ||||||
|  |  | ||||||
|      float* outputVectorPtr = outputVector; |  | ||||||
|     __m128 invScalar = _mm_set_ps1(1.0/scalar); |  | ||||||
|     int16_t* inputPtr = (int16_t*)inputVector; |  | ||||||
|     __m128i inputVal; |  | ||||||
|     __m128i inputVal2; |  | ||||||
|     __m128 ret; |  | ||||||
|  |  | ||||||
|     for(;number < eighthPoints; number++){ |  | ||||||
|  |  | ||||||
|       // Load the 8 values |  | ||||||
|       inputVal = _mm_loadu_si128((__m128i*)inputPtr); |  | ||||||
|  |  | ||||||
|       // Shift the input data to the right by 64 bits ( 8 bytes ) |  | ||||||
|       inputVal2 = _mm_srli_si128(inputVal, 8); |  | ||||||
|  |  | ||||||
|       // Convert the lower 4 values into 32 bit words |  | ||||||
|       inputVal = _mm_cvtepi16_epi32(inputVal); |  | ||||||
|       inputVal2 = _mm_cvtepi16_epi32(inputVal2); |  | ||||||
|  |  | ||||||
|       ret = _mm_cvtepi32_ps(inputVal); |  | ||||||
|       ret = _mm_mul_ps(ret, invScalar); |  | ||||||
|       _mm_storeu_ps(outputVectorPtr, ret); |  | ||||||
|       outputVectorPtr += 4; |  | ||||||
|  |  | ||||||
|       ret = _mm_cvtepi32_ps(inputVal2); |  | ||||||
|       ret = _mm_mul_ps(ret, invScalar); |  | ||||||
|       _mm_storeu_ps(outputVectorPtr, ret); |  | ||||||
|  |  | ||||||
|       outputVectorPtr += 4; |  | ||||||
|  |  | ||||||
|       inputPtr += 8; |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     number = eighthPoints * 8; |  | ||||||
|     for(; number < num_points; number++){ |  | ||||||
|       outputVector[number] =((float)(inputVector[number])) / scalar; |  | ||||||
|     } |  | ||||||
| } |  | ||||||
| #endif /* LV_HAVE_SSE4_1 */ |  | ||||||
|  |  | ||||||
| #ifdef LV_HAVE_SSE |  | ||||||
| #include <xmmintrin.h> |  | ||||||
|  |  | ||||||
|   /*! |  | ||||||
|     \brief Converts the input 16 bit integer data into floating point data, and divides each floating point output data point by the scalar value |  | ||||||
|     \param inputVector The 16 bit input data buffer |  | ||||||
|     \param outputVector The floating point output data buffer |  | ||||||
|     \param scalar The value divided against each point in the output buffer |  | ||||||
|     \param num_points The number of data values to be converted |  | ||||||
|   */ |  | ||||||
| static inline void volk_gnsssdr_16i_s32f_convert_32f_a_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ |  | ||||||
|     unsigned int number = 0; |  | ||||||
|     const unsigned int quarterPoints = num_points / 4; |  | ||||||
|  |  | ||||||
|     float* outputVectorPtr = outputVector; |  | ||||||
|     __m128 invScalar = _mm_set_ps1(1.0/scalar); |  | ||||||
|     int16_t* inputPtr = (int16_t*)inputVector; |  | ||||||
|     __m128 ret; |  | ||||||
|  |  | ||||||
|     for(;number < quarterPoints; number++){ |  | ||||||
|       ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0])); |  | ||||||
|  |  | ||||||
|       ret = _mm_mul_ps(ret, invScalar); |  | ||||||
|       _mm_storeu_ps(outputVectorPtr, ret); |  | ||||||
|  |  | ||||||
|       inputPtr += 4; |  | ||||||
|       outputVectorPtr += 4; |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     number = quarterPoints * 4; |  | ||||||
|     for(; number < num_points; number++){ |  | ||||||
|       outputVector[number] = (float)(inputVector[number]) / scalar; |  | ||||||
|     } |  | ||||||
| } |  | ||||||
| #endif /* LV_HAVE_SSE */ |  | ||||||
|  |  | ||||||
| #ifdef LV_HAVE_GENERIC |  | ||||||
|   /*! |  | ||||||
|     \brief Converts the input 16 bit integer data into floating point data, and divides each floating point output data point by the scalar value |  | ||||||
|     \param inputVector The 16 bit input data buffer |  | ||||||
|     \param outputVector The floating point output data buffer |  | ||||||
|     \param scalar The value divided against each point in the output buffer |  | ||||||
|     \param num_points The number of data values to be converted |  | ||||||
|   */ |  | ||||||
| static inline void volk_gnsssdr_16i_s32f_convert_32f_a_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ |  | ||||||
|   float* outputVectorPtr = outputVector; |  | ||||||
|   const int16_t* inputVectorPtr = inputVector; |  | ||||||
|   unsigned int number = 0; |  | ||||||
|  |  | ||||||
|   for(number = 0; number < num_points; number++){ |  | ||||||
|     *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar; |  | ||||||
|   } |  | ||||||
| } |  | ||||||
| #endif /* LV_HAVE_GENERIC */ |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| #endif /* INCLUDED_volk_gnsssdr_16i_s32f_convert_32f_a_H */ |  | ||||||
| @@ -1,68 +0,0 @@ | |||||||
| #ifndef INCLUDED_volk_gnsssdr_32f_accumulator_s32f_a_H |  | ||||||
| #define INCLUDED_volk_gnsssdr_32f_accumulator_s32f_a_H |  | ||||||
|  |  | ||||||
| #include <volk_gnsssdr/volk_gnsssdr_common.h> |  | ||||||
| #include <inttypes.h> |  | ||||||
| #include <stdio.h> |  | ||||||
|  |  | ||||||
| #ifdef LV_HAVE_SSE |  | ||||||
| #include <xmmintrin.h> |  | ||||||
| /*! |  | ||||||
|   \brief Accumulates the values in the input buffer |  | ||||||
|   \param result The accumulated result |  | ||||||
|   \param inputBuffer The buffer of data to be accumulated |  | ||||||
|   \param num_points The number of values in inputBuffer to be accumulated |  | ||||||
| */ |  | ||||||
| static inline void volk_gnsssdr_32f_accumulator_s32f_a_sse(float* result, const float* inputBuffer, unsigned int num_points){ |  | ||||||
|   float returnValue = 0; |  | ||||||
|   unsigned int number = 0; |  | ||||||
|   const unsigned int quarterPoints = num_points / 4; |  | ||||||
|  |  | ||||||
|   const float* aPtr = inputBuffer; |  | ||||||
|   __VOLK_ATTR_ALIGNED(16) float tempBuffer[4]; |  | ||||||
|  |  | ||||||
|   __m128 accumulator = _mm_setzero_ps(); |  | ||||||
|   __m128 aVal = _mm_setzero_ps(); |  | ||||||
|  |  | ||||||
|   for(;number < quarterPoints; number++){ |  | ||||||
|     aVal = _mm_load_ps(aPtr); |  | ||||||
|     accumulator = _mm_add_ps(accumulator, aVal); |  | ||||||
|     aPtr += 4; |  | ||||||
|   } |  | ||||||
|   _mm_store_ps(tempBuffer,accumulator); // Store the results back into the C container |  | ||||||
|   returnValue = tempBuffer[0]; |  | ||||||
|   returnValue += tempBuffer[1]; |  | ||||||
|   returnValue += tempBuffer[2]; |  | ||||||
|   returnValue += tempBuffer[3]; |  | ||||||
|  |  | ||||||
|   number = quarterPoints * 4; |  | ||||||
|   for(;number < num_points; number++){ |  | ||||||
|     returnValue += (*aPtr++); |  | ||||||
|   } |  | ||||||
|   *result = returnValue; |  | ||||||
| } |  | ||||||
| #endif /* LV_HAVE_SSE */ |  | ||||||
|  |  | ||||||
| #ifdef LV_HAVE_GENERIC |  | ||||||
| /*! |  | ||||||
|   \brief Accumulates the values in the input buffer |  | ||||||
|   \param result The accumulated result |  | ||||||
|   \param inputBuffer The buffer of data to be accumulated |  | ||||||
|   \param num_points The number of values in inputBuffer to be accumulated |  | ||||||
| */ |  | ||||||
| static inline void volk_gnsssdr_32f_accumulator_s32f_generic(float* result, const float* inputBuffer, unsigned int num_points){ |  | ||||||
|   const float* aPtr = inputBuffer; |  | ||||||
|   unsigned int number = 0; |  | ||||||
|   float returnValue = 0; |  | ||||||
|  |  | ||||||
|   for(;number < num_points; number++){ |  | ||||||
|     returnValue += (*aPtr++); |  | ||||||
|   } |  | ||||||
|   *result = returnValue; |  | ||||||
| } |  | ||||||
| #endif /* LV_HAVE_GENERIC */ |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| #endif /* INCLUDED_volk_gnsssdr_32f_accumulator_s32f_a_H */ |  | ||||||
| @@ -1,149 +0,0 @@ | |||||||
| #ifndef INCLUDED_volk_gnsssdr_32f_index_max_16u_a_H |  | ||||||
| #define INCLUDED_volk_gnsssdr_32f_index_max_16u_a_H |  | ||||||
|  |  | ||||||
| #include <volk_gnsssdr/volk_gnsssdr_common.h> |  | ||||||
| #include <volk_gnsssdr/volk_gnsssdr_common.h> |  | ||||||
| #include <inttypes.h> |  | ||||||
| #include <stdio.h> |  | ||||||
|  |  | ||||||
| #ifdef LV_HAVE_SSE4_1 |  | ||||||
| #include <smmintrin.h> |  | ||||||
|  |  | ||||||
| static inline void volk_gnsssdr_32f_index_max_16u_a_sse4_1(unsigned int* target, const float* src0, unsigned int num_points) { |  | ||||||
|   if(num_points > 0){ |  | ||||||
|     unsigned int number = 0; |  | ||||||
|     const unsigned int quarterPoints = num_points / 4; |  | ||||||
|  |  | ||||||
|     float* inputPtr = (float*)src0; |  | ||||||
|  |  | ||||||
|     __m128 indexIncrementValues = _mm_set1_ps(4); |  | ||||||
|     __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4); |  | ||||||
|  |  | ||||||
|     float max = src0[0]; |  | ||||||
|     float index = 0; |  | ||||||
|     __m128 maxValues = _mm_set1_ps(max); |  | ||||||
|     __m128 maxValuesIndex = _mm_setzero_ps(); |  | ||||||
|     __m128 compareResults; |  | ||||||
|     __m128 currentValues; |  | ||||||
|  |  | ||||||
|     __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; |  | ||||||
|     __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; |  | ||||||
|  |  | ||||||
|     for(;number < quarterPoints; number++){ |  | ||||||
|  |  | ||||||
|       currentValues  = _mm_load_ps(inputPtr); inputPtr += 4; |  | ||||||
|       currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); |  | ||||||
|  |  | ||||||
|       compareResults = _mm_cmpgt_ps(maxValues, currentValues); |  | ||||||
|  |  | ||||||
|       maxValuesIndex = _mm_blendv_ps(currentIndexes, maxValuesIndex, compareResults); |  | ||||||
|       maxValues      = _mm_blendv_ps(currentValues, maxValues, compareResults); |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     // Find the largest of the 4 per-lane maximum values (and its index) |  | ||||||
|     _mm_store_ps(maxValuesBuffer, maxValues); |  | ||||||
|     _mm_store_ps(maxIndexesBuffer, maxValuesIndex); |  | ||||||
|  |  | ||||||
|     for(number = 0; number < 4; number++){ |  | ||||||
|       if(maxValuesBuffer[number] > max){ |  | ||||||
| 	index = maxIndexesBuffer[number]; |  | ||||||
| 	max = maxValuesBuffer[number]; |  | ||||||
|       } |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     number = quarterPoints * 4; |  | ||||||
|     for(;number < num_points; number++){ |  | ||||||
|       if(src0[number] > max){ |  | ||||||
| 	index = number; |  | ||||||
| 	max = src0[number]; |  | ||||||
|       } |  | ||||||
|     } |  | ||||||
|     target[0] = (unsigned int)index; |  | ||||||
|   } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| #endif /*LV_HAVE_SSE4_1*/ |  | ||||||
|  |  | ||||||
| #ifdef LV_HAVE_SSE |  | ||||||
| #include<xmmintrin.h> |  | ||||||
|  |  | ||||||
| static inline void volk_gnsssdr_32f_index_max_16u_a_sse(unsigned int* target, const float* src0, unsigned int num_points) { |  | ||||||
|   if(num_points > 0){ |  | ||||||
|     unsigned int number = 0; |  | ||||||
|     const unsigned int quarterPoints = num_points / 4; |  | ||||||
|  |  | ||||||
|     float* inputPtr = (float*)src0; |  | ||||||
|  |  | ||||||
|     __m128 indexIncrementValues = _mm_set1_ps(4); |  | ||||||
|     __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4); |  | ||||||
|  |  | ||||||
|     float max = src0[0]; |  | ||||||
|     float index = 0; |  | ||||||
|     __m128 maxValues = _mm_set1_ps(max); |  | ||||||
|     __m128 maxValuesIndex = _mm_setzero_ps(); |  | ||||||
|     __m128 compareResults; |  | ||||||
|     __m128 currentValues; |  | ||||||
|  |  | ||||||
|     __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; |  | ||||||
|     __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; |  | ||||||
|  |  | ||||||
|     for(;number < quarterPoints; number++){ |  | ||||||
|  |  | ||||||
|       currentValues  = _mm_load_ps(inputPtr); inputPtr += 4; |  | ||||||
|       currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); |  | ||||||
|  |  | ||||||
|       compareResults = _mm_cmpgt_ps(maxValues, currentValues); |  | ||||||
|  |  | ||||||
|       maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, maxValuesIndex) , _mm_andnot_ps(compareResults, currentIndexes)); |  | ||||||
|  |  | ||||||
|       maxValues      = _mm_or_ps(_mm_and_ps(compareResults, maxValues) , _mm_andnot_ps(compareResults, currentValues)); |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     // Find the largest of the 4 per-lane maximum values (and its index) |  | ||||||
|     _mm_store_ps(maxValuesBuffer, maxValues); |  | ||||||
|     _mm_store_ps(maxIndexesBuffer, maxValuesIndex); |  | ||||||
|  |  | ||||||
|     for(number = 0; number < 4; number++){ |  | ||||||
|       if(maxValuesBuffer[number] > max){ |  | ||||||
| 	index = maxIndexesBuffer[number]; |  | ||||||
| 	max = maxValuesBuffer[number]; |  | ||||||
|       } |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     number = quarterPoints * 4; |  | ||||||
|     for(;number < num_points; number++){ |  | ||||||
|       if(src0[number] > max){ |  | ||||||
| 	index = number; |  | ||||||
| 	max = src0[number]; |  | ||||||
|       } |  | ||||||
|     } |  | ||||||
|     target[0] = (unsigned int)index; |  | ||||||
|   } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| #endif /*LV_HAVE_SSE*/ |  | ||||||
|  |  | ||||||
| #ifdef LV_HAVE_GENERIC |  | ||||||
| static inline void volk_gnsssdr_32f_index_max_16u_generic(unsigned int* target, const float* src0, unsigned int num_points) { |  | ||||||
|   if(num_points > 0){ |  | ||||||
|     float max = src0[0]; |  | ||||||
|     unsigned int index = 0; |  | ||||||
|  |  | ||||||
|     unsigned int i = 1; |  | ||||||
|  |  | ||||||
|     for(; i < num_points; ++i) { |  | ||||||
|  |  | ||||||
|       if(src0[i] > max){ |  | ||||||
| 	index = i; |  | ||||||
| 	max = src0[i]; |  | ||||||
|       } |  | ||||||
|  |  | ||||||
|     } |  | ||||||
|     target[0] = index; |  | ||||||
|   } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| #endif /*LV_HAVE_GENERIC*/ |  | ||||||
|  |  | ||||||
|  |  | ||||||
| #endif /*INCLUDED_volk_gnsssdr_32f_index_max_16u_a_H*/ |  | ||||||
| @@ -1,302 +0,0 @@ | |||||||
| #ifndef INCLUDED_volk_gnsssdr_32f_s32f_convert_16i_u_H |  | ||||||
| #define INCLUDED_volk_gnsssdr_32f_s32f_convert_16i_u_H |  | ||||||
|  |  | ||||||
| #include <inttypes.h> |  | ||||||
| #include <stdio.h> |  | ||||||
| #include <math.h> |  | ||||||
|  |  | ||||||
| #ifdef LV_HAVE_SSE2 |  | ||||||
| #include <emmintrin.h> |  | ||||||
|   /*! |  | ||||||
|     \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value |  | ||||||
|     \param inputVector The floating point input data buffer |  | ||||||
|     \param outputVector The 16 bit output data buffer |  | ||||||
|     \param scalar The value multiplied against each point in the input buffer |  | ||||||
|     \param num_points The number of data values to be converted |  | ||||||
|     \note Input buffer does NOT need to be properly aligned |  | ||||||
|   */ |  | ||||||
| static inline void volk_gnsssdr_32f_s32f_convert_16i_u_sse2(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ |  | ||||||
|   unsigned int number = 0; |  | ||||||
|  |  | ||||||
|   const unsigned int eighthPoints = num_points / 8; |  | ||||||
|  |  | ||||||
|   const float* inputVectorPtr = (const float*)inputVector; |  | ||||||
|   int16_t* outputVectorPtr = outputVector; |  | ||||||
|  |  | ||||||
|   float min_val = -32768; |  | ||||||
|   float max_val = 32767; |  | ||||||
|   float r; |  | ||||||
|  |  | ||||||
|   __m128 vScalar = _mm_set_ps1(scalar); |  | ||||||
|   __m128 inputVal1, inputVal2; |  | ||||||
|   __m128i intInputVal1, intInputVal2; |  | ||||||
|   __m128 ret1, ret2; |  | ||||||
|   __m128 vmin_val = _mm_set_ps1(min_val); |  | ||||||
|   __m128 vmax_val = _mm_set_ps1(max_val); |  | ||||||
|  |  | ||||||
|   for(;number < eighthPoints; number++){ |  | ||||||
|     inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; |  | ||||||
|     inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; |  | ||||||
|  |  | ||||||
|     // Scale and clip |  | ||||||
|     ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); |  | ||||||
|     ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val); |  | ||||||
|  |  | ||||||
|     intInputVal1 = _mm_cvtps_epi32(ret1); |  | ||||||
|     intInputVal2 = _mm_cvtps_epi32(ret2); |  | ||||||
|  |  | ||||||
|     intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); |  | ||||||
|  |  | ||||||
|     _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); |  | ||||||
|     outputVectorPtr += 8; |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   number = eighthPoints * 8; |  | ||||||
|   for(; number < num_points; number++){ |  | ||||||
|     r = inputVector[number] * scalar; |  | ||||||
|     if(r > max_val) |  | ||||||
|       r = max_val; |  | ||||||
|     else if(r < min_val) |  | ||||||
|       r = min_val; |  | ||||||
|     outputVector[number] = (int16_t)rintf(r); |  | ||||||
|   } |  | ||||||
| } |  | ||||||
| #endif /* LV_HAVE_SSE2 */ |  | ||||||
|  |  | ||||||
| #ifdef LV_HAVE_SSE |  | ||||||
| #include <xmmintrin.h> |  | ||||||
|   /*! |  | ||||||
|     \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value |  | ||||||
|     \param inputVector The floating point input data buffer |  | ||||||
|     \param outputVector The 16 bit output data buffer |  | ||||||
|     \param scalar The value multiplied against each point in the input buffer |  | ||||||
|     \param num_points The number of data values to be converted |  | ||||||
|     \note Input buffer does NOT need to be properly aligned |  | ||||||
|   */ |  | ||||||
| static inline void volk_gnsssdr_32f_s32f_convert_16i_u_sse(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ |  | ||||||
|   unsigned int number = 0; |  | ||||||
|  |  | ||||||
|   const unsigned int quarterPoints = num_points / 4; |  | ||||||
|  |  | ||||||
|   const float* inputVectorPtr = (const float*)inputVector; |  | ||||||
|   int16_t* outputVectorPtr = outputVector; |  | ||||||
|  |  | ||||||
|   float min_val = -32768; |  | ||||||
|   float max_val = 32767; |  | ||||||
|   float r; |  | ||||||
|  |  | ||||||
|   __m128 vScalar = _mm_set_ps1(scalar); |  | ||||||
|   __m128 ret; |  | ||||||
|   __m128 vmin_val = _mm_set_ps1(min_val); |  | ||||||
|   __m128 vmax_val = _mm_set_ps1(max_val); |  | ||||||
|  |  | ||||||
|   __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; |  | ||||||
|  |  | ||||||
|   for(;number < quarterPoints; number++){ |  | ||||||
|     ret = _mm_loadu_ps(inputVectorPtr); |  | ||||||
|     inputVectorPtr += 4; |  | ||||||
|  |  | ||||||
|     // Scale and clip |  | ||||||
|     ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); |  | ||||||
|  |  | ||||||
|     _mm_store_ps(outputFloatBuffer, ret); |  | ||||||
|     *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]); |  | ||||||
|     *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]); |  | ||||||
|     *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]); |  | ||||||
|     *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]); |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   number = quarterPoints * 4; |  | ||||||
|   for(; number < num_points; number++){ |  | ||||||
|     r = inputVector[number] * scalar; |  | ||||||
|     if(r > max_val) |  | ||||||
|       r = max_val; |  | ||||||
|     else if(r < min_val) |  | ||||||
|       r = min_val; |  | ||||||
|     outputVector[number] = (int16_t)rintf(r); |  | ||||||
|   } |  | ||||||
| } |  | ||||||
| #endif /* LV_HAVE_SSE */ |  | ||||||
|  |  | ||||||
| #ifdef LV_HAVE_GENERIC |  | ||||||
|   /*! |  | ||||||
|     \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value |  | ||||||
|     \param inputVector The floating point input data buffer |  | ||||||
|     \param outputVector The 16 bit output data buffer |  | ||||||
|     \param scalar The value multiplied against each point in the input buffer |  | ||||||
|     \param num_points The number of data values to be converted |  | ||||||
|     \note Input buffer does NOT need to be properly aligned |  | ||||||
|   */ |  | ||||||
| static inline void volk_gnsssdr_32f_s32f_convert_16i_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ |  | ||||||
|   int16_t* outputVectorPtr = outputVector; |  | ||||||
|   const float* inputVectorPtr = inputVector; |  | ||||||
|   unsigned int number = 0; |  | ||||||
|   float min_val = -32768; |  | ||||||
|   float max_val = 32767; |  | ||||||
|   float r; |  | ||||||
|  |  | ||||||
|   for(number = 0; number < num_points; number++){ |  | ||||||
|     r = *inputVectorPtr++  * scalar; |  | ||||||
|     if(r > max_val) |  | ||||||
|       r = max_val; |  | ||||||
|     else if(r < min_val) |  | ||||||
|       r = min_val; |  | ||||||
|     *outputVectorPtr++ = (int16_t)rintf(r); |  | ||||||
|   } |  | ||||||
| } |  | ||||||
| #endif /* LV_HAVE_GENERIC */ |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| #endif /* INCLUDED_volk_gnsssdr_32f_s32f_convert_16i_u_H */ |  | ||||||
| #ifndef INCLUDED_volk_gnsssdr_32f_s32f_convert_16i_a_H |  | ||||||
| #define INCLUDED_volk_gnsssdr_32f_s32f_convert_16i_a_H |  | ||||||
|  |  | ||||||
| #include <volk/volk_common.h> |  | ||||||
| #include <inttypes.h> |  | ||||||
| #include <stdio.h> |  | ||||||
| #include <math.h> |  | ||||||
|  |  | ||||||
| #ifdef LV_HAVE_SSE2 |  | ||||||
| #include <emmintrin.h> |  | ||||||
|   /*! |  | ||||||
|     \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value |  | ||||||
|     \param inputVector The floating point input data buffer |  | ||||||
|     \param outputVector The 16 bit output data buffer |  | ||||||
|     \param scalar The value multiplied against each point in the input buffer |  | ||||||
|     \param num_points The number of data values to be converted |  | ||||||
|   */ |  | ||||||
| static inline void volk_gnsssdr_32f_s32f_convert_16i_a_sse2(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ |  | ||||||
|   unsigned int number = 0; |  | ||||||
|  |  | ||||||
|   const unsigned int eighthPoints = num_points / 8; |  | ||||||
|  |  | ||||||
|   const float* inputVectorPtr = (const float*)inputVector; |  | ||||||
|   int16_t* outputVectorPtr = outputVector; |  | ||||||
|  |  | ||||||
|   float min_val = -32768; |  | ||||||
|   float max_val = 32767; |  | ||||||
|   float r; |  | ||||||
|  |  | ||||||
|   __m128 vScalar = _mm_set_ps1(scalar); |  | ||||||
|   __m128 inputVal1, inputVal2; |  | ||||||
|   __m128i intInputVal1, intInputVal2; |  | ||||||
|   __m128 ret1, ret2; |  | ||||||
|   __m128 vmin_val = _mm_set_ps1(min_val); |  | ||||||
|   __m128 vmax_val = _mm_set_ps1(max_val); |  | ||||||
|  |  | ||||||
|   for(;number < eighthPoints; number++){ |  | ||||||
|     inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; |  | ||||||
|     inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; |  | ||||||
|  |  | ||||||
|     // Scale and clip |  | ||||||
|     ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); |  | ||||||
|     ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val); |  | ||||||
|  |  | ||||||
|     intInputVal1 = _mm_cvtps_epi32(ret1); |  | ||||||
|     intInputVal2 = _mm_cvtps_epi32(ret2); |  | ||||||
|  |  | ||||||
|     intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); |  | ||||||
|  |  | ||||||
|     _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); |  | ||||||
|     outputVectorPtr += 8; |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   number = eighthPoints * 8; |  | ||||||
|   for(; number < num_points; number++){ |  | ||||||
|     r = inputVector[number] * scalar; |  | ||||||
|     if(r > max_val) |  | ||||||
|       r = max_val; |  | ||||||
|     else if(r < min_val) |  | ||||||
|       r = min_val; |  | ||||||
|     outputVector[number] = (int16_t)rintf(r); |  | ||||||
|   } |  | ||||||
| } |  | ||||||
| #endif /* LV_HAVE_SSE2 */ |  | ||||||
|  |  | ||||||
| #ifdef LV_HAVE_SSE |  | ||||||
| #include <xmmintrin.h> |  | ||||||
|   /*! |  | ||||||
|     \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value |  | ||||||
|     \param inputVector The floating point input data buffer |  | ||||||
|     \param outputVector The 16 bit output data buffer |  | ||||||
|     \param scalar The value multiplied against each point in the input buffer |  | ||||||
|     \param num_points The number of data values to be converted |  | ||||||
|   */ |  | ||||||
| static inline void volk_gnsssdr_32f_s32f_convert_16i_a_sse(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ |  | ||||||
|   unsigned int number = 0; |  | ||||||
|  |  | ||||||
|   const unsigned int quarterPoints = num_points / 4; |  | ||||||
|  |  | ||||||
|   const float* inputVectorPtr = (const float*)inputVector; |  | ||||||
|   int16_t* outputVectorPtr = outputVector; |  | ||||||
|  |  | ||||||
|   float min_val = -32768; |  | ||||||
|   float max_val = 32767; |  | ||||||
|   float r; |  | ||||||
|  |  | ||||||
|   __m128 vScalar = _mm_set_ps1(scalar); |  | ||||||
|   __m128 ret; |  | ||||||
|   __m128 vmin_val = _mm_set_ps1(min_val); |  | ||||||
|   __m128 vmax_val = _mm_set_ps1(max_val); |  | ||||||
|  |  | ||||||
|   __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; |  | ||||||
|  |  | ||||||
|   for(;number < quarterPoints; number++){ |  | ||||||
|     ret = _mm_load_ps(inputVectorPtr); |  | ||||||
|     inputVectorPtr += 4; |  | ||||||
|  |  | ||||||
|     // Scale and clip |  | ||||||
|     ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); |  | ||||||
|  |  | ||||||
|     _mm_store_ps(outputFloatBuffer, ret); |  | ||||||
|     *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]); |  | ||||||
|     *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]); |  | ||||||
|     *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]); |  | ||||||
|     *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]); |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   number = quarterPoints * 4; |  | ||||||
|   for(; number < num_points; number++){ |  | ||||||
|     r = inputVector[number] * scalar; |  | ||||||
|     if(r > max_val) |  | ||||||
|       r = max_val; |  | ||||||
|     else if(r < min_val) |  | ||||||
|       r = min_val; |  | ||||||
|     outputVector[number] = (int16_t)rintf(r); |  | ||||||
|   } |  | ||||||
| } |  | ||||||
| #endif /* LV_HAVE_SSE */ |  | ||||||
|  |  | ||||||
| #ifdef LV_HAVE_GENERIC |  | ||||||
|   /*! |  | ||||||
|     \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value |  | ||||||
|     \param inputVector The floating point input data buffer |  | ||||||
|     \param outputVector The 16 bit output data buffer |  | ||||||
|     \param scalar The value multiplied against each point in the input buffer |  | ||||||
|     \param num_points The number of data values to be converted |  | ||||||
|   */ |  | ||||||
| static inline void volk_gnsssdr_32f_s32f_convert_16i_a_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ |  | ||||||
|   int16_t* outputVectorPtr = outputVector; |  | ||||||
|   const float* inputVectorPtr = inputVector; |  | ||||||
|   unsigned int number = 0; |  | ||||||
|   float min_val = -32768; |  | ||||||
|   float max_val = 32767; |  | ||||||
|   float r; |  | ||||||
|  |  | ||||||
|   for(number = 0; number < num_points; number++){ |  | ||||||
|     r  = *inputVectorPtr++ * scalar; |  | ||||||
|     if(r < min_val) |  | ||||||
|       r = min_val; |  | ||||||
|     else if(r > max_val) |  | ||||||
|       r = max_val; |  | ||||||
|     *outputVectorPtr++ = (int16_t)rintf(r); |  | ||||||
|   } |  | ||||||
| } |  | ||||||
| #endif /* LV_HAVE_GENERIC */ |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| #endif /* INCLUDED_volk_gnsssdr_32f_s32f_convert_16i_a_H */ |  | ||||||
| @@ -1,147 +0,0 @@ | |||||||
| #ifndef INCLUDED_volk_gnsssdr_32f_x2_add_32f_u_H |  | ||||||
| #define INCLUDED_volk_gnsssdr_32f_x2_add_32f_u_H |  | ||||||
|  |  | ||||||
| #include <inttypes.h> |  | ||||||
| #include <stdio.h> |  | ||||||
|  |  | ||||||
| #ifdef LV_HAVE_SSE |  | ||||||
| #include <xmmintrin.h> |  | ||||||
/*!
  \brief Computes the element-wise sum of two float vectors (SSE, unaligned loads and stores)
  \param cVector Destination vector receiving aVector[i] + bVector[i]
  \param aVector First addend vector
  \param bVector Second addend vector
  \param num_points The number of values in aVector and bVector to be added together and stored into cVector
*/
static inline void volk_gnsssdr_32f_x2_add_32f_u_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
    // Largest multiple of four that fits in num_points; handled by the SIMD loop
    const unsigned int simdPoints = (num_points / 4) * 4;
    unsigned int idx;

    for(idx = 0; idx < simdPoints; idx += 4){
      const __m128 lhs = _mm_loadu_ps(aVector + idx);
      const __m128 rhs = _mm_loadu_ps(bVector + idx);

      // Four sums at once, written straight to the destination
      _mm_storeu_ps(cVector + idx, _mm_add_ps(lhs, rhs));
    }

    // Leftover num_points % 4 elements, one at a time
    for(; idx < num_points; idx++){
      cVector[idx] = aVector[idx] + bVector[idx];
    }
}
| #endif /* LV_HAVE_SSE */ |  | ||||||
|  |  | ||||||
| #ifdef LV_HAVE_GENERIC |  | ||||||
/*!
  \brief Computes the element-wise sum of two float vectors (portable C)
  \param cVector Destination vector receiving aVector[i] + bVector[i]
  \param aVector First addend vector
  \param bVector Second addend vector
  \param num_points The number of values in aVector and bVector to be added together and stored into cVector
*/
static inline void volk_gnsssdr_32f_x2_add_32f_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
    unsigned int idx;

    for(idx = 0; idx < num_points; idx++){
      cVector[idx] = aVector[idx] + bVector[idx];
    }
}
| #endif /* LV_HAVE_GENERIC */ |  | ||||||
|  |  | ||||||
| #endif /* INCLUDED_volk_gnsssdr_32f_x2_add_32f_u_H */ |  | ||||||
| #ifndef INCLUDED_volk_gnsssdr_32f_x2_add_32f_a_H |  | ||||||
| #define INCLUDED_volk_gnsssdr_32f_x2_add_32f_a_H |  | ||||||
|  |  | ||||||
| #include <inttypes.h> |  | ||||||
| #include <stdio.h> |  | ||||||
|  |  | ||||||
| #ifdef LV_HAVE_SSE |  | ||||||
| #include <xmmintrin.h> |  | ||||||
/*!
  \brief Computes the element-wise sum of two float vectors (SSE, aligned loads and stores)
  \param cVector Destination vector receiving aVector[i] + bVector[i]
  \param aVector First addend vector
  \param bVector Second addend vector
  \param num_points The number of values in aVector and bVector to be added together and stored into cVector
  \note All three buffers are accessed with _mm_load_ps / _mm_store_ps and therefore must be 16-byte aligned
*/
static inline void volk_gnsssdr_32f_x2_add_32f_a_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
    // Largest multiple of four that fits in num_points; handled by the SIMD loop
    const unsigned int simdPoints = (num_points / 4) * 4;
    unsigned int idx;

    for(idx = 0; idx < simdPoints; idx += 4){
      const __m128 lhs = _mm_load_ps(aVector + idx);
      const __m128 rhs = _mm_load_ps(bVector + idx);

      // Four sums at once, written straight to the destination
      _mm_store_ps(cVector + idx, _mm_add_ps(lhs, rhs));
    }

    // Leftover num_points % 4 elements, one at a time
    for(; idx < num_points; idx++){
      cVector[idx] = aVector[idx] + bVector[idx];
    }
}
| #endif /* LV_HAVE_SSE */ |  | ||||||
|  |  | ||||||
| #ifdef LV_HAVE_GENERIC |  | ||||||
/*!
  \brief Computes the element-wise sum of two float vectors (portable C)
  \param cVector Destination vector receiving aVector[i] + bVector[i]
  \param aVector First addend vector
  \param bVector Second addend vector
  \param num_points The number of values in aVector and bVector to be added together and stored into cVector
*/
static inline void volk_gnsssdr_32f_x2_add_32f_a_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
    unsigned int idx;

    for(idx = 0; idx < num_points; idx++){
      cVector[idx] = aVector[idx] + bVector[idx];
    }
}
| #endif /* LV_HAVE_GENERIC */ |  | ||||||
|  |  | ||||||
| #ifdef LV_HAVE_ORC |  | ||||||
/* ORC-generated routine that performs the addition; only declared here. */
extern void volk_gnsssdr_32f_x2_add_32f_a_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);

/*!
  \brief Adds the two input vectors and stores the result in the third vector by delegating to the ORC implementation
  \param cVector The vector where the results will be stored
  \param aVector One of the vectors to be added
  \param bVector One of the vectors to be added
  \param num_points The number of values in aVector and bVector to be added together and stored into cVector
  \note Thin wrapper: every argument is forwarded unchanged to volk_gnsssdr_32f_x2_add_32f_a_orc_impl
*/
static inline void volk_gnsssdr_32f_x2_add_32f_u_orc(float* cVector,
                                                     const float* aVector,
                                                     const float* bVector,
                                                     unsigned int num_points)
{
    volk_gnsssdr_32f_x2_add_32f_a_orc_impl(cVector, aVector, bVector, num_points);
}
| #endif /* LV_HAVE_ORC */ |  | ||||||
|  |  | ||||||
|  |  | ||||||
| #endif /* INCLUDED_volk_gnsssdr_32f_x2_add_32f_a_H */ |  | ||||||
| @@ -1,127 +0,0 @@ | |||||||
| #ifndef INCLUDED_volk_gnsssdr_32fc_conjugate_32fc_u_H |  | ||||||
| #define INCLUDED_volk_gnsssdr_32fc_conjugate_32fc_u_H |  | ||||||
|  |  | ||||||
| #include <inttypes.h> |  | ||||||
| #include <stdio.h> |  | ||||||
| #include <volk_gnsssdr/volk_gnsssdr_complex.h> |  | ||||||
| #include <float.h> |  | ||||||
|  |  | ||||||
| #ifdef LV_HAVE_SSE3 |  | ||||||
| #include <pmmintrin.h> |  | ||||||
|   /*! |  | ||||||
|     \brief Takes the conjugate of a complex vector. |  | ||||||
|     \param cVector The vector where the results will be stored |  | ||||||
|     \param aVector Vector to be conjugated |  | ||||||
|     \param num_points The number of complex values in aVector to be conjugated and stored into cVector |  | ||||||
|   */ |  | ||||||
| static inline void volk_gnsssdr_32fc_conjugate_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){ |  | ||||||
|     unsigned int number = 0; |  | ||||||
|     const unsigned int halfPoints = num_points / 2; |  | ||||||
|  |  | ||||||
|     __m128 x; |  | ||||||
|     lv_32fc_t* c = cVector; |  | ||||||
|     const lv_32fc_t* a = aVector; |  | ||||||
|  |  | ||||||
|     __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f); |  | ||||||
|  |  | ||||||
|     for(;number < halfPoints; number++){ |  | ||||||
|  |  | ||||||
|       x = _mm_loadu_ps((float*)a); // Load the complex data as ar,ai,br,bi |  | ||||||
|  |  | ||||||
|       x = _mm_xor_ps(x, conjugator); // conjugate register |  | ||||||
|  |  | ||||||
|       _mm_storeu_ps((float*)c,x); // Store the results back into the C container |  | ||||||
|  |  | ||||||
|       a += 2; |  | ||||||
|       c += 2; |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     if((num_points % 2) != 0) { |  | ||||||
|       *c = lv_conj(*a); |  | ||||||
|     } |  | ||||||
| } |  | ||||||
| #endif /* LV_HAVE_SSE3 */ |  | ||||||
|  |  | ||||||
| #ifdef LV_HAVE_GENERIC |  | ||||||
|   /*! |  | ||||||
|     \brief Takes the conjugate of a complex vector. |  | ||||||
|     \param cVector The vector where the results will be stored |  | ||||||
|     \param aVector Vector to be conjugated |  | ||||||
|     \param num_points The number of complex values in aVector to be conjugated and stored into cVector |  | ||||||
|   */ |  | ||||||
| static inline void volk_gnsssdr_32fc_conjugate_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){ |  | ||||||
|     lv_32fc_t* cPtr = cVector; |  | ||||||
|     const lv_32fc_t* aPtr = aVector; |  | ||||||
|     unsigned int number = 0; |  | ||||||
|  |  | ||||||
|     for(number = 0; number < num_points; number++){ |  | ||||||
|       *cPtr++ = lv_conj(*aPtr++); |  | ||||||
|     } |  | ||||||
| } |  | ||||||
| #endif /* LV_HAVE_GENERIC */ |  | ||||||
|  |  | ||||||
|  |  | ||||||
| #endif /* INCLUDED_volk_gnsssdr_32fc_conjugate_32fc_u_H */ |  | ||||||
| #ifndef INCLUDED_volk_gnsssdr_32fc_conjugate_32fc_a_H |  | ||||||
| #define INCLUDED_volk_gnsssdr_32fc_conjugate_32fc_a_H |  | ||||||
|  |  | ||||||
| #include <inttypes.h> |  | ||||||
| #include <stdio.h> |  | ||||||
| #include <volk_gnsssdr/volk_gnsssdr_complex.h> |  | ||||||
| #include <float.h> |  | ||||||
|  |  | ||||||
| #ifdef LV_HAVE_SSE3 |  | ||||||
| #include <pmmintrin.h> |  | ||||||
|   /*! |  | ||||||
|     \brief Takes the conjugate of a complex vector. |  | ||||||
|     \param cVector The vector where the results will be stored |  | ||||||
|     \param aVector Vector to be conjugated |  | ||||||
|     \param num_points The number of complex values in aVector to be conjugated and stored into cVector |  | ||||||
|   */ |  | ||||||
| static inline void volk_gnsssdr_32fc_conjugate_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){ |  | ||||||
|     unsigned int number = 0; |  | ||||||
|     const unsigned int halfPoints = num_points / 2; |  | ||||||
|  |  | ||||||
|     __m128 x; |  | ||||||
|     lv_32fc_t* c = cVector; |  | ||||||
|     const lv_32fc_t* a = aVector; |  | ||||||
|  |  | ||||||
|     __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f); |  | ||||||
|  |  | ||||||
|     for(;number < halfPoints; number++){ |  | ||||||
|  |  | ||||||
|       x = _mm_load_ps((float*)a); // Load the complex data as ar,ai,br,bi |  | ||||||
|  |  | ||||||
|       x = _mm_xor_ps(x, conjugator); // conjugate register |  | ||||||
|  |  | ||||||
|       _mm_store_ps((float*)c,x); // Store the results back into the C container |  | ||||||
|  |  | ||||||
|       a += 2; |  | ||||||
|       c += 2; |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     if((num_points % 2) != 0) { |  | ||||||
|       *c = lv_conj(*a); |  | ||||||
|     } |  | ||||||
| } |  | ||||||
| #endif /* LV_HAVE_SSE3 */ |  | ||||||
|  |  | ||||||
| #ifdef LV_HAVE_GENERIC |  | ||||||
|   /*! |  | ||||||
|     \brief Takes the conjugate of a complex vector. |  | ||||||
|     \param cVector The vector where the results will be stored |  | ||||||
|     \param aVector Vector to be conjugated |  | ||||||
|     \param num_points The number of complex values in aVector to be conjugated and stored into cVector |  | ||||||
|   */ |  | ||||||
| static inline void volk_gnsssdr_32fc_conjugate_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){ |  | ||||||
|     lv_32fc_t* cPtr = cVector; |  | ||||||
|     const lv_32fc_t* aPtr = aVector; |  | ||||||
|     unsigned int number = 0; |  | ||||||
|  |  | ||||||
|     for(number = 0; number < num_points; number++){ |  | ||||||
|       *cPtr++ = lv_conj(*aPtr++); |  | ||||||
|     } |  | ||||||
| } |  | ||||||
| #endif /* LV_HAVE_GENERIC */ |  | ||||||
|  |  | ||||||
| #endif /* INCLUDED_volk_gnsssdr_32fc_conjugate_32fc_a_H */ |  | ||||||
| @@ -1,228 +0,0 @@ | |||||||
| #ifndef INCLUDED_volk_gnsssdr_32fc_magnitude_squared_32f_u_H |  | ||||||
| #define INCLUDED_volk_gnsssdr_32fc_magnitude_squared_32f_u_H |  | ||||||
|  |  | ||||||
| #include <inttypes.h> |  | ||||||
| #include <stdio.h> |  | ||||||
| #include <math.h> |  | ||||||
|  |  | ||||||
| #ifdef LV_HAVE_SSE3 |  | ||||||
| #include <pmmintrin.h> |  | ||||||
|   /*! |  | ||||||
|     \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector |  | ||||||
|     \param complexVector The vector containing the complex input values |  | ||||||
|     \param magnitudeVector The vector containing the real output values |  | ||||||
|     \param num_points The number of complex values in complexVector to be calculated and stored into cVector |  | ||||||
|   */ |  | ||||||
| static inline void volk_gnsssdr_32fc_magnitude_squared_32f_u_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ |  | ||||||
|     unsigned int number = 0; |  | ||||||
|     const unsigned int quarterPoints = num_points / 4; |  | ||||||
|  |  | ||||||
|     const float* complexVectorPtr = (float*)complexVector; |  | ||||||
|     float* magnitudeVectorPtr = magnitudeVector; |  | ||||||
|  |  | ||||||
|     __m128 cplxValue1, cplxValue2, result; |  | ||||||
|     for(;number < quarterPoints; number++){ |  | ||||||
|       cplxValue1 = _mm_loadu_ps(complexVectorPtr); |  | ||||||
|       complexVectorPtr += 4; |  | ||||||
|  |  | ||||||
|       cplxValue2 = _mm_loadu_ps(complexVectorPtr); |  | ||||||
|       complexVectorPtr += 4; |  | ||||||
|  |  | ||||||
|       cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values |  | ||||||
|       cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values |  | ||||||
|  |  | ||||||
|       result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values |  | ||||||
|  |  | ||||||
|       _mm_storeu_ps(magnitudeVectorPtr, result); |  | ||||||
|       magnitudeVectorPtr += 4; |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     number = quarterPoints * 4; |  | ||||||
|     for(; number < num_points; number++){ |  | ||||||
|       float val1Real = *complexVectorPtr++; |  | ||||||
|       float val1Imag = *complexVectorPtr++; |  | ||||||
|       *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); |  | ||||||
|     } |  | ||||||
| } |  | ||||||
| #endif /* LV_HAVE_SSE3 */ |  | ||||||
|  |  | ||||||
| #ifdef LV_HAVE_SSE |  | ||||||
| #include <xmmintrin.h> |  | ||||||
|   /*! |  | ||||||
|     \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector |  | ||||||
|     \param complexVector The vector containing the complex input values |  | ||||||
|     \param magnitudeVector The vector containing the real output values |  | ||||||
|     \param num_points The number of complex values in complexVector to be calculated and stored into cVector |  | ||||||
|   */ |  | ||||||
| static inline void volk_gnsssdr_32fc_magnitude_squared_32f_u_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ |  | ||||||
|     unsigned int number = 0; |  | ||||||
|     const unsigned int quarterPoints = num_points / 4; |  | ||||||
|  |  | ||||||
|     const float* complexVectorPtr = (float*)complexVector; |  | ||||||
|     float* magnitudeVectorPtr = magnitudeVector; |  | ||||||
|  |  | ||||||
|     __m128 cplxValue1, cplxValue2, iValue, qValue, result; |  | ||||||
|     for(;number < quarterPoints; number++){ |  | ||||||
|       cplxValue1 = _mm_loadu_ps(complexVectorPtr); |  | ||||||
|       complexVectorPtr += 4; |  | ||||||
|  |  | ||||||
|       cplxValue2 = _mm_loadu_ps(complexVectorPtr); |  | ||||||
|       complexVectorPtr += 4; |  | ||||||
|  |  | ||||||
|       // Arrange in i1i2i3i4 format |  | ||||||
|       iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); |  | ||||||
|       // Arrange in q1q2q3q4 format |  | ||||||
|       qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); |  | ||||||
|  |  | ||||||
|       iValue = _mm_mul_ps(iValue, iValue); // Square the I values |  | ||||||
|       qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values |  | ||||||
|  |  | ||||||
|       result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values |  | ||||||
|  |  | ||||||
|       _mm_storeu_ps(magnitudeVectorPtr, result); |  | ||||||
|       magnitudeVectorPtr += 4; |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     number = quarterPoints * 4; |  | ||||||
|     for(; number < num_points; number++){ |  | ||||||
|        float val1Real = *complexVectorPtr++; |  | ||||||
|        float val1Imag = *complexVectorPtr++; |  | ||||||
|       *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); |  | ||||||
|     } |  | ||||||
| } |  | ||||||
| #endif /* LV_HAVE_SSE */ |  | ||||||
|  |  | ||||||
| #ifdef LV_HAVE_GENERIC |  | ||||||
|   /*! |  | ||||||
|     \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector |  | ||||||
|     \param complexVector The vector containing the complex input values |  | ||||||
|     \param magnitudeVector The vector containing the real output values |  | ||||||
|     \param num_points The number of complex values in complexVector to be calculated and stored into cVector |  | ||||||
|   */ |  | ||||||
| static inline void volk_gnsssdr_32fc_magnitude_squared_32f_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ |  | ||||||
|   const float* complexVectorPtr = (float*)complexVector; |  | ||||||
|   float* magnitudeVectorPtr = magnitudeVector; |  | ||||||
|   unsigned int number = 0; |  | ||||||
|   for(number = 0; number < num_points; number++){ |  | ||||||
|     const float real = *complexVectorPtr++; |  | ||||||
|     const float imag = *complexVectorPtr++; |  | ||||||
|     *magnitudeVectorPtr++ = (real*real) + (imag*imag); |  | ||||||
|   } |  | ||||||
| } |  | ||||||
| #endif /* LV_HAVE_GENERIC */ |  | ||||||
|  |  | ||||||
| #endif /* INCLUDED_volk_gnsssdr_32fc_magnitude_squared_32f_u_H */ |  | ||||||
| #ifndef INCLUDED_volk_gnsssdr_32fc_magnitude_squared_32f_a_H |  | ||||||
| #define INCLUDED_volk_gnsssdr_32fc_magnitude_squared_32f_a_H |  | ||||||
|  |  | ||||||
| #include <inttypes.h> |  | ||||||
| #include <stdio.h> |  | ||||||
| #include <math.h> |  | ||||||
|  |  | ||||||
| #ifdef LV_HAVE_SSE3 |  | ||||||
| #include <pmmintrin.h> |  | ||||||
|   /*! |  | ||||||
|     \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector |  | ||||||
|     \param complexVector The vector containing the complex input values |  | ||||||
|     \param magnitudeVector The vector containing the real output values |  | ||||||
|     \param num_points The number of complex values in complexVector to be calculated and stored into cVector |  | ||||||
|   */ |  | ||||||
| static inline void volk_gnsssdr_32fc_magnitude_squared_32f_a_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ |  | ||||||
|     unsigned int number = 0; |  | ||||||
|     const unsigned int quarterPoints = num_points / 4; |  | ||||||
|  |  | ||||||
|     const float* complexVectorPtr = (float*)complexVector; |  | ||||||
|     float* magnitudeVectorPtr = magnitudeVector; |  | ||||||
|  |  | ||||||
|     __m128 cplxValue1, cplxValue2, result; |  | ||||||
|     for(;number < quarterPoints; number++){ |  | ||||||
|       cplxValue1 = _mm_load_ps(complexVectorPtr); |  | ||||||
|       complexVectorPtr += 4; |  | ||||||
|  |  | ||||||
|       cplxValue2 = _mm_load_ps(complexVectorPtr); |  | ||||||
|       complexVectorPtr += 4; |  | ||||||
|  |  | ||||||
|       cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values |  | ||||||
|       cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values |  | ||||||
|  |  | ||||||
|       result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values |  | ||||||
|  |  | ||||||
|       _mm_store_ps(magnitudeVectorPtr, result); |  | ||||||
|       magnitudeVectorPtr += 4; |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     number = quarterPoints * 4; |  | ||||||
|     for(; number < num_points; number++){ |  | ||||||
|       float val1Real = *complexVectorPtr++; |  | ||||||
|       float val1Imag = *complexVectorPtr++; |  | ||||||
|       *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); |  | ||||||
|     } |  | ||||||
| } |  | ||||||
| #endif /* LV_HAVE_SSE3 */ |  | ||||||
|  |  | ||||||
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>
  /*!
    \brief Computes |z|^2 for every element of complexVector and writes the results to magnitudeVector
    \param magnitudeVector The vector receiving the real-valued squared magnitudes
    \param complexVector The vector containing the complex input values
    \param num_points The number of complex values in complexVector to process
  */
static inline void volk_gnsssdr_32fc_magnitude_squared_32f_a_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
    /* Treat the complex input as interleaved floats: re0, im0, re1, im1, ... */
    const float* in = (float*)complexVector;
    float* out = magnitudeVector;

    /* One vector iteration consumes four complex samples and emits four floats. */
    const unsigned int blocks = num_points / 4;
    unsigned int k;

    for(k = 0; k < blocks; k++){
      /* _mm_load_ps / _mm_store_ps are the aligned variants of the load/store intrinsics. */
      const __m128 lo = _mm_load_ps(in);      /* re0 im0 re1 im1 */
      const __m128 hi = _mm_load_ps(in + 4);  /* re2 im2 re3 im3 */
      __m128 re, im;
      in += 8;

      /* De-interleave: even lanes are the real parts, odd lanes the imaginary parts. */
      re = _mm_shuffle_ps(lo, hi, _MM_SHUFFLE(2,0,2,0));
      im = _mm_shuffle_ps(lo, hi, _MM_SHUFFLE(3,1,3,1));

      re = _mm_mul_ps(re, re);
      im = _mm_mul_ps(im, im);

      _mm_store_ps(out, _mm_add_ps(re, im)); /* re^2 + im^2 */
      out += 4;
    }

    /* Scalar tail for the last (num_points % 4) samples. */
    for(k = blocks * 4; k < num_points; k++){
      const float re = in[0];
      const float im = in[1];
      in += 2;
      *out++ = (re * re) + (im * im);
    }
}
#endif /* LV_HAVE_SSE */
|  |  | ||||||
#ifdef LV_HAVE_GENERIC
  /*!
    \brief Computes |z|^2 for every element of complexVector and writes the results to magnitudeVector
    \param magnitudeVector The vector receiving the real-valued squared magnitudes
    \param complexVector The vector containing the complex input values
    \param num_points The number of complex values in complexVector to process
  */
static inline void volk_gnsssdr_32fc_magnitude_squared_32f_a_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
  /* Treat the complex input as interleaved floats: re0, im0, re1, im1, ... */
  const float* in = (float*)complexVector;
  float* out = magnitudeVector;
  unsigned int remaining = num_points;

  while(remaining-- > 0){
    const float re = in[0];
    const float im = in[1];
    in += 2;
    *out++ = (re*re) + (im*im);
  }
}
#endif /* LV_HAVE_GENERIC */
|  |  | ||||||
| #endif /* INCLUDED_volk_gnsssdr_32fc_magnitude_32f_a_H */ |  | ||||||
| @@ -99,7 +99,7 @@ static inline void volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_u_sse4_1(lv_ | |||||||
|     if (num_points%4!=0) |     if (num_points%4!=0) | ||||||
|     { |     { | ||||||
|         __VOLK_ATTR_ALIGNED(16) float tcode_half_chips_stored[4]; |         __VOLK_ATTR_ALIGNED(16) float tcode_half_chips_stored[4]; | ||||||
|         _mm_storeu_si128 ((__m128i*)tcode_half_chips_stored, tcode_half_chips_array); |         _mm_storeu_ps ((float*)tcode_half_chips_stored, tcode_half_chips_array); | ||||||
|          |          | ||||||
|         int associated_chip_index; |         int associated_chip_index; | ||||||
|         float tcode_half_chips = tcode_half_chips_stored[0]; |         float tcode_half_chips = tcode_half_chips_stored[0]; | ||||||
| @@ -215,7 +215,7 @@ static inline void volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_a_sse4_1(lv_ | |||||||
|     if (num_points%4!=0) |     if (num_points%4!=0) | ||||||
|     { |     { | ||||||
|         __VOLK_ATTR_ALIGNED(16) float tcode_half_chips_stored[4]; |         __VOLK_ATTR_ALIGNED(16) float tcode_half_chips_stored[4]; | ||||||
|         _mm_store_si128 ((__m128i*)tcode_half_chips_stored, tcode_half_chips_array); |         _mm_storeu_ps ((float*)tcode_half_chips_stored, tcode_half_chips_array); | ||||||
|          |          | ||||||
|         int associated_chip_index; |         int associated_chip_index; | ||||||
|         float tcode_half_chips = tcode_half_chips_stored[0]; |         float tcode_half_chips = tcode_half_chips_stored[0]; | ||||||
|   | |||||||
| @@ -1,178 +0,0 @@ | |||||||
| #ifndef INCLUDED_volk_gnsssdr_32fc_s32fc_multiply_32fc_u_H |  | ||||||
| #define INCLUDED_volk_gnsssdr_32fc_s32fc_multiply_32fc_u_H |  | ||||||
|  |  | ||||||
| #include <inttypes.h> |  | ||||||
| #include <stdio.h> |  | ||||||
| #include <volk_gnsssdr/volk_gnsssdr_complex.h> |  | ||||||
| #include <float.h> |  | ||||||
|  |  | ||||||
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
/*!
  \brief Multiplies every element of the input vector by a complex scalar and stores the results in cVector
  \param cVector The vector where the results will be stored
  \param aVector The vector to be multiplied
  \param scalar The complex scalar to multiply aVector
  \param num_points The number of complex values in aVector to be multiplied by scalar and stored into cVector
*/
static inline void volk_gnsssdr_32fc_s32fc_multiply_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
    /* Two complex samples fit in one 128-bit register. */
    const unsigned int pairs = num_points / 2;
    unsigned int k;

    /* Broadcast the scalar (sr + j*si): one register of sr, one of si. */
    const __m128 scalar_re = _mm_set_ps1(lv_creal(scalar));
    const __m128 scalar_im = _mm_set_ps1(lv_cimag(scalar));

    for(k = 0; k < pairs; k++){
      const lv_32fc_t* src = aVector + 2 * k;
      lv_32fc_t* dst = cVector + 2 * k;

      /* Unaligned load of two samples: ar, ai, br, bi */
      const __m128 in = _mm_loadu_ps((float*)src);

      /* ar*sr, ai*sr, br*sr, bi*sr */
      const __m128 by_re = _mm_mul_ps(in, scalar_re);

      /* Swap re/im inside each sample: ai, ar, bi, br */
      const __m128 swapped = _mm_shuffle_ps(in, in, 0xB1);

      /* ai*si, ar*si, bi*si, br*si */
      const __m128 by_im = _mm_mul_ps(swapped, scalar_im);

      /* ar*sr-ai*si, ai*sr+ar*si, br*sr-bi*si, bi*sr+br*si */
      _mm_storeu_ps((float*)dst, _mm_addsub_ps(by_re, by_im));
    }

    /* An odd count leaves one trailing sample for plain complex arithmetic. */
    if((num_points % 2) != 0) {
      cVector[2 * pairs] = aVector[2 * pairs] * scalar;
    }
}
#endif /* LV_HAVE_SSE3 */
|  |  | ||||||
#ifdef LV_HAVE_GENERIC
/*!
  \brief Multiplies every element of the input vector by a complex scalar and stores the results in cVector
  \param cVector The vector where the results will be stored
  \param aVector The vector to be multiplied
  \param scalar The complex scalar to multiply aVector
  \param num_points The number of complex values in aVector to be multiplied by scalar and stored into cVector
*/
static inline void volk_gnsssdr_32fc_s32fc_multiply_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
    /* Largest multiple of 8 not exceeding num_points: the part handled by the unrolled loop. */
    const unsigned int unrolled_end = num_points - (num_points % 8);
    unsigned int i = 0;

    /* Manually unrolled by eight. */
    for(; i < unrolled_end; i += 8){
      cVector[i]     = aVector[i]     * scalar;
      cVector[i + 1] = aVector[i + 1] * scalar;
      cVector[i + 2] = aVector[i + 2] * scalar;
      cVector[i + 3] = aVector[i + 3] * scalar;
      cVector[i + 4] = aVector[i + 4] * scalar;
      cVector[i + 5] = aVector[i + 5] * scalar;
      cVector[i + 6] = aVector[i + 6] * scalar;
      cVector[i + 7] = aVector[i + 7] * scalar;
    }

    /* Remaining (num_points % 8) samples. */
    for(; i < num_points; i++){
      cVector[i] = aVector[i] * scalar;
    }
}
#endif /* LV_HAVE_GENERIC */
|  |  | ||||||
|  |  | ||||||
| #endif /* INCLUDED_volk_gnsssdr_32fc_s32fc_multiply_32fc_u_H */ |  | ||||||
| #ifndef INCLUDED_volk_gnsssdr_32fc_s32fc_multiply_32fc_a_H |  | ||||||
| #define INCLUDED_volk_gnsssdr_32fc_s32fc_multiply_32fc_a_H |  | ||||||
|  |  | ||||||
| #include <inttypes.h> |  | ||||||
| #include <stdio.h> |  | ||||||
| #include <volk_gnsssdr/volk_gnsssdr_complex.h> |  | ||||||
| #include <float.h> |  | ||||||
|  |  | ||||||
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
  /*!
    \brief Multiplies every element of the input vector by a complex scalar and stores the results in cVector
    \param cVector The vector where the results will be stored
    \param aVector The vector to be multiplied
    \param scalar The complex scalar to multiply aVector
    \param num_points The number of complex values in aVector to be multiplied by scalar and stored into cVector
  */
static inline void volk_gnsssdr_32fc_s32fc_multiply_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
    /* Two complex samples fit in one 128-bit register. */
    const unsigned int pairs = num_points / 2;
    unsigned int k;

    /* Broadcast the scalar (sr + j*si): one register of sr, one of si. */
    const __m128 scalar_re = _mm_set_ps1(lv_creal(scalar));
    const __m128 scalar_im = _mm_set_ps1(lv_cimag(scalar));

    for(k = 0; k < pairs; k++){
      const lv_32fc_t* src = aVector + 2 * k;
      lv_32fc_t* dst = cVector + 2 * k;

      /* Aligned load of two samples: ar, ai, br, bi */
      const __m128 in = _mm_load_ps((float*)src);

      /* ar*sr, ai*sr, br*sr, bi*sr */
      const __m128 by_re = _mm_mul_ps(in, scalar_re);

      /* Swap re/im inside each sample: ai, ar, bi, br */
      const __m128 swapped = _mm_shuffle_ps(in, in, 0xB1);

      /* ai*si, ar*si, bi*si, br*si */
      const __m128 by_im = _mm_mul_ps(swapped, scalar_im);

      /* ar*sr-ai*si, ai*sr+ar*si, br*sr-bi*si, bi*sr+br*si (aligned store) */
      _mm_store_ps((float*)dst, _mm_addsub_ps(by_re, by_im));
    }

    /* An odd count leaves one trailing sample for plain complex arithmetic. */
    if((num_points % 2) != 0) {
      cVector[2 * pairs] = aVector[2 * pairs] * scalar;
    }
}
#endif /* LV_HAVE_SSE3 */
|  |  | ||||||
|  |  | ||||||
#ifdef LV_HAVE_GENERIC
  /*!
    \brief Multiplies every element of the input vector by a complex scalar and stores the results in cVector
    \param cVector The vector where the results will be stored
    \param aVector The vector to be multiplied
    \param scalar The complex scalar to multiply aVector
    \param num_points The number of complex values in aVector to be multiplied by scalar and stored into cVector
  */
static inline void volk_gnsssdr_32fc_s32fc_multiply_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
    /* Largest multiple of 8 not exceeding num_points: the part handled by the unrolled loop. */
    const unsigned int unrolled_end = num_points - (num_points % 8);
    unsigned int i = 0;

    /* Manually unrolled by eight. */
    for(; i < unrolled_end; i += 8){
      cVector[i]     = aVector[i]     * scalar;
      cVector[i + 1] = aVector[i + 1] * scalar;
      cVector[i + 2] = aVector[i + 2] * scalar;
      cVector[i + 3] = aVector[i + 3] * scalar;
      cVector[i + 4] = aVector[i + 4] * scalar;
      cVector[i + 5] = aVector[i + 5] * scalar;
      cVector[i + 6] = aVector[i + 6] * scalar;
      cVector[i + 7] = aVector[i + 7] * scalar;
    }

    /* Remaining (num_points % 8) samples. */
    for(; i < num_points; i++){
      cVector[i] = aVector[i] * scalar;
    }
}
#endif /* LV_HAVE_GENERIC */
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| #endif /* INCLUDED_volk_gnsssdr_32fc_s32fc_multiply_32fc_a_H */ |  | ||||||
| @@ -1,759 +0,0 @@ | |||||||
| #ifndef INCLUDED_volk_gnsssdr_32fc_x2_dot_prod_32fc_u_H |  | ||||||
| #define INCLUDED_volk_gnsssdr_32fc_x2_dot_prod_32fc_u_H |  | ||||||
|  |  | ||||||
| #include <volk_gnsssdr/volk_gnsssdr_common.h> |  | ||||||
| #include <volk_gnsssdr/volk_gnsssdr_complex.h> |  | ||||||
| #include <stdio.h> |  | ||||||
| #include <string.h> |  | ||||||
|  |  | ||||||
|  |  | ||||||
#ifdef LV_HAVE_GENERIC

/*!
  \brief Computes sum(input[i] * taps[i]) over num_points complex samples (no conjugation)
  \param result Where the single complex sum is written
  \param input First complex vector
  \param taps Second complex vector
  \param num_points Number of complex samples in each vector
*/
static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {

  /* Work on the interleaved float representation: re, im, re, im, ... */
  float * out = (float*) result;
  float * x = (float*) input;
  float * h = (float*) taps;
  const unsigned int pairs = num_points/2;

  /* Two independent accumulators: one for even-indexed samples, one for odd-indexed samples. */
  float acc_even[2] = {0,0};
  float acc_odd[2] = {0,0};
  unsigned int k;

  for(k = 0; k < pairs; ++k, x += 4, h += 4) {
    acc_even[0] += x[0] * h[0] - x[1] * h[1];
    acc_even[1] += x[0] * h[1] + x[1] * h[0];
    acc_odd[0] += x[2] * h[2] - x[3] * h[3];
    acc_odd[1] += x[2] * h[3] + x[3] * h[2];
  }

  out[0] = acc_even[0] + acc_odd[0];
  out[1] = acc_even[1] + acc_odd[1];

  /* An odd sample count leaves one trailing product to add. */
  if(num_points & 1) {
    *result += input[num_points - 1] * taps[num_points - 1];
  }
}

#endif /*LV_HAVE_GENERIC*/
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
#if LV_HAVE_SSE && LV_HAVE_64

/*!
  \brief Computes sum(input[i] * taps[i]) over num_points complex samples using hand-written x86-64 SSE assembly with unaligned loads
  \param result Where the single complex sum is written
  \param input First complex vector
  \param taps Second complex vector
  \param num_points Number of complex samples in each vector

  The assembly processes the samples two at a time (16 bytes); a trailing odd
  sample is added afterwards in C.

  NOTE(review): the assembly loads 16 bytes from input and taps before testing
  the length, and each main-loop iteration pre-loads the following 16 bytes, so
  it can read up to 16 bytes past the data it actually accumulates -- confirm
  that callers provide buffers for which this is safe.
*/
static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_u_sse_64(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {

  /* 64-bit byte count: the assembly reads the full %rcx register, and a 64-bit
     product cannot wrap for large num_points. */
  const unsigned long long num_bytes = (unsigned long long)num_points * 8u;
  unsigned int isodd = num_points & 1;

  asm
    (
     "#  ccomplex_dotprod_generic (float* result, const float *input,\n\t"
     "#                         const float *taps, unsigned num_bytes)\n\t"
     "#    float sum0 = 0;\n\t"
     "#    float sum1 = 0;\n\t"
     "#    float sum2 = 0;\n\t"
     "#    float sum3 = 0;\n\t"
     "#    do {\n\t"
     "#      sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t"
     "#      sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t"
     "#      sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t"
     "#      sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t"
     "#      input += 4;\n\t"
     "#      taps += 4;  \n\t"
     "#    } while (--n_2_ccomplex_blocks != 0);\n\t"
     "#    result[0] = sum0 + sum2;\n\t"
     "#    result[1] = sum1 + sum3;\n\t"
     "# TODO: prefetch and better scheduling\n\t"
     "  xor    %%r9,  %%r9\n\t"
     "  xor    %%r10, %%r10\n\t"
     "  movq   %%rcx, %%rax\n\t"
     "  movq   %%rcx, %%r8\n\t"
     "  movq   %[rsi],  %%r9\n\t"
     "  movq   %[rdx], %%r10\n\t"
     "	xorps	%%xmm6, %%xmm6		# zero accumulators\n\t"
     "	movups	0(%%r9), %%xmm0\n\t"
     "	xorps	%%xmm7, %%xmm7		# zero accumulators\n\t"
     "	movups	0(%%r10), %%xmm2\n\t"
     "	shr	$5, %%rax		# rax = n_2_ccomplex_blocks / 2\n\t"
     "  shr     $4, %%r8\n\t"
     "	jmp	.%=L1_test\n\t"
     "	# 4 taps / loop\n\t"
     "	# something like ?? cycles / loop\n\t"
     ".%=Loop1:	\n\t"
     "# complex prod: C += A * B,  w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
     "#	movups	(%%r9), %%xmmA\n\t"
     "#	movups	(%%r10), %%xmmB\n\t"
     "#	movups	%%xmmA, %%xmmZ\n\t"
     "#	shufps	$0xb1, %%xmmZ, %%xmmZ	# swap internals\n\t"
     "#	mulps	%%xmmB, %%xmmA\n\t"
     "#	mulps	%%xmmZ, %%xmmB\n\t"
     "#	# SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
     "#	xorps	%%xmmPN, %%xmmA\n\t"
     "#	movups	%%xmmA, %%xmmZ\n\t"
     "#	unpcklps %%xmmB, %%xmmA\n\t"
     "#	unpckhps %%xmmB, %%xmmZ\n\t"
     "#	movups	%%xmmZ, %%xmmY\n\t"
     "#	shufps	$0x44, %%xmmA, %%xmmZ	# b01000100\n\t"
     "#	shufps	$0xee, %%xmmY, %%xmmA	# b11101110\n\t"
     "#	addps	%%xmmZ, %%xmmA\n\t"
     "#	addps	%%xmmA, %%xmmC\n\t"
     "# A=xmm0, B=xmm2, Z=xmm4\n\t"
     "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
     "	movups	16(%%r9), %%xmm1\n\t"
     "	movups	%%xmm0, %%xmm4\n\t"
     "	mulps	%%xmm2, %%xmm0\n\t"
     "	shufps	$0xb1, %%xmm4, %%xmm4	# swap internals\n\t"
     "	movups	16(%%r10), %%xmm3\n\t"
     "	movups	%%xmm1, %%xmm5\n\t"
     "	addps	%%xmm0, %%xmm6\n\t"
     "	mulps	%%xmm3, %%xmm1\n\t"
     "	shufps	$0xb1, %%xmm5, %%xmm5	# swap internals\n\t"
     "	addps	%%xmm1, %%xmm6\n\t"
     "	mulps	%%xmm4, %%xmm2\n\t"
     "	movups	32(%%r9), %%xmm0\n\t"
     "	addps	%%xmm2, %%xmm7\n\t"
     "	mulps	%%xmm5, %%xmm3\n\t"
     "	add	$32, %%r9\n\t"
     "	movups	32(%%r10), %%xmm2\n\t"
     "	addps	%%xmm3, %%xmm7\n\t"
     "	add	$32, %%r10\n\t"
     ".%=L1_test:\n\t"
     "	dec	%%rax\n\t"
     "	jge	.%=Loop1\n\t"
     "	# We've handled the bulk of multiplies up to here.\n\t"
     "	# Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
     "	# If so, we've got 2 more taps to do.\n\t"
     "	and	$1, %%r8\n\t"
     "	je	.%=Leven\n\t"
     "	# The count was odd, do 2 more taps.\n\t"
     "	# Note that we've already got mm0/mm2 preloaded\n\t"
     "	# from the main loop.\n\t"
     "	movups	%%xmm0, %%xmm4\n\t"
     "	mulps	%%xmm2, %%xmm0\n\t"
     "	shufps	$0xb1, %%xmm4, %%xmm4	# swap internals\n\t"
     "	addps	%%xmm0, %%xmm6\n\t"
     "	mulps	%%xmm4, %%xmm2\n\t"
     "	addps	%%xmm2, %%xmm7\n\t"
     ".%=Leven:\n\t"
     "	# neg inversor\n\t"
     "	xorps	%%xmm1, %%xmm1\n\t"
     "	mov	$0x80000000, %%r9\n\t"
     "	movd	%%r9, %%xmm1\n\t"
     "	shufps	$0x11, %%xmm1, %%xmm1	# b00010001 # 0 -0 0 -0\n\t"
     "	# pfpnacc\n\t"
     "	xorps	%%xmm1, %%xmm6\n\t"
     "	movups	%%xmm6, %%xmm2\n\t"
     "	unpcklps %%xmm7, %%xmm6\n\t"
     "	unpckhps %%xmm7, %%xmm2\n\t"
     "	movups	%%xmm2, %%xmm3\n\t"
     "	shufps	$0x44, %%xmm6, %%xmm2	# b01000100\n\t"
     "	shufps	$0xee, %%xmm3, %%xmm6	# b11101110\n\t"
     "	addps	%%xmm2, %%xmm6\n\t"
     "					# xmm6 = r1 i2 r3 i4\n\t"
     "	movhlps	%%xmm6, %%xmm4		# xmm4 = r3 i4 ?? ??\n\t"
     "	addps	%%xmm4, %%xmm6		# xmm6 = r1+r3 i2+i4 ?? ??\n\t"
     "	movlps	%%xmm6, (%[rdi])		# store low 2x32 bits (complex) to memory\n\t"
     :
     :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result)
     /* Everything the assembly overwrites must be declared: the scratch integer
        registers, all eight SSE registers it uses, the flags (shr/dec/and/xor),
        and memory (it reads input/taps and writes *result through pointers that
        are passed only as register operands). */
     :"rax", "r8", "r9", "r10",
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
      "cc", "memory"
     );


  /* The assembly handles samples in pairs; add the trailing one for odd counts. */
  if(isodd) {
    *result += input[num_points - 1] * taps[num_points - 1];
  }

  return;

}

#endif /* LV_HAVE_SSE && LV_HAVE_64 */
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>

/*!
  \brief Computes sum(input[i] * taps[i]) over num_points complex samples using SSE3 with unaligned loads
  \param result Where the single complex sum is written
  \param input First complex vector
  \param taps Second complex vector
  \param num_points Number of complex samples in each vector
*/
static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_u_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {

  /* Two complex samples fit in one 128-bit register. */
  const unsigned int pairs = num_points/2;

  lv_32fc_t total;
  memset(&total, 0x0, 2*sizeof(float));

  /* Running sum: lanes 0-1 accumulate even-indexed products, lanes 2-3 odd-indexed ones. */
  __m128 acc = _mm_setzero_ps();
  unsigned int k;

  for(k = 0; k < pairs; k++){
    const __m128 in = _mm_loadu_ps((float*)(input + 2 * k)); // ar,ai,br,bi
    const __m128 tp = _mm_loadu_ps((float*)(taps + 2 * k));  // cr,ci,dr,di

    const __m128 tp_re = _mm_moveldup_ps(tp); // cr,cr,dr,dr
    const __m128 tp_im = _mm_movehdup_ps(tp); // ci,ci,di,di

    const __m128 by_re = _mm_mul_ps(in, tp_re); // ar*cr,ai*cr,br*dr,bi*dr

    const __m128 swapped = _mm_shuffle_ps(in, in, 0xB1); // ai,ar,bi,br

    const __m128 by_im = _mm_mul_ps(swapped, tp_im); // ai*ci,ar*ci,bi*di,br*di

    // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
    const __m128 prod = _mm_addsub_ps(by_re, by_im);

    acc = _mm_add_ps(acc, prod);
  }

  /* Spill the two partial complex sums and combine them. */
  __VOLK_ATTR_ALIGNED(16) lv_32fc_t lanes[2];

  _mm_storeu_ps((float*)lanes, acc);

  total += ( lanes[0] + lanes[1] );

  /* An odd sample count leaves one trailing product to add. */
  if(num_points & 1) {
    total += input[num_points - 1] * taps[num_points - 1];
  }

  *result = total;
}

#endif /*LV_HAVE_SSE3*/
|  |  | ||||||
#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

/*!
  \brief Computes sum(input[i] * taps[i]) over num_points complex samples using the SSE4.1 dot-product instruction with unaligned loads
  \param result Where the single complex sum is written
  \param input First complex vector
  \param taps Second complex vector
  \param num_points Number of complex samples in each vector
*/
static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_u_sse4_1(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {

  /* Four complex samples are consumed per vector iteration. */
  const unsigned int blocks = num_points/4;
  const unsigned int leftover = num_points & 3;
  unsigned int k;

  float* in_ptr = (float*)input;
  float* tap_ptr = (float*)taps;
  __m64* out_ptr = (__m64*)result;

  /* Sign-bit mask for lane 0 only; XOR-ing with it negates that lane. */
  static const __m128i neg = {0x000000000000000080000000};

  /* Partial sums. The 0xf1 dot products land in lane 0, the 0xf2 ones in lane 1. */
  __m128 acc_re_re = _mm_setzero_ps(); /* sum of in.re * tap.re */
  __m128 acc_im_im = _mm_setzero_ps(); /* sum of in.im * tap.im */
  __m128 acc_re_im = _mm_setzero_ps(); /* sum of in.re * tap.im */
  __m128 acc_im_re = _mm_setzero_ps(); /* sum of in.im * tap.re */

  for(k = 0; k < blocks; ++k) {
    const __m128 in_lo = _mm_loadu_ps(in_ptr);       /* r0 i0 r1 i1 */
    const __m128 tap_lo = _mm_loadu_ps(tap_ptr);
    const __m128 in_hi = _mm_loadu_ps(in_ptr + 4);   /* r2 i2 r3 i3 */
    const __m128 tap_hi = _mm_loadu_ps(tap_ptr + 4);

    /* First unpack stage: r1 r3 i1 i3 and r0 r2 i0 i2. */
    const __m128 in_mix_hi = _mm_unpackhi_ps(in_lo, in_hi);
    const __m128 tap_mix_hi = _mm_unpackhi_ps(tap_lo, tap_hi);
    const __m128 in_mix_lo = _mm_unpacklo_ps(in_lo, in_hi);
    const __m128 tap_mix_lo = _mm_unpacklo_ps(tap_lo, tap_hi);

    /* Second unpack stage separates the parts: i0 i1 i2 i3 and r0 r1 r2 r3. */
    const __m128 in_im = _mm_unpackhi_ps(in_mix_lo, in_mix_hi);
    const __m128 in_re = _mm_unpacklo_ps(in_mix_lo, in_mix_hi);
    const __m128 tap_im = _mm_unpackhi_ps(tap_mix_lo, tap_mix_hi);
    const __m128 tap_re = _mm_unpacklo_ps(tap_mix_lo, tap_mix_hi);

    acc_re_re = _mm_add_ps(_mm_dp_ps(in_re, tap_re, 0xf1), acc_re_re);
    acc_im_im = _mm_add_ps(_mm_dp_ps(in_im, tap_im, 0xf1), acc_im_im);
    acc_re_im = _mm_add_ps(_mm_dp_ps(in_re, tap_im, 0xf2), acc_re_im);
    acc_im_re = _mm_add_ps(_mm_dp_ps(in_im, tap_re, 0xf2), acc_im_re);

    in_ptr += 8;
    tap_ptr += 8;
  }

  /* Real part is re*re - im*im, so flip the sign of the im*im sum before adding. */
  acc_im_im = _mm_xor_ps(acc_im_im, bit128_p(&neg)->float_vec);

  acc_re_im = _mm_add_ps(acc_re_im, acc_im_re);
  acc_re_re = _mm_add_ps(acc_re_re, acc_im_im);

  /* Merge: lane 0 now holds the real part, lane 1 the imaginary part. */
  acc_re_im = _mm_add_ps(acc_re_im, acc_re_re);

  _mm_storel_pi(out_ptr, acc_re_im);

  /* Scalar tail for the last (num_points % 4) samples. */
  for(k = num_points - leftover; k < num_points; k++) {
    *result += input[k] * taps[k];
  }
}

#endif /*LV_HAVE_SSE4_1*/
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| #endif /*INCLUDED_volk_gnsssdr_32fc_x2_dot_prod_32fc_u_H*/ |  | ||||||
| #ifndef INCLUDED_volk_gnsssdr_32fc_x2_dot_prod_32fc_a_H |  | ||||||
| #define INCLUDED_volk_gnsssdr_32fc_x2_dot_prod_32fc_a_H |  | ||||||
|  |  | ||||||
| #include <volk_gnsssdr/volk_gnsssdr_common.h> |  | ||||||
| #include <volk_gnsssdr/volk_gnsssdr_complex.h> |  | ||||||
| #include <stdio.h> |  | ||||||
| #include <string.h> |  | ||||||
|  |  | ||||||
|  |  | ||||||
#ifdef LV_HAVE_GENERIC

/*!
  \brief Computes sum(input[i] * taps[i]) over num_points complex samples (no conjugation)
  \param result Where the single complex sum is written
  \param input First complex vector
  \param taps Second complex vector
  \param num_points Number of complex samples in each vector
*/
static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_a_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {

  /* Work on the interleaved float representation: re, im, re, im, ... */
  float * res = (float*) result;
  float * in = (float*) input;
  float * tp = (float*) taps;

  /* Number of two-sample blocks. Computed directly from num_points: deriving it
     from a byte count (num_points*8) wraps in unsigned arithmetic for very
     large vectors. */
  unsigned int n_2_ccomplex_blocks = num_points / 2;
  unsigned int isodd = num_points & 1;

  /* Two independent accumulators: even-indexed and odd-indexed samples. */
  float sum0[2] = {0,0};
  float sum1[2] = {0,0};
  unsigned int i = 0;

  for(i = 0; i < n_2_ccomplex_blocks; ++i) {
    sum0[0] += in[0] * tp[0] - in[1] * tp[1];
    sum0[1] += in[0] * tp[1] + in[1] * tp[0];
    sum1[0] += in[2] * tp[2] - in[3] * tp[3];
    sum1[1] += in[2] * tp[3] + in[3] * tp[2];

    in += 4;
    tp += 4;
  }

  res[0] = sum0[0] + sum1[0];
  res[1] = sum0[1] + sum1[1];

  /* An odd sample count leaves one trailing product to add. */
  if(isodd) {
    *result += input[num_points - 1] * taps[num_points - 1];
  }
}

#endif /*LV_HAVE_GENERIC*/
|  |  | ||||||
|  |  | ||||||
| #if LV_HAVE_SSE && LV_HAVE_64 |  | ||||||
|  |  | ||||||
|  |  | ||||||
| static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_a_sse_64(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { |  | ||||||
|  |  | ||||||
|   const unsigned int num_bytes = num_points*8; |  | ||||||
|   unsigned int isodd = num_points & 1; |  | ||||||
|  |  | ||||||
|   asm |  | ||||||
|     ( |  | ||||||
|      "#  ccomplex_dotprod_generic (float* result, const float *input,\n\t" |  | ||||||
|      "#                         const float *taps, unsigned num_bytes)\n\t" |  | ||||||
|      "#    float sum0 = 0;\n\t" |  | ||||||
|      "#    float sum1 = 0;\n\t" |  | ||||||
|      "#    float sum2 = 0;\n\t" |  | ||||||
|      "#    float sum3 = 0;\n\t" |  | ||||||
|      "#    do {\n\t" |  | ||||||
|      "#      sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t" |  | ||||||
|      "#      sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t" |  | ||||||
|      "#      sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t" |  | ||||||
|      "#      sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t" |  | ||||||
|      "#      input += 4;\n\t" |  | ||||||
|      "#      taps += 4;  \n\t" |  | ||||||
|      "#    } while (--n_2_ccomplex_blocks != 0);\n\t" |  | ||||||
|      "#    result[0] = sum0 + sum2;\n\t" |  | ||||||
|      "#    result[1] = sum1 + sum3;\n\t" |  | ||||||
|      "# TODO: prefetch and better scheduling\n\t" |  | ||||||
|      "  xor    %%r9,  %%r9\n\t" |  | ||||||
|      "  xor    %%r10, %%r10\n\t" |  | ||||||
|      "  movq   %%rcx, %%rax\n\t" |  | ||||||
|      "  movq   %%rcx, %%r8\n\t" |  | ||||||
|      "  movq   %[rsi],  %%r9\n\t" |  | ||||||
|      "  movq   %[rdx], %%r10\n\t" |  | ||||||
|      "	xorps	%%xmm6, %%xmm6		# zero accumulators\n\t" |  | ||||||
|      "	movaps	0(%%r9), %%xmm0\n\t" |  | ||||||
|      "	xorps	%%xmm7, %%xmm7		# zero accumulators\n\t" |  | ||||||
|      "	movaps	0(%%r10), %%xmm2\n\t" |  | ||||||
|      "	shr	$5, %%rax		# rax = n_2_ccomplex_blocks / 2\n\t" |  | ||||||
|      "  shr     $4, %%r8\n\t" |  | ||||||
|      "	jmp	.%=L1_test\n\t" |  | ||||||
|      "	# 4 taps / loop\n\t" |  | ||||||
|      "	# something like ?? cycles / loop\n\t" |  | ||||||
|      ".%=Loop1:	\n\t" |  | ||||||
|      "# complex prod: C += A * B,  w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" |  | ||||||
|      "#	movaps	(%%r9), %%xmmA\n\t" |  | ||||||
|      "#	movaps	(%%r10), %%xmmB\n\t" |  | ||||||
|      "#	movaps	%%xmmA, %%xmmZ\n\t" |  | ||||||
|      "#	shufps	$0xb1, %%xmmZ, %%xmmZ	# swap internals\n\t" |  | ||||||
|      "#	mulps	%%xmmB, %%xmmA\n\t" |  | ||||||
|      "#	mulps	%%xmmZ, %%xmmB\n\t" |  | ||||||
|      "#	# SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" |  | ||||||
|      "#	xorps	%%xmmPN, %%xmmA\n\t" |  | ||||||
|      "#	movaps	%%xmmA, %%xmmZ\n\t" |  | ||||||
|      "#	unpcklps %%xmmB, %%xmmA\n\t" |  | ||||||
|      "#	unpckhps %%xmmB, %%xmmZ\n\t" |  | ||||||
|      "#	movaps	%%xmmZ, %%xmmY\n\t" |  | ||||||
|      "#	shufps	$0x44, %%xmmA, %%xmmZ	# b01000100\n\t" |  | ||||||
|      "#	shufps	$0xee, %%xmmY, %%xmmA	# b11101110\n\t" |  | ||||||
|      "#	addps	%%xmmZ, %%xmmA\n\t" |  | ||||||
|      "#	addps	%%xmmA, %%xmmC\n\t" |  | ||||||
|      "# A=xmm0, B=xmm2, Z=xmm4\n\t" |  | ||||||
|      "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" |  | ||||||
|      "	movaps	16(%%r9), %%xmm1\n\t" |  | ||||||
|      "	movaps	%%xmm0, %%xmm4\n\t" |  | ||||||
|      "	mulps	%%xmm2, %%xmm0\n\t" |  | ||||||
|      "	shufps	$0xb1, %%xmm4, %%xmm4	# swap internals\n\t" |  | ||||||
|      "	movaps	16(%%r10), %%xmm3\n\t" |  | ||||||
|      "	movaps	%%xmm1, %%xmm5\n\t" |  | ||||||
|      "	addps	%%xmm0, %%xmm6\n\t" |  | ||||||
|      "	mulps	%%xmm3, %%xmm1\n\t" |  | ||||||
|      "	shufps	$0xb1, %%xmm5, %%xmm5	# swap internals\n\t" |  | ||||||
|      "	addps	%%xmm1, %%xmm6\n\t" |  | ||||||
|      "	mulps	%%xmm4, %%xmm2\n\t" |  | ||||||
|      "	movaps	32(%%r9), %%xmm0\n\t" |  | ||||||
|      "	addps	%%xmm2, %%xmm7\n\t" |  | ||||||
|      "	mulps	%%xmm5, %%xmm3\n\t" |  | ||||||
|      "	add	$32, %%r9\n\t" |  | ||||||
|      "	movaps	32(%%r10), %%xmm2\n\t" |  | ||||||
|      "	addps	%%xmm3, %%xmm7\n\t" |  | ||||||
|      "	add	$32, %%r10\n\t" |  | ||||||
|      ".%=L1_test:\n\t" |  | ||||||
|      "	dec	%%rax\n\t" |  | ||||||
|      "	jge	.%=Loop1\n\t" |  | ||||||
|      "	# We've handled the bulk of multiplies up to here.\n\t" |  | ||||||
|      "	# Let's sse if original n_2_ccomplex_blocks was odd.\n\t" |  | ||||||
|      "	# If so, we've got 2 more taps to do.\n\t" |  | ||||||
|      "	and	$1, %%r8\n\t" |  | ||||||
|      "	je	.%=Leven\n\t" |  | ||||||
|      "	# The count was odd, do 2 more taps.\n\t" |  | ||||||
|      "	# Note that we've already got mm0/mm2 preloaded\n\t" |  | ||||||
|      "	# from the main loop.\n\t" |  | ||||||
|      "	movaps	%%xmm0, %%xmm4\n\t" |  | ||||||
|      "	mulps	%%xmm2, %%xmm0\n\t" |  | ||||||
|      "	shufps	$0xb1, %%xmm4, %%xmm4	# swap internals\n\t" |  | ||||||
|      "	addps	%%xmm0, %%xmm6\n\t" |  | ||||||
|      "	mulps	%%xmm4, %%xmm2\n\t" |  | ||||||
|      "	addps	%%xmm2, %%xmm7\n\t" |  | ||||||
|      ".%=Leven:\n\t" |  | ||||||
|      "	# neg inversor\n\t" |  | ||||||
|      "	xorps	%%xmm1, %%xmm1\n\t" |  | ||||||
|      "	mov	$0x80000000, %%r9\n\t" |  | ||||||
|      "	movd	%%r9, %%xmm1\n\t" |  | ||||||
|      "	shufps	$0x11, %%xmm1, %%xmm1	# b00010001 # 0 -0 0 -0\n\t" |  | ||||||
|      "	# pfpnacc\n\t" |  | ||||||
|      "	xorps	%%xmm1, %%xmm6\n\t" |  | ||||||
|      "	movaps	%%xmm6, %%xmm2\n\t" |  | ||||||
|      "	unpcklps %%xmm7, %%xmm6\n\t" |  | ||||||
|      "	unpckhps %%xmm7, %%xmm2\n\t" |  | ||||||
|      "	movaps	%%xmm2, %%xmm3\n\t" |  | ||||||
|      "	shufps	$0x44, %%xmm6, %%xmm2	# b01000100\n\t" |  | ||||||
|      "	shufps	$0xee, %%xmm3, %%xmm6	# b11101110\n\t" |  | ||||||
|      "	addps	%%xmm2, %%xmm6\n\t" |  | ||||||
|      "					# xmm6 = r1 i2 r3 i4\n\t" |  | ||||||
|      "	movhlps	%%xmm6, %%xmm4		# xmm4 = r3 i4 ?? ??\n\t" |  | ||||||
|      "	addps	%%xmm4, %%xmm6		# xmm6 = r1+r3 i2+i4 ?? ??\n\t" |  | ||||||
|      "	movlps	%%xmm6, (%[rdi])		# store low 2x32 bits (complex) to memory\n\t" |  | ||||||
|      : |  | ||||||
|      :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result) |  | ||||||
|      :"rax", "r8", "r9", "r10" |  | ||||||
|      ); |  | ||||||
|  |  | ||||||
|  |  | ||||||
|   if(isodd) { |  | ||||||
|     *result += input[num_points - 1] * taps[num_points - 1]; |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   return; |  | ||||||
|  |  | ||||||
| } |  | ||||||
|  |  | ||||||
| #endif |  | ||||||
|  |  | ||||||
| #if LV_HAVE_SSE && LV_HAVE_32 |  | ||||||
|  |  | ||||||
| /*! |  | ||||||
|   \brief 32-bit SSE entry point of the aligned complex dot product |  | ||||||
|   \param result Where the complex dot product is written |  | ||||||
|   \param input First complex vector |  | ||||||
|   \param taps Second complex vector |  | ||||||
|   \param num_points The number of complex values in input and taps |  | ||||||
|  |  | ||||||
|   No hand-written 32-bit SSE path is enabled: every call is forwarded to the |  | ||||||
|   generic kernel. The previously disabled (#if 0) assembly fetched its |  | ||||||
|   arguments at fixed offsets from %ebp, which cannot work once this static |  | ||||||
|   inline function is inlined into a caller, so it has been dropped. |  | ||||||
| */ |  | ||||||
| static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_a_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { |  | ||||||
|   volk_gnsssdr_32fc_x2_dot_prod_32fc_a_generic(result, input, taps, num_points); |  | ||||||
| } |  | ||||||
|  |  | ||||||
| #endif /* LV_HAVE_SSE && LV_HAVE_32 */ |  | ||||||
|  |  | ||||||
| #ifdef LV_HAVE_SSE3 |  | ||||||
| #include <pmmintrin.h> |  | ||||||
|  |  | ||||||
| /*! |  | ||||||
|   \brief Computes the complex dot product of two vectors using SSE3 aligned loads |  | ||||||
|   \param result Where the accumulated complex dot product is written |  | ||||||
|   \param input First complex vector; read with _mm_load_ps, so it must be 16-byte aligned |  | ||||||
|   \param taps Second complex vector; read with _mm_load_ps, so it must be 16-byte aligned |  | ||||||
|   \param num_points The number of complex values in input and taps |  | ||||||
| */ |  | ||||||
| static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_a_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { |  | ||||||
|  |  | ||||||
|   // Two complex values are consumed per iteration. The count is taken straight |  | ||||||
|   // from num_points: the former (num_points*8) >> 4 wrapped around in unsigned |  | ||||||
|   // arithmetic for num_points >= 2^29 and cut the vector loop short. |  | ||||||
|   const unsigned int halfPoints = num_points / 2; |  | ||||||
|   const unsigned int isodd = num_points & 1; |  | ||||||
|  |  | ||||||
|   lv_32fc_t dotProduct; |  | ||||||
|   memset(&dotProduct, 0x0, 2*sizeof(float)); |  | ||||||
|  |  | ||||||
|   unsigned int number = 0; |  | ||||||
|  |  | ||||||
|   __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal; |  | ||||||
|  |  | ||||||
|   const lv_32fc_t* a = input; |  | ||||||
|   const lv_32fc_t* b = taps; |  | ||||||
|  |  | ||||||
|   dotProdVal = _mm_setzero_ps(); |  | ||||||
|  |  | ||||||
|   for(;number < halfPoints; number++){ |  | ||||||
|  |  | ||||||
|     x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi |  | ||||||
|     y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di |  | ||||||
|  |  | ||||||
|     yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr |  | ||||||
|     yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di |  | ||||||
|  |  | ||||||
|     tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr |  | ||||||
|  |  | ||||||
|     x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br |  | ||||||
|  |  | ||||||
|     tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di |  | ||||||
|  |  | ||||||
|     z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di |  | ||||||
|  |  | ||||||
|     dotProdVal = _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together |  | ||||||
|  |  | ||||||
|     a += 2; |  | ||||||
|     b += 2; |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2]; |  | ||||||
|  |  | ||||||
|   _mm_store_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector |  | ||||||
|  |  | ||||||
|   // The register holds two partial complex sums; fold them into one value |  | ||||||
|   dotProduct += ( dotProductVector[0] + dotProductVector[1] ); |  | ||||||
|  |  | ||||||
|   // An odd num_points leaves one trailing value the pairwise loop did not reach |  | ||||||
|   if(isodd) { |  | ||||||
|     dotProduct += input[num_points - 1] * taps[num_points - 1]; |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   *result = dotProduct; |  | ||||||
| } |  | ||||||
|  |  | ||||||
| #endif /*LV_HAVE_SSE3*/ |  | ||||||
|  |  | ||||||
| #ifdef LV_HAVE_SSE4_1 |  | ||||||
| #include <smmintrin.h> |  | ||||||
|  |  | ||||||
| /*! |  | ||||||
|   \brief Computes the complex dot product of two vectors using SSE4.1 dot-product instructions |  | ||||||
|   \param result Where the complex dot product is written |  | ||||||
|   \param input First complex vector; read with _mm_load_ps, so it must be 16-byte aligned |  | ||||||
|   \param taps Second complex vector; read with _mm_load_ps, so it must be 16-byte aligned |  | ||||||
|   \param num_points The number of complex values in input and taps |  | ||||||
| */ |  | ||||||
| static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_a_sse4_1(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { |  | ||||||
|  |  | ||||||
|   const unsigned int qtr_points = num_points/4; |  | ||||||
|   unsigned int block; |  | ||||||
|   unsigned int idx; |  | ||||||
|  |  | ||||||
|   // Sign mask applied to the imag*imag accumulator before it is folded in |  | ||||||
|   static const __m128i neg = {0x000000000000000080000000}; |  | ||||||
|  |  | ||||||
|   const float* in_f = (const float*)input; |  | ||||||
|   const float* tp_f = (const float*)taps; |  | ||||||
|  |  | ||||||
|   __m128 accRR = _mm_setzero_ps(); // sum of real(input)*real(taps) |  | ||||||
|   __m128 accII = _mm_setzero_ps(); // sum of imag(input)*imag(taps) |  | ||||||
|   __m128 accRI = _mm_setzero_ps(); // sum of real(input)*imag(taps) |  | ||||||
|   __m128 accIR = _mm_setzero_ps(); // sum of imag(input)*real(taps) |  | ||||||
|  |  | ||||||
|   // Four complex values from each vector per iteration |  | ||||||
|   for(block = 0; block < qtr_points; ++block) { |  | ||||||
|     const __m128 inA = _mm_load_ps(in_f); |  | ||||||
|     const __m128 tpA = _mm_load_ps(tp_f); |  | ||||||
|     const __m128 inB = _mm_load_ps(in_f + 4); |  | ||||||
|     const __m128 tpB = _mm_load_ps(tp_f + 4); |  | ||||||
|  |  | ||||||
|     in_f += 8; |  | ||||||
|     tp_f += 8; |  | ||||||
|  |  | ||||||
|     // Two rounds of unpacking de-interleave real and imaginary parts |  | ||||||
|     const __m128 inHi = _mm_unpackhi_ps(inA, inB); |  | ||||||
|     const __m128 tpHi = _mm_unpackhi_ps(tpA, tpB); |  | ||||||
|     const __m128 inLo = _mm_unpacklo_ps(inA, inB); |  | ||||||
|     const __m128 tpLo = _mm_unpacklo_ps(tpA, tpB); |  | ||||||
|  |  | ||||||
|     const __m128 inImag = _mm_unpackhi_ps(inLo, inHi); // imaginary vector from input |  | ||||||
|     const __m128 inReal = _mm_unpacklo_ps(inLo, inHi); // real vector from input |  | ||||||
|     const __m128 tpImag = _mm_unpackhi_ps(tpLo, tpHi); // imaginary vector from taps |  | ||||||
|     const __m128 tpReal = _mm_unpacklo_ps(tpLo, tpHi); // real vector from taps |  | ||||||
|  |  | ||||||
|     // Mask 0xf1 puts the 4-element dot product in lane 0, mask 0xf2 in lane 1 |  | ||||||
|     const __m128 rr = _mm_dp_ps(inReal, tpReal, 0xf1); |  | ||||||
|     const __m128 ii = _mm_dp_ps(inImag, tpImag, 0xf1); |  | ||||||
|     const __m128 ri = _mm_dp_ps(inReal, tpImag, 0xf2); |  | ||||||
|     const __m128 ir = _mm_dp_ps(inImag, tpReal, 0xf2); |  | ||||||
|  |  | ||||||
|     accRR = _mm_add_ps(rr, accRR); |  | ||||||
|     accII = _mm_add_ps(ii, accII); |  | ||||||
|     accRI = _mm_add_ps(ri, accRI); |  | ||||||
|     accIR = _mm_add_ps(ir, accIR); |  | ||||||
|   } |  | ||||||
|  |  | ||||||
|   // Apply the sign mask so the following add subtracts the imag*imag sum |  | ||||||
|   accII = _mm_xor_ps(accII, bit128_p(&neg)->float_vec); |  | ||||||
|  |  | ||||||
|   accRI = _mm_add_ps(accRI, accIR); |  | ||||||
|   accRR = _mm_add_ps(accRR, accII); |  | ||||||
|  |  | ||||||
|   // Merge the real part (lane 0) with the imaginary part (lane 1) |  | ||||||
|   accRI = _mm_add_ps(accRI, accRR); |  | ||||||
|  |  | ||||||
|   // Write the low two floats, i.e. one complex value, to result |  | ||||||
|   _mm_storel_pi((__m64*)result, accRI); |  | ||||||
|  |  | ||||||
|   // Up to three trailing values were not covered by the blocks of four |  | ||||||
|   for(idx = qtr_points * 4; idx < num_points; idx++) { |  | ||||||
|     *result += input[idx] * taps[idx]; |  | ||||||
|   } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| #endif /*LV_HAVE_SSE4_1*/ |  | ||||||
|  |  | ||||||
| #endif /*INCLUDED_volk_gnsssdr_32fc_x2_dot_prod_32fc_a_H*/ |  | ||||||
| @@ -1,170 +0,0 @@ | |||||||
| #ifndef INCLUDED_volk_gnsssdr_32fc_x2_multiply_32fc_u_H |  | ||||||
| #define INCLUDED_volk_gnsssdr_32fc_x2_multiply_32fc_u_H |  | ||||||
|  |  | ||||||
| #include <inttypes.h> |  | ||||||
| #include <stdio.h> |  | ||||||
| #include <volk_gnsssdr/volk_gnsssdr_complex.h> |  | ||||||
| #include <float.h> |  | ||||||
|  |  | ||||||
| #ifdef LV_HAVE_SSE3 |  | ||||||
| #include <pmmintrin.h> |  | ||||||
|   /*! |  | ||||||
|     \brief Element-wise complex multiplication of two vectors using SSE3 unaligned loads and stores |  | ||||||
|     \param cVector Destination vector; receives aVector[i] * bVector[i] |  | ||||||
|     \param aVector First factor vector |  | ||||||
|     \param bVector Second factor vector |  | ||||||
|     \param num_points The number of complex values to multiply and store |  | ||||||
|   */ |  | ||||||
| static inline void volk_gnsssdr_32fc_x2_multiply_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ |  | ||||||
|     const unsigned int pairCount = num_points / 2; |  | ||||||
|     unsigned int pair; |  | ||||||
|  |  | ||||||
|     lv_32fc_t* out = cVector; |  | ||||||
|     const lv_32fc_t* lhs = aVector; |  | ||||||
|     const lv_32fc_t* rhs = bVector; |  | ||||||
|  |  | ||||||
|     // Two complex products per iteration |  | ||||||
|     for(pair = 0; pair < pairCount; ++pair){ |  | ||||||
|       const __m128 lhsVal = _mm_loadu_ps((const float*)lhs); // ar,ai,br,bi |  | ||||||
|       const __m128 rhsVal = _mm_loadu_ps((const float*)rhs); // cr,ci,dr,di |  | ||||||
|  |  | ||||||
|       const __m128 rhsReal = _mm_moveldup_ps(rhsVal); // cr,cr,dr,dr |  | ||||||
|       const __m128 rhsImag = _mm_movehdup_ps(rhsVal); // ci,ci,di,di |  | ||||||
|  |  | ||||||
|       const __m128 prodReal = _mm_mul_ps(lhsVal, rhsReal); // ar*cr,ai*cr,br*dr,bi*dr |  | ||||||
|       const __m128 lhsSwapped = _mm_shuffle_ps(lhsVal, lhsVal, 0xB1); // ai,ar,bi,br |  | ||||||
|       const __m128 prodImag = _mm_mul_ps(lhsSwapped, rhsImag); // ai*ci,ar*ci,bi*di,br*di |  | ||||||
|  |  | ||||||
|       // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di |  | ||||||
|       _mm_storeu_ps((float*)out, _mm_addsub_ps(prodReal, prodImag)); |  | ||||||
|  |  | ||||||
|       lhs += 2; |  | ||||||
|       rhs += 2; |  | ||||||
|       out += 2; |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     // An odd num_points leaves one value for the scalar path |  | ||||||
|     if(num_points & 1) { |  | ||||||
|       out[0] = lhs[0] * rhs[0]; |  | ||||||
|     } |  | ||||||
| } |  | ||||||
| #endif /* LV_HAVE_SSE3 */ |  | ||||||
|  |  | ||||||
| #ifdef LV_HAVE_GENERIC |  | ||||||
|   /*! |  | ||||||
|     \brief Element-wise complex multiplication of two vectors, portable scalar version |  | ||||||
|     \param cVector Destination vector; receives aVector[i] * bVector[i] |  | ||||||
|     \param aVector First factor vector |  | ||||||
|     \param bVector Second factor vector |  | ||||||
|     \param num_points The number of complex values to multiply and store |  | ||||||
|   */ |  | ||||||
| static inline void volk_gnsssdr_32fc_x2_multiply_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ |  | ||||||
|     unsigned int idx; |  | ||||||
|  |  | ||||||
|     // One product per element, in ascending index order |  | ||||||
|     for(idx = 0; idx < num_points; ++idx){ |  | ||||||
|       cVector[idx] = aVector[idx] * bVector[idx]; |  | ||||||
|     } |  | ||||||
| } |  | ||||||
| #endif /* LV_HAVE_GENERIC */ |  | ||||||
|  |  | ||||||
|  |  | ||||||
| #endif /* INCLUDED_volk_gnsssdr_32fc_x2_multiply_32fc_u_H */ |  | ||||||
| #ifndef INCLUDED_volk_gnsssdr_32fc_x2_multiply_32fc_a_H |  | ||||||
| #define INCLUDED_volk_gnsssdr_32fc_x2_multiply_32fc_a_H |  | ||||||
|  |  | ||||||
| #include <inttypes.h> |  | ||||||
| #include <stdio.h> |  | ||||||
| #include <volk_gnsssdr/volk_gnsssdr_complex.h> |  | ||||||
| #include <float.h> |  | ||||||
|  |  | ||||||
| #ifdef LV_HAVE_SSE3 |  | ||||||
| #include <pmmintrin.h> |  | ||||||
|   /*! |  | ||||||
|     \brief Element-wise complex multiplication of two vectors using SSE3 aligned loads and stores |  | ||||||
|     \param cVector Destination vector; written with _mm_store_ps, so it must be 16-byte aligned |  | ||||||
|     \param aVector First factor vector; read with _mm_load_ps, so it must be 16-byte aligned |  | ||||||
|     \param bVector Second factor vector; read with _mm_load_ps, so it must be 16-byte aligned |  | ||||||
|     \param num_points The number of complex values to multiply and store |  | ||||||
|   */ |  | ||||||
| static inline void volk_gnsssdr_32fc_x2_multiply_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ |  | ||||||
|     const unsigned int pairCount = num_points / 2; |  | ||||||
|     unsigned int pair; |  | ||||||
|  |  | ||||||
|     lv_32fc_t* out = cVector; |  | ||||||
|     const lv_32fc_t* lhs = aVector; |  | ||||||
|     const lv_32fc_t* rhs = bVector; |  | ||||||
|  |  | ||||||
|     // Two complex products per iteration |  | ||||||
|     for(pair = 0; pair < pairCount; ++pair){ |  | ||||||
|       const __m128 lhsVal = _mm_load_ps((const float*)lhs); // ar,ai,br,bi |  | ||||||
|       const __m128 rhsVal = _mm_load_ps((const float*)rhs); // cr,ci,dr,di |  | ||||||
|  |  | ||||||
|       const __m128 rhsReal = _mm_moveldup_ps(rhsVal); // cr,cr,dr,dr |  | ||||||
|       const __m128 rhsImag = _mm_movehdup_ps(rhsVal); // ci,ci,di,di |  | ||||||
|  |  | ||||||
|       const __m128 prodReal = _mm_mul_ps(lhsVal, rhsReal); // ar*cr,ai*cr,br*dr,bi*dr |  | ||||||
|       const __m128 lhsSwapped = _mm_shuffle_ps(lhsVal, lhsVal, 0xB1); // ai,ar,bi,br |  | ||||||
|       const __m128 prodImag = _mm_mul_ps(lhsSwapped, rhsImag); // ai*ci,ar*ci,bi*di,br*di |  | ||||||
|  |  | ||||||
|       // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di |  | ||||||
|       _mm_store_ps((float*)out, _mm_addsub_ps(prodReal, prodImag)); |  | ||||||
|  |  | ||||||
|       lhs += 2; |  | ||||||
|       rhs += 2; |  | ||||||
|       out += 2; |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     // An odd num_points leaves one value for the scalar path |  | ||||||
|     if(num_points & 1) { |  | ||||||
|       out[0] = lhs[0] * rhs[0]; |  | ||||||
|     } |  | ||||||
| } |  | ||||||
| #endif /* LV_HAVE_SSE3 */ |  | ||||||
|  |  | ||||||
| #ifdef LV_HAVE_GENERIC |  | ||||||
|   /*! |  | ||||||
|     \brief Element-wise complex multiplication of two vectors, portable scalar version |  | ||||||
|     \param cVector Destination vector; receives aVector[i] * bVector[i] |  | ||||||
|     \param aVector First factor vector |  | ||||||
|     \param bVector Second factor vector |  | ||||||
|     \param num_points The number of complex values to multiply and store |  | ||||||
|   */ |  | ||||||
| static inline void volk_gnsssdr_32fc_x2_multiply_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ |  | ||||||
|     lv_32fc_t* dst = cVector; |  | ||||||
|     const lv_32fc_t* lhs = aVector; |  | ||||||
|     const lv_32fc_t* rhs = bVector; |  | ||||||
|     unsigned int remaining = num_points; |  | ||||||
|  |  | ||||||
|     // Walk the three vectors in lock-step until every element is processed |  | ||||||
|     while(remaining != 0){ |  | ||||||
|       *dst = (*lhs) * (*rhs); |  | ||||||
|       ++dst; |  | ||||||
|       ++lhs; |  | ||||||
|       ++rhs; |  | ||||||
|       --remaining; |  | ||||||
|     } |  | ||||||
| } |  | ||||||
| #endif /* LV_HAVE_GENERIC */ |  | ||||||
|  |  | ||||||
| #ifdef LV_HAVE_ORC |  | ||||||
|   /*! |  | ||||||
|     \brief Multiplies the two input complex vectors and stores their results in the third vector |  | ||||||
|     \param cVector The vector where the results will be stored |  | ||||||
|     \param aVector One of the vectors to be multiplied |  | ||||||
|     \param bVector One of the vectors to be multiplied |  | ||||||
|     \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector |  | ||||||
|  |  | ||||||
|     Thin wrapper: all four arguments are forwarded unchanged to the externally |  | ||||||
|     defined volk_gnsssdr_32fc_x2_multiply_32fc_a_orc_impl declared just below. |  | ||||||
|   */ |  | ||||||
| extern void volk_gnsssdr_32fc_x2_multiply_32fc_a_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points); |  | ||||||
| static inline void volk_gnsssdr_32fc_x2_multiply_32fc_u_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ |  | ||||||
|     volk_gnsssdr_32fc_x2_multiply_32fc_a_orc_impl(cVector, aVector, bVector, num_points); |  | ||||||
| } |  | ||||||
| #endif /* LV_HAVE_ORC */ |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| #endif /* INCLUDED_volk_gnsssdr_32fc_x2_multiply_32fc_a_H */ |  | ||||||
| @@ -1,3 +1,49 @@ | |||||||
|  | /*! | ||||||
|  |  * \file volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_u.h | ||||||
|  |  * \brief Volk protokernel: performs the carrier wipe-off mixing and the Early, Prompt and Late correlation with 64 bits vectors | ||||||
|  |  * \authors <ul> | ||||||
|  |  *          <li>Javier Arribas, 2011. jarribas(at)cttc.es | ||||||
|  |  *          <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com | ||||||
|  |  *          </ul> | ||||||
|  |  * | ||||||
|  |  * Volk protokernel that performs the carrier wipe-off mixing and the | ||||||
|  |  * Early, Prompt and Late correlation with 64 bits vectors (32 bits the | ||||||
|  |  * real part and 32 bits the imaginary part): | ||||||
|  |  * - The carrier wipe-off is done by multiplying the input signal by the | ||||||
|  |  * carrier (multiplication of 64-bit vectors). It returns the input | ||||||
|  |  * signal in baseband (BB) | ||||||
|  |  * - Early values are calculated by multiplying the input signal in BB by the | ||||||
|  |  * early code (multiplication of 64 bits vectors), accumulating the results | ||||||
|  |  * - Prompt values are calculated by multiplying the input signal in BB by the | ||||||
|  |  * prompt code (multiplication of 64 bits vectors), accumulating the results | ||||||
|  |  * - Late values are calculated by multiplying the input signal in BB by the | ||||||
|  |  * late code (multiplication of 64 bits vectors), accumulating the results | ||||||
|  |  * | ||||||
|  |  * ------------------------------------------------------------------------- | ||||||
|  |  * | ||||||
|  |  * Copyright (C) 2010-2014  (see AUTHORS file for a list of contributors) | ||||||
|  |  * | ||||||
|  |  * GNSS-SDR is a software defined Global Navigation | ||||||
|  |  *          Satellite Systems receiver | ||||||
|  |  * | ||||||
|  |  * This file is part of GNSS-SDR. | ||||||
|  |  * | ||||||
|  |  * GNSS-SDR is free software: you can redistribute it and/or modify | ||||||
|  |  * it under the terms of the GNU General Public License as published by | ||||||
|  |  * the Free Software Foundation, either version 3 of the License, or | ||||||
|  |  * (at your option) any later version. | ||||||
|  |  * | ||||||
|  |  * GNSS-SDR is distributed in the hope that it will be useful, | ||||||
|  |  * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||||
|  |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||||
|  |  * GNU General Public License for more details. | ||||||
|  |  * | ||||||
|  |  * You should have received a copy of the GNU General Public License | ||||||
|  |  * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>. | ||||||
|  |  * | ||||||
|  |  * ------------------------------------------------------------------------- | ||||||
|  |  */ | ||||||
|  |  | ||||||
| #ifndef INCLUDED_gnsssdr_volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_u_H | #ifndef INCLUDED_gnsssdr_volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_u_H | ||||||
| #define INCLUDED_gnsssdr_volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_u_H | #define INCLUDED_gnsssdr_volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_u_H | ||||||
|  |  | ||||||
|   | |||||||
| @@ -24,17 +24,6 @@ | |||||||
| #include <volk_gnsssdr/volk_gnsssdr.h> | #include <volk_gnsssdr/volk_gnsssdr.h> | ||||||
| #include <boost/test/unit_test.hpp> | #include <boost/test/unit_test.hpp> | ||||||
|  |  | ||||||
| //VOLK PROTOKERNELS OBTAINED FROM THE GNURADIO BASE |  | ||||||
| VOLK_RUN_TESTS(volk_gnsssdr_32fc_x2_multiply_32fc, 1e-4, 0, 20462, 1); |  | ||||||
| VOLK_RUN_TESTS(volk_gnsssdr_32fc_x2_dot_prod_32fc, 1e-4, 0, 204603, 1); |  | ||||||
| VOLK_RUN_TESTS(volk_gnsssdr_32fc_s32fc_multiply_32fc, 1e-4, 0, 20462, 1); |  | ||||||
| VOLK_RUN_TESTS(volk_gnsssdr_32fc_conjugate_32fc, 1e-4, 0, 20462, 1); |  | ||||||
| VOLK_RUN_TESTS(volk_gnsssdr_32f_x2_add_32f, 1e-4, 0, 20462, 1); |  | ||||||
| VOLK_RUN_TESTS(volk_gnsssdr_32f_index_max_16u, 3, 0, 20462, 1); |  | ||||||
| VOLK_RUN_TESTS(volk_gnsssdr_32f_accumulator_s32f, 1e-4, 0, 20462, 1); |  | ||||||
| VOLK_RUN_TESTS(volk_gnsssdr_32fc_magnitude_squared_32f, 1e-4, 0, 20462, 1); |  | ||||||
| VOLK_RUN_TESTS(volk_gnsssdr_32f_s32f_convert_16i, 3, 0, 20462, 1); |  | ||||||
|  |  | ||||||
| //GNSS-SDR PROTO-KERNELS | //GNSS-SDR PROTO-KERNELS | ||||||
| VOLK_RUN_TESTS(volk_gnsssdr_8ic_x2_multiply_8ic, 1e-4, 0, 20462, 1); | VOLK_RUN_TESTS(volk_gnsssdr_8ic_x2_multiply_8ic, 1e-4, 0, 20462, 1); | ||||||
| VOLK_RUN_TESTS(volk_gnsssdr_8u_x2_multiply_8u, 1e-4, 0, 20462, 1); | VOLK_RUN_TESTS(volk_gnsssdr_8u_x2_multiply_8u, 1e-4, 0, 20462, 1); | ||||||
| @@ -52,7 +41,6 @@ VOLK_RUN_TESTS(volk_gnsssdr_64f_accumulator_64f, 3, 0, 20462, 1); | |||||||
| VOLK_RUN_TESTS(volk_gnsssdr_32fc_convert_16ic, 3, 0, 20462, 1); | VOLK_RUN_TESTS(volk_gnsssdr_32fc_convert_16ic, 3, 0, 20462, 1); | ||||||
| VOLK_RUN_TESTS(volk_gnsssdr_32fc_s32f_convert_8ic, 3, 0, 20462, 1); | VOLK_RUN_TESTS(volk_gnsssdr_32fc_s32f_convert_8ic, 3, 0, 20462, 1); | ||||||
| VOLK_RUN_TESTS(volk_gnsssdr_32fc_convert_8ic, 3, 0, 20462, 1); | VOLK_RUN_TESTS(volk_gnsssdr_32fc_convert_8ic, 3, 0, 20462, 1); | ||||||
| VOLK_RUN_TESTS(volk_gnsssdr_16i_s32f_convert_32f, 3, 0, 20462, 1); |  | ||||||
|  |  | ||||||
| VOLK_RUN_TESTS(volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3, 1e-4, 0, 20462, 1); | VOLK_RUN_TESTS(volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3, 1e-4, 0, 20462, 1); | ||||||
| VOLK_RUN_TESTS(volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3, 1e-4, 0, 20462, 1); | VOLK_RUN_TESTS(volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3, 1e-4, 0, 20462, 1); | ||||||
|   | |||||||
| @@ -244,7 +244,6 @@ void galileo_volk_e1_dll_pll_veml_tracking_cc::update_local_code() | |||||||
| { | { | ||||||
|     double tcode_half_chips; |     double tcode_half_chips; | ||||||
|     float rem_code_phase_half_chips; |     float rem_code_phase_half_chips; | ||||||
|     int associated_chip_index; |  | ||||||
|     int code_length_half_chips = static_cast<int>(Galileo_E1_B_CODE_LENGTH_CHIPS) * 2; |     int code_length_half_chips = static_cast<int>(Galileo_E1_B_CODE_LENGTH_CHIPS) * 2; | ||||||
|     double code_phase_step_chips; |     double code_phase_step_chips; | ||||||
|     double code_phase_step_half_chips; |     double code_phase_step_half_chips; | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	 andres
					andres