mirror of
https://github.com/gnss-sdr/gnss-sdr
synced 2025-04-19 01:03:15 +00:00
Added cmake version check. Deleted original volk
Added cmake version checks. Deleted original volk protokernels. Fixes.
This commit is contained in:
parent
4128b059b1
commit
db738c7ea9
CMakeLists.txt
src/algorithms
libs/volk_gnsssdr
apps
kernels/volk_gnsssdr
volk_gnsssdr_16i_s32f_convert_32f.hvolk_gnsssdr_32f_accumulator_s32f.hvolk_gnsssdr_32f_index_max_16u.hvolk_gnsssdr_32f_s32f_convert_16i.hvolk_gnsssdr_32f_x2_add_32f.hvolk_gnsssdr_32fc_conjugate_32fc.hvolk_gnsssdr_32fc_magnitude_squared_32f.hvolk_gnsssdr_32fc_s32f_x4_update_local_code_32fc.hvolk_gnsssdr_32fc_s32fc_multiply_32fc.hvolk_gnsssdr_32fc_x2_dot_prod_32fc.hvolk_gnsssdr_32fc_x2_multiply_32fc.hvolk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3.h
lib
tracking/gnuradio_blocks
@ -205,6 +205,14 @@ set(CMAKE_BUILD_TYPE ${CMAKE_BUILD_TYPE} CACHE STRING "")
|
||||
# Append -O2 optimization flag for Debug builds
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O2")

################################################################################
# Check CMake version
################################################################################
# Abort the configure step early when the running CMake is older than 2.8.8.
# The STATUS line explains the reason; the FATAL_ERROR line stops processing.
if(CMAKE_VERSION VERSION_LESS 2.8.8)
    message(STATUS "Your CMake version is too old and does not support some features required by GNSS-SDR. CMake version must be at least 2.8.8. For more information check https://github.com/joakimkarlsson/bandit/issues/40")
    message(FATAL_ERROR "Fatal error: CMake >= 2.8.8 required.")
endif()
|
||||
|
||||
################################################################################
|
||||
# Check compiler version
|
||||
################################################################################
|
||||
|
@ -179,27 +179,18 @@ int main(int argc, char *argv[]) {
|
||||
VOLK_PROFILE(volk_gnsssdr_32fc_convert_8ic, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex);
|
||||
VOLK_PROFILE(volk_gnsssdr_32fc_s32f_convert_8ic, 1e-4, 5, 16000, 250, &results, benchmark_mode, kernel_regex);
|
||||
|
||||
VOLK_PROFILE(volk_gnsssdr_32f_accumulator_s32f, 1e-4, 0, 204602, 10000, &results, benchmark_mode, kernel_regex);
|
||||
VOLK_PROFILE(volk_gnsssdr_8i_accumulator_s8i, 1e-4, 0, 204602, 10000, &results, benchmark_mode, kernel_regex);
|
||||
VOLK_PROFILE(volk_gnsssdr_32f_index_max_16u, 3, 0, 204602, 5000, &results, benchmark_mode, kernel_regex);
|
||||
VOLK_PROFILE(volk_gnsssdr_8i_index_max_16u, 3, 0, 204602, 5000, &results, benchmark_mode, kernel_regex);
|
||||
VOLK_PROFILE(volk_gnsssdr_8i_max_s8i, 3, 0, 204602, 5000, &results, benchmark_mode, kernel_regex);
|
||||
VOLK_PROFILE(volk_gnsssdr_32f_x2_add_32f, 1e-4, 0, 204602, 10000, &results, benchmark_mode, kernel_regex);
|
||||
VOLK_PROFILE(volk_gnsssdr_8i_x2_add_8i, 1e-4, 0, 204602, 10000, &results, benchmark_mode, kernel_regex);
|
||||
VOLK_PROFILE(volk_gnsssdr_32fc_conjugate_32fc, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
|
||||
VOLK_PROFILE(volk_gnsssdr_8ic_conjugate_8ic, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
|
||||
VOLK_PROFILE(volk_gnsssdr_32fc_magnitude_squared_32f, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
|
||||
VOLK_PROFILE(volk_gnsssdr_8ic_magnitude_squared_8i, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
|
||||
VOLK_PROFILE(volk_gnsssdr_32fc_s32fc_multiply_32fc, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
|
||||
VOLK_PROFILE(volk_gnsssdr_8ic_s8ic_multiply_8ic, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
|
||||
VOLK_PROFILE(volk_gnsssdr_32fc_x2_dot_prod_32fc, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
|
||||
VOLK_PROFILE(volk_gnsssdr_8ic_x2_dot_prod_8ic, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
|
||||
VOLK_PROFILE(volk_gnsssdr_32fc_x2_multiply_32fc, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
|
||||
VOLK_PROFILE(volk_gnsssdr_8ic_x2_multiply_8ic, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
|
||||
VOLK_PROFILE(volk_gnsssdr_8u_x2_multiply_8u, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
|
||||
VOLK_PROFILE(volk_gnsssdr_64f_accumulator_64f, 1e-4, 0, 16000, 1000, &results, benchmark_mode, kernel_regex);
|
||||
VOLK_PROFILE(volk_gnsssdr_32f_s32f_convert_16i, 1e-4, 1, 204602, 250, &results, benchmark_mode, kernel_regex);
|
||||
VOLK_PROFILE(volk_gnsssdr_16i_s32f_convert_32f, 1e-4, 1, 204602, 250, &results, benchmark_mode, kernel_regex);
|
||||
|
||||
// Until we can update the config on a kernel by kernel basis
|
||||
// do not overwrite volk_config when using a regex.
|
||||
|
@ -1,241 +0,0 @@
|
||||
#ifndef INCLUDED_volk_gnsssdr_16i_s32f_convert_32f_u_H
|
||||
#define INCLUDED_volk_gnsssdr_16i_s32f_convert_32f_u_H
|
||||
|
||||
#include <inttypes.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#ifdef LV_HAVE_SSE4_1
|
||||
#include <smmintrin.h>
|
||||
|
||||
/*!
  \brief Converts the 16 bit integer input data into floating point data and divides each converted value by the scalar value (SSE4.1 version)
  \param outputVector The floating point output data buffer
  \param inputVector The 16 bit input data buffer
  \param scalar The value each converted sample is divided by
  \param num_points The number of data values to be converted
  \note Neither buffer needs to be aligned: data is accessed with _mm_loadu_si128 and _mm_storeu_ps
*/
static inline void volk_gnsssdr_16i_s32f_convert_32f_u_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    float* outputVectorPtr = outputVector;
    /* The vector loop multiplies by the reciprocal instead of dividing */
    const __m128 invScalar = _mm_set_ps1(1.0/scalar);
    /* The input is only read, so the const qualifier is kept */
    const int16_t* inputPtr = inputVector;
    __m128i lowerHalf;
    __m128i upperHalf;
    __m128 ret;

    for(; number < eighthPoints; number++){
        /* Load eight 16 bit values */
        lowerHalf = _mm_loadu_si128((const __m128i*)inputPtr);

        /* Shift right by 64 bits (8 bytes) so the upper four values sit in the low half */
        upperHalf = _mm_srli_si128(lowerHalf, 8);

        /* Widen the low four 16 bit values of each register to 32 bit words */
        lowerHalf = _mm_cvtepi16_epi32(lowerHalf);
        upperHalf = _mm_cvtepi16_epi32(upperHalf);

        ret = _mm_mul_ps(_mm_cvtepi32_ps(lowerHalf), invScalar);
        _mm_storeu_ps(outputVectorPtr, ret);
        outputVectorPtr += 4;

        ret = _mm_mul_ps(_mm_cvtepi32_ps(upperHalf), invScalar);
        _mm_storeu_ps(outputVectorPtr, ret);
        outputVectorPtr += 4;

        inputPtr += 8;
    }

    /* Convert the remaining (num_points % 8) values one at a time */
    for(number = eighthPoints * 8; number < num_points; number++){
        outputVector[number] = ((float)(inputVector[number])) / scalar;
    }
}
|
||||
#endif /* LV_HAVE_SSE4_1 */
|
||||
|
||||
#ifdef LV_HAVE_SSE
|
||||
#include <xmmintrin.h>
|
||||
|
||||
/*!
  \brief Converts the 16 bit integer input data into floating point data and divides each converted value by the scalar value (SSE version)
  \param outputVector The floating point output data buffer
  \param inputVector The 16 bit input data buffer
  \param scalar The value each converted sample is divided by
  \param num_points The number of data values to be converted
  \note Neither buffer needs to be aligned: the input is read element by element and the output is written with _mm_storeu_ps
*/
static inline void volk_gnsssdr_16i_s32f_convert_32f_u_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    float* outputVectorPtr = outputVector;
    /* The vector loop multiplies by the reciprocal instead of dividing */
    const __m128 invScalar = _mm_set_ps1(1.0/scalar);
    /* The input is only read, so the const qualifier is kept */
    const int16_t* inputPtr = inputVector;
    __m128 ret;

    for(; number < quarterPoints; number++){
        /* _mm_set_ps takes its arguments from the highest lane down to the lowest */
        ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0]));

        ret = _mm_mul_ps(ret, invScalar);
        _mm_storeu_ps(outputVectorPtr, ret);

        inputPtr += 4;
        outputVectorPtr += 4;
    }

    /* Convert the remaining (num_points % 4) values one at a time */
    for(number = quarterPoints * 4; number < num_points; number++){
        outputVector[number] = (float)(inputVector[number]) / scalar;
    }
}
|
||||
#endif /* LV_HAVE_SSE */
|
||||
|
||||
#ifdef LV_HAVE_GENERIC
|
||||
/*!
  \brief Converts the 16 bit integer input data into floating point data and divides each converted value by the scalar value (generic version)
  \param outputVector The floating point output data buffer
  \param inputVector The 16 bit input data buffer
  \param scalar The value each converted sample is divided by
  \param num_points The number of data values to be converted
  \note Buffers are accessed one element at a time, so no special alignment is needed
*/
static inline void volk_gnsssdr_16i_s32f_convert_32f_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
    float* outputVectorPtr = outputVector;
    const int16_t* inputVectorPtr = inputVector;
    unsigned int number;

    for(number = 0; number < num_points; number++){
        *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar;
    }
}
|
||||
#endif /* LV_HAVE_GENERIC */
|
||||
|
||||
|
||||
|
||||
|
||||
#endif /* INCLUDED_volk_gnsssdr_16i_s32f_convert_32f_u_H */
|
||||
#ifndef INCLUDED_volk_gnsssdr_16i_s32f_convert_32f_a_H
|
||||
#define INCLUDED_volk_gnsssdr_16i_s32f_convert_32f_a_H
|
||||
|
||||
#include <inttypes.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#ifdef LV_HAVE_SSE4_1
|
||||
#include <smmintrin.h>
|
||||
|
||||
/*!
  \brief Converts the 16 bit integer input data into floating point data and divides each converted value by the scalar value (SSE4.1 version)
  \param outputVector The floating point output data buffer
  \param inputVector The 16 bit input data buffer
  \param scalar The value each converted sample is divided by
  \param num_points The number of data values to be converted
  \note Although this is the "_a" variant, the body uses _mm_loadu_si128 and _mm_storeu_ps, so it does not itself depend on buffer alignment
*/
static inline void volk_gnsssdr_16i_s32f_convert_32f_a_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    float* outputVectorPtr = outputVector;
    /* The vector loop multiplies by the reciprocal instead of dividing */
    const __m128 invScalar = _mm_set_ps1(1.0/scalar);
    /* The input is only read, so the const qualifier is kept */
    const int16_t* inputPtr = inputVector;
    __m128i lowerHalf;
    __m128i upperHalf;
    __m128 ret;

    for(; number < eighthPoints; number++){
        /* Load eight 16 bit values */
        lowerHalf = _mm_loadu_si128((const __m128i*)inputPtr);

        /* Shift right by 64 bits (8 bytes) so the upper four values sit in the low half */
        upperHalf = _mm_srli_si128(lowerHalf, 8);

        /* Widen the low four 16 bit values of each register to 32 bit words */
        lowerHalf = _mm_cvtepi16_epi32(lowerHalf);
        upperHalf = _mm_cvtepi16_epi32(upperHalf);

        ret = _mm_mul_ps(_mm_cvtepi32_ps(lowerHalf), invScalar);
        _mm_storeu_ps(outputVectorPtr, ret);
        outputVectorPtr += 4;

        ret = _mm_mul_ps(_mm_cvtepi32_ps(upperHalf), invScalar);
        _mm_storeu_ps(outputVectorPtr, ret);
        outputVectorPtr += 4;

        inputPtr += 8;
    }

    /* Convert the remaining (num_points % 8) values one at a time */
    for(number = eighthPoints * 8; number < num_points; number++){
        outputVector[number] = ((float)(inputVector[number])) / scalar;
    }
}
|
||||
#endif /* LV_HAVE_SSE4_1 */
|
||||
|
||||
#ifdef LV_HAVE_SSE
|
||||
#include <xmmintrin.h>
|
||||
|
||||
/*!
  \brief Converts the 16 bit integer input data into floating point data and divides each converted value by the scalar value (SSE version)
  \param outputVector The floating point output data buffer
  \param inputVector The 16 bit input data buffer
  \param scalar The value each converted sample is divided by
  \param num_points The number of data values to be converted
  \note Although this is the "_a" variant, the output is written with _mm_storeu_ps and the input is read element by element
*/
static inline void volk_gnsssdr_16i_s32f_convert_32f_a_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    float* outputVectorPtr = outputVector;
    /* The vector loop multiplies by the reciprocal instead of dividing */
    const __m128 invScalar = _mm_set_ps1(1.0/scalar);
    /* The input is only read, so the const qualifier is kept */
    const int16_t* inputPtr = inputVector;
    __m128 ret;

    for(; number < quarterPoints; number++){
        /* _mm_set_ps takes its arguments from the highest lane down to the lowest */
        ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0]));

        ret = _mm_mul_ps(ret, invScalar);
        _mm_storeu_ps(outputVectorPtr, ret);

        inputPtr += 4;
        outputVectorPtr += 4;
    }

    /* Convert the remaining (num_points % 4) values one at a time */
    for(number = quarterPoints * 4; number < num_points; number++){
        outputVector[number] = (float)(inputVector[number]) / scalar;
    }
}
|
||||
#endif /* LV_HAVE_SSE */
|
||||
|
||||
#ifdef LV_HAVE_GENERIC
|
||||
/*!
  \brief Converts the 16 bit integer input data into floating point data and divides each converted value by the scalar value (generic version)
  \param outputVector The floating point output data buffer
  \param inputVector The 16 bit input data buffer
  \param scalar The value each converted sample is divided by
  \param num_points The number of data values to be converted
*/
static inline void volk_gnsssdr_16i_s32f_convert_32f_a_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
    float* outputVectorPtr = outputVector;
    const int16_t* inputVectorPtr = inputVector;
    unsigned int number;

    for(number = 0; number < num_points; number++){
        *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar;
    }
}
|
||||
#endif /* LV_HAVE_GENERIC */
|
||||
|
||||
|
||||
|
||||
|
||||
#endif /* INCLUDED_volk_gnsssdr_16i_s32f_convert_32f_a_H */
|
@ -1,68 +0,0 @@
|
||||
#ifndef INCLUDED_volk_gnsssdr_32f_accumulator_s32f_a_H
|
||||
#define INCLUDED_volk_gnsssdr_32f_accumulator_s32f_a_H
|
||||
|
||||
#include <volk_gnsssdr/volk_gnsssdr_common.h>
|
||||
#include <inttypes.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#ifdef LV_HAVE_SSE
|
||||
#include <xmmintrin.h>
|
||||
/*!
|
||||
\brief Accumulates the values in the input buffer
|
||||
\param result The accumulated result
|
||||
\param inputBuffer The buffer of data to be accumulated
|
||||
\param num_points The number of values in inputBuffer to be accumulated
|
||||
*/
|
||||
static inline void volk_gnsssdr_32f_accumulator_s32f_a_sse(float* result, const float* inputBuffer, unsigned int num_points){
|
||||
float returnValue = 0;
|
||||
unsigned int number = 0;
|
||||
const unsigned int quarterPoints = num_points / 4;
|
||||
|
||||
const float* aPtr = inputBuffer;
|
||||
__VOLK_ATTR_ALIGNED(16) float tempBuffer[4];
|
||||
|
||||
__m128 accumulator = _mm_setzero_ps();
|
||||
__m128 aVal = _mm_setzero_ps();
|
||||
|
||||
for(;number < quarterPoints; number++){
|
||||
aVal = _mm_load_ps(aPtr);
|
||||
accumulator = _mm_add_ps(accumulator, aVal);
|
||||
aPtr += 4;
|
||||
}
|
||||
_mm_store_ps(tempBuffer,accumulator); // Store the results back into the C container
|
||||
returnValue = tempBuffer[0];
|
||||
returnValue += tempBuffer[1];
|
||||
returnValue += tempBuffer[2];
|
||||
returnValue += tempBuffer[3];
|
||||
|
||||
number = quarterPoints * 4;
|
||||
for(;number < num_points; number++){
|
||||
returnValue += (*aPtr++);
|
||||
}
|
||||
*result = returnValue;
|
||||
}
|
||||
#endif /* LV_HAVE_SSE */
|
||||
|
||||
#ifdef LV_HAVE_GENERIC
|
||||
/*!
  \brief Accumulates (sums) the values in the input buffer (generic version)
  \param result Receives the accumulated sum; it is set to 0 when num_points is 0
  \param inputBuffer The buffer of data to be accumulated
  \param num_points The number of values in inputBuffer to be accumulated
*/
static inline void volk_gnsssdr_32f_accumulator_s32f_generic(float* result, const float* inputBuffer, unsigned int num_points){
    const float* aPtr = inputBuffer;
    unsigned int number;
    float returnValue = 0;

    for(number = 0; number < num_points; number++){
        returnValue += (*aPtr++);
    }
    *result = returnValue;
}
|
||||
#endif /* LV_HAVE_GENERIC */
|
||||
|
||||
|
||||
|
||||
|
||||
#endif /* INCLUDED_volk_gnsssdr_32f_accumulator_s32f_a_H */
|
@ -1,149 +0,0 @@
|
||||
#ifndef INCLUDED_volk_gnsssdr_32f_index_max_16u_a_H
|
||||
#define INCLUDED_volk_gnsssdr_32f_index_max_16u_a_H
|
||||
|
||||
#include <volk_gnsssdr/volk_gnsssdr_common.h>
|
||||
#include <volk_gnsssdr/volk_gnsssdr_common.h>
|
||||
#include <inttypes.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#ifdef LV_HAVE_SSE4_1
|
||||
#include <smmintrin.h>
|
||||
|
||||
static inline void volk_gnsssdr_32f_index_max_16u_a_sse4_1(unsigned int* target, const float* src0, unsigned int num_points) {
|
||||
if(num_points > 0){
|
||||
unsigned int number = 0;
|
||||
const unsigned int quarterPoints = num_points / 4;
|
||||
|
||||
float* inputPtr = (float*)src0;
|
||||
|
||||
__m128 indexIncrementValues = _mm_set1_ps(4);
|
||||
__m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);
|
||||
|
||||
float max = src0[0];
|
||||
float index = 0;
|
||||
__m128 maxValues = _mm_set1_ps(max);
|
||||
__m128 maxValuesIndex = _mm_setzero_ps();
|
||||
__m128 compareResults;
|
||||
__m128 currentValues;
|
||||
|
||||
__VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
|
||||
__VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
|
||||
|
||||
for(;number < quarterPoints; number++){
|
||||
|
||||
currentValues = _mm_load_ps(inputPtr); inputPtr += 4;
|
||||
currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
|
||||
|
||||
compareResults = _mm_cmpgt_ps(maxValues, currentValues);
|
||||
|
||||
maxValuesIndex = _mm_blendv_ps(currentIndexes, maxValuesIndex, compareResults);
|
||||
maxValues = _mm_blendv_ps(currentValues, maxValues, compareResults);
|
||||
}
|
||||
|
||||
// Calculate the largest value from the remaining 4 points
|
||||
_mm_store_ps(maxValuesBuffer, maxValues);
|
||||
_mm_store_ps(maxIndexesBuffer, maxValuesIndex);
|
||||
|
||||
for(number = 0; number < 4; number++){
|
||||
if(maxValuesBuffer[number] > max){
|
||||
index = maxIndexesBuffer[number];
|
||||
max = maxValuesBuffer[number];
|
||||
}
|
||||
}
|
||||
|
||||
number = quarterPoints * 4;
|
||||
for(;number < num_points; number++){
|
||||
if(src0[number] > max){
|
||||
index = number;
|
||||
max = src0[number];
|
||||
}
|
||||
}
|
||||
target[0] = (unsigned int)index;
|
||||
}
|
||||
}
|
||||
|
||||
#endif /*LV_HAVE_SSE4_1*/
|
||||
|
||||
#ifdef LV_HAVE_SSE
|
||||
#include<xmmintrin.h>
|
||||
|
||||
static inline void volk_gnsssdr_32f_index_max_16u_a_sse(unsigned int* target, const float* src0, unsigned int num_points) {
|
||||
if(num_points > 0){
|
||||
unsigned int number = 0;
|
||||
const unsigned int quarterPoints = num_points / 4;
|
||||
|
||||
float* inputPtr = (float*)src0;
|
||||
|
||||
__m128 indexIncrementValues = _mm_set1_ps(4);
|
||||
__m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);
|
||||
|
||||
float max = src0[0];
|
||||
float index = 0;
|
||||
__m128 maxValues = _mm_set1_ps(max);
|
||||
__m128 maxValuesIndex = _mm_setzero_ps();
|
||||
__m128 compareResults;
|
||||
__m128 currentValues;
|
||||
|
||||
__VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
|
||||
__VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
|
||||
|
||||
for(;number < quarterPoints; number++){
|
||||
|
||||
currentValues = _mm_load_ps(inputPtr); inputPtr += 4;
|
||||
currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
|
||||
|
||||
compareResults = _mm_cmpgt_ps(maxValues, currentValues);
|
||||
|
||||
maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, maxValuesIndex) , _mm_andnot_ps(compareResults, currentIndexes));
|
||||
|
||||
maxValues = _mm_or_ps(_mm_and_ps(compareResults, maxValues) , _mm_andnot_ps(compareResults, currentValues));
|
||||
}
|
||||
|
||||
// Calculate the largest value from the remaining 4 points
|
||||
_mm_store_ps(maxValuesBuffer, maxValues);
|
||||
_mm_store_ps(maxIndexesBuffer, maxValuesIndex);
|
||||
|
||||
for(number = 0; number < 4; number++){
|
||||
if(maxValuesBuffer[number] > max){
|
||||
index = maxIndexesBuffer[number];
|
||||
max = maxValuesBuffer[number];
|
||||
}
|
||||
}
|
||||
|
||||
number = quarterPoints * 4;
|
||||
for(;number < num_points; number++){
|
||||
if(src0[number] > max){
|
||||
index = number;
|
||||
max = src0[number];
|
||||
}
|
||||
}
|
||||
target[0] = (unsigned int)index;
|
||||
}
|
||||
}
|
||||
|
||||
#endif /*LV_HAVE_SSE*/
|
||||
|
||||
#ifdef LV_HAVE_GENERIC
|
||||
/*!
  \brief Finds the index of the largest value in the input buffer (generic version)
  \param target Receives the index of the maximum value; it is left untouched when num_points is 0
  \param src0 The input buffer
  \param num_points The number of values in src0
  \note The comparison is strict, so the first occurrence of the maximum is reported
*/
static inline void volk_gnsssdr_32f_index_max_16u_generic(unsigned int* target, const float* src0, unsigned int num_points) {
    if(num_points > 0){
        float max = src0[0];
        unsigned int index = 0;
        unsigned int i;

        /* Element 0 is the initial maximum, so the scan starts at 1 */
        for(i = 1; i < num_points; ++i) {
            if(src0[i] > max){
                index = i;
                max = src0[i];
            }
        }
        target[0] = index;
    }
}
|
||||
|
||||
#endif /*LV_HAVE_GENERIC*/
|
||||
|
||||
|
||||
#endif /*INCLUDED_volk_gnsssdr_32f_index_max_16u_a_H*/
|
@ -1,302 +0,0 @@
|
||||
#ifndef INCLUDED_volk_gnsssdr_32f_s32f_convert_16i_u_H
|
||||
#define INCLUDED_volk_gnsssdr_32f_s32f_convert_16i_u_H
|
||||
|
||||
#include <inttypes.h>
|
||||
#include <stdio.h>
|
||||
#include <math.h>
|
||||
|
||||
#ifdef LV_HAVE_SSE2
|
||||
#include <emmintrin.h>
|
||||
/*!
|
||||
\brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value
|
||||
\param inputVector The floating point input data buffer
|
||||
\param outputVector The 16 bit output data buffer
|
||||
\param scalar The value multiplied against each point in the input buffer
|
||||
\param num_points The number of data values to be converted
|
||||
\note Input buffer does NOT need to be properly aligned
|
||||
*/
|
||||
static inline void volk_gnsssdr_32f_s32f_convert_16i_u_sse2(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
|
||||
unsigned int number = 0;
|
||||
|
||||
const unsigned int eighthPoints = num_points / 8;
|
||||
|
||||
const float* inputVectorPtr = (const float*)inputVector;
|
||||
int16_t* outputVectorPtr = outputVector;
|
||||
|
||||
float min_val = -32768;
|
||||
float max_val = 32767;
|
||||
float r;
|
||||
|
||||
__m128 vScalar = _mm_set_ps1(scalar);
|
||||
__m128 inputVal1, inputVal2;
|
||||
__m128i intInputVal1, intInputVal2;
|
||||
__m128 ret1, ret2;
|
||||
__m128 vmin_val = _mm_set_ps1(min_val);
|
||||
__m128 vmax_val = _mm_set_ps1(max_val);
|
||||
|
||||
for(;number < eighthPoints; number++){
|
||||
inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
|
||||
inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
|
||||
|
||||
// Scale and clip
|
||||
ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
|
||||
ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
|
||||
|
||||
intInputVal1 = _mm_cvtps_epi32(ret1);
|
||||
intInputVal2 = _mm_cvtps_epi32(ret2);
|
||||
|
||||
intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
|
||||
|
||||
_mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
|
||||
outputVectorPtr += 8;
|
||||
}
|
||||
|
||||
number = eighthPoints * 8;
|
||||
for(; number < num_points; number++){
|
||||
r = inputVector[number] * scalar;
|
||||
if(r > max_val)
|
||||
r = max_val;
|
||||
else if(r < min_val)
|
||||
r = min_val;
|
||||
outputVector[number] = (int16_t)rintf(r);
|
||||
}
|
||||
}
|
||||
#endif /* LV_HAVE_SSE2 */
|
||||
|
||||
#ifdef LV_HAVE_SSE
|
||||
#include <xmmintrin.h>
|
||||
/*!
|
||||
\brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value
|
||||
\param inputVector The floating point input data buffer
|
||||
\param outputVector The 16 bit output data buffer
|
||||
\param scalar The value multiplied against each point in the input buffer
|
||||
\param num_points The number of data values to be converted
|
||||
\note Input buffer does NOT need to be properly aligned
|
||||
*/
|
||||
static inline void volk_gnsssdr_32f_s32f_convert_16i_u_sse(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
|
||||
unsigned int number = 0;
|
||||
|
||||
const unsigned int quarterPoints = num_points / 4;
|
||||
|
||||
const float* inputVectorPtr = (const float*)inputVector;
|
||||
int16_t* outputVectorPtr = outputVector;
|
||||
|
||||
float min_val = -32768;
|
||||
float max_val = 32767;
|
||||
float r;
|
||||
|
||||
__m128 vScalar = _mm_set_ps1(scalar);
|
||||
__m128 ret;
|
||||
__m128 vmin_val = _mm_set_ps1(min_val);
|
||||
__m128 vmax_val = _mm_set_ps1(max_val);
|
||||
|
||||
__VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
|
||||
|
||||
for(;number < quarterPoints; number++){
|
||||
ret = _mm_loadu_ps(inputVectorPtr);
|
||||
inputVectorPtr += 4;
|
||||
|
||||
// Scale and clip
|
||||
ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
|
||||
|
||||
_mm_store_ps(outputFloatBuffer, ret);
|
||||
*outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]);
|
||||
*outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]);
|
||||
*outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]);
|
||||
*outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]);
|
||||
}
|
||||
|
||||
number = quarterPoints * 4;
|
||||
for(; number < num_points; number++){
|
||||
r = inputVector[number] * scalar;
|
||||
if(r > max_val)
|
||||
r = max_val;
|
||||
else if(r < min_val)
|
||||
r = min_val;
|
||||
outputVector[number] = (int16_t)rintf(r);
|
||||
}
|
||||
}
|
||||
#endif /* LV_HAVE_SSE */
|
||||
|
||||
#ifdef LV_HAVE_GENERIC
|
||||
/*!
|
||||
\brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value
|
||||
\param inputVector The floating point input data buffer
|
||||
\param outputVector The 16 bit output data buffer
|
||||
\param scalar The value multiplied against each point in the input buffer
|
||||
\param num_points The number of data values to be converted
|
||||
\note Input buffer does NOT need to be properly aligned
|
||||
*/
|
||||
static inline void volk_gnsssdr_32f_s32f_convert_16i_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
|
||||
int16_t* outputVectorPtr = outputVector;
|
||||
const float* inputVectorPtr = inputVector;
|
||||
unsigned int number = 0;
|
||||
float min_val = -32768;
|
||||
float max_val = 32767;
|
||||
float r;
|
||||
|
||||
for(number = 0; number < num_points; number++){
|
||||
r = *inputVectorPtr++ * scalar;
|
||||
if(r > max_val)
|
||||
r = max_val;
|
||||
else if(r < min_val)
|
||||
r = min_val;
|
||||
*outputVectorPtr++ = (int16_t)rintf(r);
|
||||
}
|
||||
}
|
||||
#endif /* LV_HAVE_GENERIC */
|
||||
|
||||
|
||||
|
||||
|
||||
#endif /* INCLUDED_volk_gnsssdr_32f_s32f_convert_16i_u_H */
|
||||
#ifndef INCLUDED_volk_gnsssdr_32f_s32f_convert_16i_a_H
|
||||
#define INCLUDED_volk_gnsssdr_32f_s32f_convert_16i_a_H
|
||||
|
||||
#include <volk/volk_common.h>
|
||||
#include <inttypes.h>
|
||||
#include <stdio.h>
|
||||
#include <math.h>
|
||||
|
||||
#ifdef LV_HAVE_SSE2
|
||||
#include <emmintrin.h>
|
||||
/*!
|
||||
\brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value
|
||||
\param inputVector The floating point input data buffer
|
||||
\param outputVector The 16 bit output data buffer
|
||||
\param scalar The value multiplied against each point in the input buffer
|
||||
\param num_points The number of data values to be converted
|
||||
*/
|
||||
static inline void volk_gnsssdr_32f_s32f_convert_16i_a_sse2(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
|
||||
unsigned int number = 0;
|
||||
|
||||
const unsigned int eighthPoints = num_points / 8;
|
||||
|
||||
const float* inputVectorPtr = (const float*)inputVector;
|
||||
int16_t* outputVectorPtr = outputVector;
|
||||
|
||||
float min_val = -32768;
|
||||
float max_val = 32767;
|
||||
float r;
|
||||
|
||||
__m128 vScalar = _mm_set_ps1(scalar);
|
||||
__m128 inputVal1, inputVal2;
|
||||
__m128i intInputVal1, intInputVal2;
|
||||
__m128 ret1, ret2;
|
||||
__m128 vmin_val = _mm_set_ps1(min_val);
|
||||
__m128 vmax_val = _mm_set_ps1(max_val);
|
||||
|
||||
for(;number < eighthPoints; number++){
|
||||
inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
|
||||
inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
|
||||
|
||||
// Scale and clip
|
||||
ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
|
||||
ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
|
||||
|
||||
intInputVal1 = _mm_cvtps_epi32(ret1);
|
||||
intInputVal2 = _mm_cvtps_epi32(ret2);
|
||||
|
||||
intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
|
||||
|
||||
_mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
|
||||
outputVectorPtr += 8;
|
||||
}
|
||||
|
||||
number = eighthPoints * 8;
|
||||
for(; number < num_points; number++){
|
||||
r = inputVector[number] * scalar;
|
||||
if(r > max_val)
|
||||
r = max_val;
|
||||
else if(r < min_val)
|
||||
r = min_val;
|
||||
outputVector[number] = (int16_t)rintf(r);
|
||||
}
|
||||
}
|
||||
#endif /* LV_HAVE_SSE2 */
|
||||
|
||||
#ifdef LV_HAVE_SSE
|
||||
#include <xmmintrin.h>
|
||||
/*!
|
||||
\brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value
|
||||
\param inputVector The floating point input data buffer
|
||||
\param outputVector The 16 bit output data buffer
|
||||
\param scalar The value multiplied against each point in the input buffer
|
||||
\param num_points The number of data values to be converted
|
||||
*/
|
||||
static inline void volk_gnsssdr_32f_s32f_convert_16i_a_sse(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
|
||||
unsigned int number = 0;
|
||||
|
||||
const unsigned int quarterPoints = num_points / 4;
|
||||
|
||||
const float* inputVectorPtr = (const float*)inputVector;
|
||||
int16_t* outputVectorPtr = outputVector;
|
||||
|
||||
float min_val = -32768;
|
||||
float max_val = 32767;
|
||||
float r;
|
||||
|
||||
__m128 vScalar = _mm_set_ps1(scalar);
|
||||
__m128 ret;
|
||||
__m128 vmin_val = _mm_set_ps1(min_val);
|
||||
__m128 vmax_val = _mm_set_ps1(max_val);
|
||||
|
||||
__VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
|
||||
|
||||
for(;number < quarterPoints; number++){
|
||||
ret = _mm_load_ps(inputVectorPtr);
|
||||
inputVectorPtr += 4;
|
||||
|
||||
// Scale and clip
|
||||
ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
|
||||
|
||||
_mm_store_ps(outputFloatBuffer, ret);
|
||||
*outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]);
|
||||
*outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]);
|
||||
*outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]);
|
||||
*outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]);
|
||||
}
|
||||
|
||||
number = quarterPoints * 4;
|
||||
for(; number < num_points; number++){
|
||||
r = inputVector[number] * scalar;
|
||||
if(r > max_val)
|
||||
r = max_val;
|
||||
else if(r < min_val)
|
||||
r = min_val;
|
||||
outputVector[number] = (int16_t)rintf(r);
|
||||
}
|
||||
}
|
||||
#endif /* LV_HAVE_SSE */
|
||||
|
||||
#ifdef LV_HAVE_GENERIC
|
||||
/*!
|
||||
\brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value
|
||||
\param inputVector The floating point input data buffer
|
||||
\param outputVector The 16 bit output data buffer
|
||||
\param scalar The value multiplied against each point in the input buffer
|
||||
\param num_points The number of data values to be converted
|
||||
*/
|
||||
static inline void volk_gnsssdr_32f_s32f_convert_16i_a_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
|
||||
int16_t* outputVectorPtr = outputVector;
|
||||
const float* inputVectorPtr = inputVector;
|
||||
unsigned int number = 0;
|
||||
float min_val = -32768;
|
||||
float max_val = 32767;
|
||||
float r;
|
||||
|
||||
for(number = 0; number < num_points; number++){
|
||||
r = *inputVectorPtr++ * scalar;
|
||||
if(r < min_val)
|
||||
r = min_val;
|
||||
else if(r > max_val)
|
||||
r = max_val;
|
||||
*outputVectorPtr++ = (int16_t)rintf(r);
|
||||
}
|
||||
}
|
||||
#endif /* LV_HAVE_GENERIC */
|
||||
|
||||
|
||||
|
||||
|
||||
#endif /* INCLUDED_volk_gnsssdr_32f_s32f_convert_16i_a_H */
|
@ -1,147 +0,0 @@
|
||||
#ifndef INCLUDED_volk_gnsssdr_32f_x2_add_32f_u_H
|
||||
#define INCLUDED_volk_gnsssdr_32f_x2_add_32f_u_H
|
||||
|
||||
#include <inttypes.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#ifdef LV_HAVE_SSE
|
||||
#include <xmmintrin.h>
|
||||
/*!
  \brief Element-wise sum of two float vectors using unaligned SSE loads and stores
  \param cVector Output buffer; cVector[k] = aVector[k] + bVector[k]
  \param aVector First addend vector
  \param bVector Second addend vector
  \param num_points Number of elements read from aVector and bVector and written to cVector
*/
static inline void volk_gnsssdr_32f_x2_add_32f_u_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
    /* Largest multiple of four not exceeding num_points */
    const unsigned int vectorizedLen = (num_points / 4) * 4;
    unsigned int idx;

    for(idx = 0; idx < vectorizedLen; idx += 4){
        const __m128 lhs = _mm_loadu_ps(aVector + idx);
        const __m128 rhs = _mm_loadu_ps(bVector + idx);
        _mm_storeu_ps(cVector + idx, _mm_add_ps(lhs, rhs));
    }

    /* Leftover 0..3 elements are summed one at a time */
    for(; idx < num_points; idx++){
        cVector[idx] = aVector[idx] + bVector[idx];
    }
}
|
||||
#endif /* LV_HAVE_SSE */
|
||||
|
||||
#ifdef LV_HAVE_GENERIC
|
||||
/*!
  \brief Element-wise sum of two float vectors (portable C)
  \param cVector Output buffer; cVector[k] = aVector[k] + bVector[k]
  \param aVector First addend vector
  \param bVector Second addend vector
  \param num_points Number of elements read from aVector and bVector and written to cVector
*/
static inline void volk_gnsssdr_32f_x2_add_32f_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
    unsigned int idx;

    for(idx = 0; idx < num_points; idx++){
        cVector[idx] = aVector[idx] + bVector[idx];
    }
}
|
||||
#endif /* LV_HAVE_GENERIC */
|
||||
|
||||
#endif /* INCLUDED_volk_gnsssdr_32f_x2_add_32f_u_H */
|
||||
#ifndef INCLUDED_volk_gnsssdr_32f_x2_add_32f_a_H
|
||||
#define INCLUDED_volk_gnsssdr_32f_x2_add_32f_a_H
|
||||
|
||||
#include <inttypes.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#ifdef LV_HAVE_SSE
|
||||
#include <xmmintrin.h>
|
||||
/*!
  \brief Element-wise sum of two float vectors using aligned SSE loads and stores
  \param cVector Output buffer; cVector[k] = aVector[k] + bVector[k]. Written with aligned 128-bit stores, so it must be 16-byte aligned
  \param aVector First addend vector; must be 16-byte aligned
  \param bVector Second addend vector; must be 16-byte aligned
  \param num_points Number of elements read from aVector and bVector and written to cVector
*/
static inline void volk_gnsssdr_32f_x2_add_32f_a_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
    /* Largest multiple of four not exceeding num_points */
    const unsigned int vectorizedLen = (num_points / 4) * 4;
    unsigned int idx;

    for(idx = 0; idx < vectorizedLen; idx += 4){
        const __m128 lhs = _mm_load_ps(aVector + idx);
        const __m128 rhs = _mm_load_ps(bVector + idx);
        _mm_store_ps(cVector + idx, _mm_add_ps(lhs, rhs));
    }

    /* Leftover 0..3 elements are summed one at a time */
    for(; idx < num_points; idx++){
        cVector[idx] = aVector[idx] + bVector[idx];
    }
}
|
||||
#endif /* LV_HAVE_SSE */
|
||||
|
||||
#ifdef LV_HAVE_GENERIC
|
||||
/*!
  \brief Element-wise sum of two float vectors (portable C, aligned-dispatch variant)
  \param cVector Output buffer; cVector[k] = aVector[k] + bVector[k]
  \param aVector First addend vector
  \param bVector Second addend vector
  \param num_points Number of elements read from aVector and bVector and written to cVector
*/
static inline void volk_gnsssdr_32f_x2_add_32f_a_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
    unsigned int idx;

    for(idx = 0; idx < num_points; idx++){
        cVector[idx] = aVector[idx] + bVector[idx];
    }
}
|
||||
#endif /* LV_HAVE_GENERIC */
|
||||
|
||||
#ifdef LV_HAVE_ORC
|
||||
/*!
  \brief Element-wise sum of two float vectors, delegated to the externally provided ORC implementation
  \param cVector Output buffer handed to the ORC routine
  \param aVector First addend vector
  \param bVector Second addend vector
  \param num_points Number of elements to add
*/
extern void volk_gnsssdr_32f_x2_add_32f_a_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);

static inline void volk_gnsssdr_32f_x2_add_32f_u_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points)
{
    /* Pure forwarding wrapper: all arguments are passed through unchanged */
    volk_gnsssdr_32f_x2_add_32f_a_orc_impl(cVector, aVector, bVector, num_points);
}
|
||||
#endif /* LV_HAVE_ORC */
|
||||
|
||||
|
||||
#endif /* INCLUDED_volk_gnsssdr_32f_x2_add_32f_a_H */
|
@ -1,127 +0,0 @@
|
||||
#ifndef INCLUDED_volk_gnsssdr_32fc_conjugate_32fc_u_H
|
||||
#define INCLUDED_volk_gnsssdr_32fc_conjugate_32fc_u_H
|
||||
|
||||
#include <inttypes.h>
|
||||
#include <stdio.h>
|
||||
#include <volk_gnsssdr/volk_gnsssdr_complex.h>
|
||||
#include <float.h>
|
||||
|
||||
#ifdef LV_HAVE_SSE3
|
||||
#include <pmmintrin.h>
|
||||
/*!
|
||||
\brief Takes the conjugate of a complex vector.
|
||||
\param cVector The vector where the results will be stored
|
||||
\param aVector Vector to be conjugated
|
||||
\param num_points The number of complex values in aVector to be conjugated and stored into cVector
|
||||
*/
|
||||
static inline void volk_gnsssdr_32fc_conjugate_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){
|
||||
unsigned int number = 0;
|
||||
const unsigned int halfPoints = num_points / 2;
|
||||
|
||||
__m128 x;
|
||||
lv_32fc_t* c = cVector;
|
||||
const lv_32fc_t* a = aVector;
|
||||
|
||||
__m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f);
|
||||
|
||||
for(;number < halfPoints; number++){
|
||||
|
||||
x = _mm_loadu_ps((float*)a); // Load the complex data as ar,ai,br,bi
|
||||
|
||||
x = _mm_xor_ps(x, conjugator); // conjugate register
|
||||
|
||||
_mm_storeu_ps((float*)c,x); // Store the results back into the C container
|
||||
|
||||
a += 2;
|
||||
c += 2;
|
||||
}
|
||||
|
||||
if((num_points % 2) != 0) {
|
||||
*c = lv_conj(*a);
|
||||
}
|
||||
}
|
||||
#endif /* LV_HAVE_SSE3 */
|
||||
|
||||
#ifdef LV_HAVE_GENERIC
|
||||
/*!
|
||||
\brief Takes the conjugate of a complex vector.
|
||||
\param cVector The vector where the results will be stored
|
||||
\param aVector Vector to be conjugated
|
||||
\param num_points The number of complex values in aVector to be conjugated and stored into cVector
|
||||
*/
|
||||
static inline void volk_gnsssdr_32fc_conjugate_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){
|
||||
lv_32fc_t* cPtr = cVector;
|
||||
const lv_32fc_t* aPtr = aVector;
|
||||
unsigned int number = 0;
|
||||
|
||||
for(number = 0; number < num_points; number++){
|
||||
*cPtr++ = lv_conj(*aPtr++);
|
||||
}
|
||||
}
|
||||
#endif /* LV_HAVE_GENERIC */
|
||||
|
||||
|
||||
#endif /* INCLUDED_volk_gnsssdr_32fc_conjugate_32fc_u_H */
|
||||
#ifndef INCLUDED_volk_gnsssdr_32fc_conjugate_32fc_a_H
|
||||
#define INCLUDED_volk_gnsssdr_32fc_conjugate_32fc_a_H
|
||||
|
||||
#include <inttypes.h>
|
||||
#include <stdio.h>
|
||||
#include <volk_gnsssdr/volk_gnsssdr_complex.h>
|
||||
#include <float.h>
|
||||
|
||||
#ifdef LV_HAVE_SSE3
|
||||
#include <pmmintrin.h>
|
||||
/*!
|
||||
\brief Takes the conjugate of a complex vector.
|
||||
\param cVector The vector where the results will be stored
|
||||
\param aVector Vector to be conjugated
|
||||
\param num_points The number of complex values in aVector to be conjugated and stored into cVector
|
||||
*/
|
||||
static inline void volk_gnsssdr_32fc_conjugate_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){
|
||||
unsigned int number = 0;
|
||||
const unsigned int halfPoints = num_points / 2;
|
||||
|
||||
__m128 x;
|
||||
lv_32fc_t* c = cVector;
|
||||
const lv_32fc_t* a = aVector;
|
||||
|
||||
__m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f);
|
||||
|
||||
for(;number < halfPoints; number++){
|
||||
|
||||
x = _mm_load_ps((float*)a); // Load the complex data as ar,ai,br,bi
|
||||
|
||||
x = _mm_xor_ps(x, conjugator); // conjugate register
|
||||
|
||||
_mm_store_ps((float*)c,x); // Store the results back into the C container
|
||||
|
||||
a += 2;
|
||||
c += 2;
|
||||
}
|
||||
|
||||
if((num_points % 2) != 0) {
|
||||
*c = lv_conj(*a);
|
||||
}
|
||||
}
|
||||
#endif /* LV_HAVE_SSE3 */
|
||||
|
||||
#ifdef LV_HAVE_GENERIC
|
||||
/*!
|
||||
\brief Takes the conjugate of a complex vector.
|
||||
\param cVector The vector where the results will be stored
|
||||
\param aVector Vector to be conjugated
|
||||
\param num_points The number of complex values in aVector to be conjugated and stored into cVector
|
||||
*/
|
||||
static inline void volk_gnsssdr_32fc_conjugate_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){
|
||||
lv_32fc_t* cPtr = cVector;
|
||||
const lv_32fc_t* aPtr = aVector;
|
||||
unsigned int number = 0;
|
||||
|
||||
for(number = 0; number < num_points; number++){
|
||||
*cPtr++ = lv_conj(*aPtr++);
|
||||
}
|
||||
}
|
||||
#endif /* LV_HAVE_GENERIC */
|
||||
|
||||
#endif /* INCLUDED_volk_gnsssdr_32fc_conjugate_32fc_a_H */
|
@ -1,228 +0,0 @@
|
||||
#ifndef INCLUDED_volk_gnsssdr_32fc_magnitude_squared_32f_u_H
|
||||
#define INCLUDED_volk_gnsssdr_32fc_magnitude_squared_32f_u_H
|
||||
|
||||
#include <inttypes.h>
|
||||
#include <stdio.h>
|
||||
#include <math.h>
|
||||
|
||||
#ifdef LV_HAVE_SSE3
|
||||
#include <pmmintrin.h>
|
||||
/*!
|
||||
\brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector
|
||||
\param complexVector The vector containing the complex input values
|
||||
\param magnitudeVector The vector containing the real output values
|
||||
\param num_points The number of complex values in complexVector to be calculated and stored into cVector
|
||||
*/
|
||||
static inline void volk_gnsssdr_32fc_magnitude_squared_32f_u_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
|
||||
unsigned int number = 0;
|
||||
const unsigned int quarterPoints = num_points / 4;
|
||||
|
||||
const float* complexVectorPtr = (float*)complexVector;
|
||||
float* magnitudeVectorPtr = magnitudeVector;
|
||||
|
||||
__m128 cplxValue1, cplxValue2, result;
|
||||
for(;number < quarterPoints; number++){
|
||||
cplxValue1 = _mm_loadu_ps(complexVectorPtr);
|
||||
complexVectorPtr += 4;
|
||||
|
||||
cplxValue2 = _mm_loadu_ps(complexVectorPtr);
|
||||
complexVectorPtr += 4;
|
||||
|
||||
cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
|
||||
cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
|
||||
|
||||
result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
|
||||
|
||||
_mm_storeu_ps(magnitudeVectorPtr, result);
|
||||
magnitudeVectorPtr += 4;
|
||||
}
|
||||
|
||||
number = quarterPoints * 4;
|
||||
for(; number < num_points; number++){
|
||||
float val1Real = *complexVectorPtr++;
|
||||
float val1Imag = *complexVectorPtr++;
|
||||
*magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
|
||||
}
|
||||
}
|
||||
#endif /* LV_HAVE_SSE3 */
|
||||
|
||||
#ifdef LV_HAVE_SSE
|
||||
#include <xmmintrin.h>
|
||||
/*!
|
||||
\brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector
|
||||
\param complexVector The vector containing the complex input values
|
||||
\param magnitudeVector The vector containing the real output values
|
||||
\param num_points The number of complex values in complexVector to be calculated and stored into cVector
|
||||
*/
|
||||
static inline void volk_gnsssdr_32fc_magnitude_squared_32f_u_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
|
||||
unsigned int number = 0;
|
||||
const unsigned int quarterPoints = num_points / 4;
|
||||
|
||||
const float* complexVectorPtr = (float*)complexVector;
|
||||
float* magnitudeVectorPtr = magnitudeVector;
|
||||
|
||||
__m128 cplxValue1, cplxValue2, iValue, qValue, result;
|
||||
for(;number < quarterPoints; number++){
|
||||
cplxValue1 = _mm_loadu_ps(complexVectorPtr);
|
||||
complexVectorPtr += 4;
|
||||
|
||||
cplxValue2 = _mm_loadu_ps(complexVectorPtr);
|
||||
complexVectorPtr += 4;
|
||||
|
||||
// Arrange in i1i2i3i4 format
|
||||
iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
|
||||
// Arrange in q1q2q3q4 format
|
||||
qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
|
||||
|
||||
iValue = _mm_mul_ps(iValue, iValue); // Square the I values
|
||||
qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values
|
||||
|
||||
result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values
|
||||
|
||||
_mm_storeu_ps(magnitudeVectorPtr, result);
|
||||
magnitudeVectorPtr += 4;
|
||||
}
|
||||
|
||||
number = quarterPoints * 4;
|
||||
for(; number < num_points; number++){
|
||||
float val1Real = *complexVectorPtr++;
|
||||
float val1Imag = *complexVectorPtr++;
|
||||
*magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
|
||||
}
|
||||
}
|
||||
#endif /* LV_HAVE_SSE */
|
||||
|
||||
#ifdef LV_HAVE_GENERIC
|
||||
/*!
|
||||
\brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector
|
||||
\param complexVector The vector containing the complex input values
|
||||
\param magnitudeVector The vector containing the real output values
|
||||
\param num_points The number of complex values in complexVector to be calculated and stored into cVector
|
||||
*/
|
||||
static inline void volk_gnsssdr_32fc_magnitude_squared_32f_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
|
||||
const float* complexVectorPtr = (float*)complexVector;
|
||||
float* magnitudeVectorPtr = magnitudeVector;
|
||||
unsigned int number = 0;
|
||||
for(number = 0; number < num_points; number++){
|
||||
const float real = *complexVectorPtr++;
|
||||
const float imag = *complexVectorPtr++;
|
||||
*magnitudeVectorPtr++ = (real*real) + (imag*imag);
|
||||
}
|
||||
}
|
||||
#endif /* LV_HAVE_GENERIC */
|
||||
|
||||
#endif /* INCLUDED_volk_gnsssdr_32fc_magnitude_squared_32f_u_H */
|
||||
#ifndef INCLUDED_volk_gnsssdr_32fc_magnitude_squared_32f_a_H
|
||||
#define INCLUDED_volk_gnsssdr_32fc_magnitude_squared_32f_a_H
|
||||
|
||||
#include <inttypes.h>
|
||||
#include <stdio.h>
|
||||
#include <math.h>
|
||||
|
||||
#ifdef LV_HAVE_SSE3
|
||||
#include <pmmintrin.h>
|
||||
/*!
|
||||
\brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector
|
||||
\param complexVector The vector containing the complex input values
|
||||
\param magnitudeVector The vector containing the real output values
|
||||
\param num_points The number of complex values in complexVector to be calculated and stored into cVector
|
||||
*/
|
||||
static inline void volk_gnsssdr_32fc_magnitude_squared_32f_a_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
|
||||
unsigned int number = 0;
|
||||
const unsigned int quarterPoints = num_points / 4;
|
||||
|
||||
const float* complexVectorPtr = (float*)complexVector;
|
||||
float* magnitudeVectorPtr = magnitudeVector;
|
||||
|
||||
__m128 cplxValue1, cplxValue2, result;
|
||||
for(;number < quarterPoints; number++){
|
||||
cplxValue1 = _mm_load_ps(complexVectorPtr);
|
||||
complexVectorPtr += 4;
|
||||
|
||||
cplxValue2 = _mm_load_ps(complexVectorPtr);
|
||||
complexVectorPtr += 4;
|
||||
|
||||
cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
|
||||
cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
|
||||
|
||||
result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
|
||||
|
||||
_mm_store_ps(magnitudeVectorPtr, result);
|
||||
magnitudeVectorPtr += 4;
|
||||
}
|
||||
|
||||
number = quarterPoints * 4;
|
||||
for(; number < num_points; number++){
|
||||
float val1Real = *complexVectorPtr++;
|
||||
float val1Imag = *complexVectorPtr++;
|
||||
*magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
|
||||
}
|
||||
}
|
||||
#endif /* LV_HAVE_SSE3 */
|
||||
|
||||
#ifdef LV_HAVE_SSE
|
||||
#include <xmmintrin.h>
|
||||
/*!
|
||||
\brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector
|
||||
\param complexVector The vector containing the complex input values
|
||||
\param magnitudeVector The vector containing the real output values
|
||||
\param num_points The number of complex values in complexVector to be calculated and stored into cVector
|
||||
*/
|
||||
static inline void volk_gnsssdr_32fc_magnitude_squared_32f_a_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
|
||||
unsigned int number = 0;
|
||||
const unsigned int quarterPoints = num_points / 4;
|
||||
|
||||
const float* complexVectorPtr = (float*)complexVector;
|
||||
float* magnitudeVectorPtr = magnitudeVector;
|
||||
|
||||
__m128 cplxValue1, cplxValue2, iValue, qValue, result;
|
||||
for(;number < quarterPoints; number++){
|
||||
cplxValue1 = _mm_load_ps(complexVectorPtr);
|
||||
complexVectorPtr += 4;
|
||||
|
||||
cplxValue2 = _mm_load_ps(complexVectorPtr);
|
||||
complexVectorPtr += 4;
|
||||
|
||||
// Arrange in i1i2i3i4 format
|
||||
iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
|
||||
// Arrange in q1q2q3q4 format
|
||||
qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
|
||||
|
||||
iValue = _mm_mul_ps(iValue, iValue); // Square the I values
|
||||
qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values
|
||||
|
||||
result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values
|
||||
|
||||
_mm_store_ps(magnitudeVectorPtr, result);
|
||||
magnitudeVectorPtr += 4;
|
||||
}
|
||||
|
||||
number = quarterPoints * 4;
|
||||
for(; number < num_points; number++){
|
||||
float val1Real = *complexVectorPtr++;
|
||||
float val1Imag = *complexVectorPtr++;
|
||||
*magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
|
||||
}
|
||||
}
|
||||
#endif /* LV_HAVE_SSE */
|
||||
|
||||
#ifdef LV_HAVE_GENERIC
|
||||
/*!
|
||||
\brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector
|
||||
\param complexVector The vector containing the complex input values
|
||||
\param magnitudeVector The vector containing the real output values
|
||||
\param num_points The number of complex values in complexVector to be calculated and stored into cVector
|
||||
*/
|
||||
static inline void volk_gnsssdr_32fc_magnitude_squared_32f_a_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
|
||||
const float* complexVectorPtr = (float*)complexVector;
|
||||
float* magnitudeVectorPtr = magnitudeVector;
|
||||
unsigned int number = 0;
|
||||
for(number = 0; number < num_points; number++){
|
||||
const float real = *complexVectorPtr++;
|
||||
const float imag = *complexVectorPtr++;
|
||||
*magnitudeVectorPtr++ = (real*real) + (imag*imag);
|
||||
}
|
||||
}
|
||||
#endif /* LV_HAVE_GENERIC */
|
||||
|
||||
#endif /* INCLUDED_volk_gnsssdr_32fc_magnitude_squared_32f_a_H */
|
@ -99,7 +99,7 @@ static inline void volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_u_sse4_1(lv_
|
||||
if (num_points%4!=0)
|
||||
{
|
||||
__VOLK_ATTR_ALIGNED(16) float tcode_half_chips_stored[4];
|
||||
_mm_storeu_si128 ((__m128i*)tcode_half_chips_stored, tcode_half_chips_array);
|
||||
_mm_storeu_ps ((float*)tcode_half_chips_stored, tcode_half_chips_array);
|
||||
|
||||
int associated_chip_index;
|
||||
float tcode_half_chips = tcode_half_chips_stored[0];
|
||||
@ -215,7 +215,7 @@ static inline void volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_a_sse4_1(lv_
|
||||
if (num_points%4!=0)
|
||||
{
|
||||
__VOLK_ATTR_ALIGNED(16) float tcode_half_chips_stored[4];
|
||||
_mm_store_si128 ((__m128i*)tcode_half_chips_stored, tcode_half_chips_array);
|
||||
_mm_storeu_ps ((float*)tcode_half_chips_stored, tcode_half_chips_array);
|
||||
|
||||
int associated_chip_index;
|
||||
float tcode_half_chips = tcode_half_chips_stored[0];
|
||||
|
@ -1,178 +0,0 @@
|
||||
#ifndef INCLUDED_volk_gnsssdr_32fc_s32fc_multiply_32fc_u_H
|
||||
#define INCLUDED_volk_gnsssdr_32fc_s32fc_multiply_32fc_u_H
|
||||
|
||||
#include <inttypes.h>
|
||||
#include <stdio.h>
|
||||
#include <volk_gnsssdr/volk_gnsssdr_complex.h>
|
||||
#include <float.h>
|
||||
|
||||
#ifdef LV_HAVE_SSE3
|
||||
#include <pmmintrin.h>
|
||||
/*!
|
||||
\brief Multiplies the input vector by a scalar and stores the results in the third vector
|
||||
\param cVector The vector where the results will be stored
|
||||
\param aVector The vector to be multiplied
|
||||
\param scalar The complex scalar to multiply aVector
|
||||
\param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
|
||||
*/
|
||||
static inline void volk_gnsssdr_32fc_s32fc_multiply_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
|
||||
unsigned int number = 0;
|
||||
const unsigned int halfPoints = num_points / 2;
|
||||
|
||||
__m128 x, yl, yh, z, tmp1, tmp2;
|
||||
lv_32fc_t* c = cVector;
|
||||
const lv_32fc_t* a = aVector;
|
||||
|
||||
// Set up constant scalar vector
|
||||
yl = _mm_set_ps1(lv_creal(scalar));
|
||||
yh = _mm_set_ps1(lv_cimag(scalar));
|
||||
|
||||
for(;number < halfPoints; number++){
|
||||
|
||||
x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
|
||||
|
||||
tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
||||
|
||||
x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
|
||||
|
||||
tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
|
||||
|
||||
z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
|
||||
|
||||
_mm_storeu_ps((float*)c,z); // Store the results back into the C container
|
||||
|
||||
a += 2;
|
||||
c += 2;
|
||||
}
|
||||
|
||||
if((num_points % 2) != 0) {
|
||||
*c = (*a) * scalar;
|
||||
}
|
||||
}
|
||||
#endif /* LV_HAVE_SSE3 */
|
||||
|
||||
#ifdef LV_HAVE_GENERIC
|
||||
/*!
|
||||
\brief Multiplies the input vector by a scalar and stores the results in the third vector
|
||||
\param cVector The vector where the results will be stored
|
||||
\param aVector The vector to be multiplied
|
||||
\param scalar The complex scalar to multiply aVector
|
||||
\param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
|
||||
*/
|
||||
static inline void volk_gnsssdr_32fc_s32fc_multiply_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
|
||||
lv_32fc_t* cPtr = cVector;
|
||||
const lv_32fc_t* aPtr = aVector;
|
||||
unsigned int number = num_points;
|
||||
|
||||
// unwrap loop
|
||||
while (number >= 8){
|
||||
*cPtr++ = (*aPtr++) * scalar;
|
||||
*cPtr++ = (*aPtr++) * scalar;
|
||||
*cPtr++ = (*aPtr++) * scalar;
|
||||
*cPtr++ = (*aPtr++) * scalar;
|
||||
*cPtr++ = (*aPtr++) * scalar;
|
||||
*cPtr++ = (*aPtr++) * scalar;
|
||||
*cPtr++ = (*aPtr++) * scalar;
|
||||
*cPtr++ = (*aPtr++) * scalar;
|
||||
number -= 8;
|
||||
}
|
||||
|
||||
// clean up any remaining
|
||||
while (number-- > 0)
|
||||
*cPtr++ = *aPtr++ * scalar;
|
||||
}
|
||||
#endif /* LV_HAVE_GENERIC */
|
||||
|
||||
|
||||
#endif /* INCLUDED_volk_gnsssdr_32fc_s32fc_multiply_32fc_u_H */
|
||||
#ifndef INCLUDED_volk_gnsssdr_32fc_s32fc_multiply_32fc_a_H
|
||||
#define INCLUDED_volk_gnsssdr_32fc_s32fc_multiply_32fc_a_H
|
||||
|
||||
#include <inttypes.h>
|
||||
#include <stdio.h>
|
||||
#include <volk_gnsssdr/volk_gnsssdr_complex.h>
|
||||
#include <float.h>
|
||||
|
||||
#ifdef LV_HAVE_SSE3
|
||||
#include <pmmintrin.h>
|
||||
/*!
|
||||
\brief Multiplies the two input complex vectors and stores their results in the third vector
|
||||
\param cVector The vector where the results will be stored
|
||||
\param aVector One of the vectors to be multiplied
|
||||
\param bVector One of the vectors to be multiplied
|
||||
\param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
|
||||
*/
|
||||
static inline void volk_gnsssdr_32fc_s32fc_multiply_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
|
||||
unsigned int number = 0;
|
||||
const unsigned int halfPoints = num_points / 2;
|
||||
|
||||
__m128 x, yl, yh, z, tmp1, tmp2;
|
||||
lv_32fc_t* c = cVector;
|
||||
const lv_32fc_t* a = aVector;
|
||||
|
||||
// Set up constant scalar vector
|
||||
yl = _mm_set_ps1(lv_creal(scalar));
|
||||
yh = _mm_set_ps1(lv_cimag(scalar));
|
||||
|
||||
for(;number < halfPoints; number++){
|
||||
|
||||
x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
|
||||
|
||||
tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
||||
|
||||
x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
|
||||
|
||||
tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
|
||||
|
||||
z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
|
||||
|
||||
_mm_store_ps((float*)c,z); // Store the results back into the C container
|
||||
|
||||
a += 2;
|
||||
c += 2;
|
||||
}
|
||||
|
||||
if((num_points % 2) != 0) {
|
||||
*c = (*a) * scalar;
|
||||
}
|
||||
}
|
||||
#endif /* LV_HAVE_SSE3 */
|
||||
|
||||
|
||||
#ifdef LV_HAVE_GENERIC
|
||||
/*!
|
||||
\brief Multiplies the two input complex vectors and stores their results in the third vector
|
||||
\param cVector The vector where the results will be stored
|
||||
\param aVector One of the vectors to be multiplied
|
||||
\param bVector One of the vectors to be multiplied
|
||||
\param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
|
||||
*/
|
||||
static inline void volk_gnsssdr_32fc_s32fc_multiply_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
|
||||
lv_32fc_t* cPtr = cVector;
|
||||
const lv_32fc_t* aPtr = aVector;
|
||||
unsigned int number = num_points;
|
||||
|
||||
// unwrap loop
|
||||
while (number >= 8){
|
||||
*cPtr++ = (*aPtr++) * scalar;
|
||||
*cPtr++ = (*aPtr++) * scalar;
|
||||
*cPtr++ = (*aPtr++) * scalar;
|
||||
*cPtr++ = (*aPtr++) * scalar;
|
||||
*cPtr++ = (*aPtr++) * scalar;
|
||||
*cPtr++ = (*aPtr++) * scalar;
|
||||
*cPtr++ = (*aPtr++) * scalar;
|
||||
*cPtr++ = (*aPtr++) * scalar;
|
||||
number -= 8;
|
||||
}
|
||||
|
||||
// clean up any remaining
|
||||
while (number-- > 0)
|
||||
*cPtr++ = *aPtr++ * scalar;
|
||||
}
|
||||
#endif /* LV_HAVE_GENERIC */
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
#endif /* INCLUDED_volk_gnsssdr_32fc_s32fc_multiply_32fc_a_H */
|
@ -1,759 +0,0 @@
|
||||
#ifndef INCLUDED_volk_gnsssdr_32fc_x2_dot_prod_32fc_u_H
|
||||
#define INCLUDED_volk_gnsssdr_32fc_x2_dot_prod_32fc_u_H
|
||||
|
||||
#include <volk_gnsssdr/volk_gnsssdr_common.h>
|
||||
#include <volk_gnsssdr/volk_gnsssdr_complex.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
|
||||
#ifdef LV_HAVE_GENERIC
|
||||
|
||||
|
||||
/*!
  \brief Accumulates the sum of input[k] * taps[k] over all samples (no conjugation) and stores it in result (portable C)
  \param result Single complex output value; overwritten, its previous content is not read
  \param input First complex vector, accessed as interleaved real/imaginary floats
  \param taps Second complex vector, accessed as interleaved real/imaginary floats
  \param num_points Number of complex samples in input and taps
*/
static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
    /* The float views rely on lv_32fc_t being two consecutive floats (real, imag) */
    float* res = (float*) result;
    const float* in = (const float*) input;
    const float* tp = (const float*) taps;
    const unsigned int n_2_ccomplex_blocks = num_points / 2;
    const unsigned int isodd = num_points & 1;

    /* Two independent accumulators, one for each sample of a pair; index 0 is real, 1 is imaginary */
    float sum0[2] = {0,0};
    float sum1[2] = {0,0};
    unsigned int i = 0;

    for(i = 0; i < n_2_ccomplex_blocks; ++i) {
        sum0[0] += in[0] * tp[0] - in[1] * tp[1];
        sum0[1] += in[0] * tp[1] + in[1] * tp[0];
        sum1[0] += in[2] * tp[2] - in[3] * tp[3];
        sum1[1] += in[2] * tp[3] + in[3] * tp[2];

        in += 4;
        tp += 4;
    }

    res[0] = sum0[0] + sum1[0];
    res[1] = sum0[1] + sum1[1];

    /* Odd sample count: fold in the final product, which the paired loop did not reach */
    if(isodd) {
        *result += input[num_points - 1] * taps[num_points - 1];
    }
}
|
||||
|
||||
#endif /*LV_HAVE_GENERIC*/
|
||||
|
||||
|
||||
|
||||
#if LV_HAVE_SSE && LV_HAVE_64
|
||||
|
||||
/*!
  \brief Complex dot product of two vectors (x86-64 SSE inline assembly, unaligned loads)
  \param result Where the single complex result, sum(input[i] * taps[i]), is written
  \param input First complex vector
  \param taps Second complex vector
  \param num_points Number of complex values in input and taps

  The assembly handles the points in blocks of two complex values (16 bytes);
  a trailing odd point is added in C afterwards.

  NOTE: the assembly preloads the next 16 bytes of input and taps before it
  knows whether they will be used, so it reads up to 16 bytes past the last
  complete block (and reads the first 16 bytes even when num_points is 0).
*/
static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_u_sse_64(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {

    /* The assembly copies the full 64-bit %rcx, so the byte count must be a
       64-bit operand; widening before the multiply also avoids wrap-around
       for very large num_points. */
    const size_t num_bytes = (size_t)num_points * 8;
    unsigned int isodd = num_points & 1;

    asm volatile
    (
        "# ccomplex_dotprod_generic (float* result, const float *input,\n\t"
        "# const float *taps, unsigned num_bytes)\n\t"
        "# float sum0 = 0;\n\t"
        "# float sum1 = 0;\n\t"
        "# float sum2 = 0;\n\t"
        "# float sum3 = 0;\n\t"
        "# do {\n\t"
        "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t"
        "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t"
        "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t"
        "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t"
        "# input += 4;\n\t"
        "# taps += 4; \n\t"
        "# } while (--n_2_ccomplex_blocks != 0);\n\t"
        "# result[0] = sum0 + sum2;\n\t"
        "# result[1] = sum1 + sum3;\n\t"
        "# TODO: prefetch and better scheduling\n\t"
        " xor %%r9, %%r9\n\t"
        " xor %%r10, %%r10\n\t"
        " movq %%rcx, %%rax\n\t"
        " movq %%rcx, %%r8\n\t"
        " movq %[rsi], %%r9\n\t"
        " movq %[rdx], %%r10\n\t"
        " xorps %%xmm6, %%xmm6 # zero accumulators\n\t"
        " movups 0(%%r9), %%xmm0\n\t"
        " xorps %%xmm7, %%xmm7 # zero accumulators\n\t"
        " movups 0(%%r10), %%xmm2\n\t"
        " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t"
        " shr $4, %%r8\n\t"
        " jmp .%=L1_test\n\t"
        " # 4 taps / loop\n\t"
        " # something like ?? cycles / loop\n\t"
        ".%=Loop1: \n\t"
        "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
        "# movups (%%r9), %%xmmA\n\t"
        "# movups (%%r10), %%xmmB\n\t"
        "# movups %%xmmA, %%xmmZ\n\t"
        "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t"
        "# mulps %%xmmB, %%xmmA\n\t"
        "# mulps %%xmmZ, %%xmmB\n\t"
        "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
        "# xorps %%xmmPN, %%xmmA\n\t"
        "# movups %%xmmA, %%xmmZ\n\t"
        "# unpcklps %%xmmB, %%xmmA\n\t"
        "# unpckhps %%xmmB, %%xmmZ\n\t"
        "# movups %%xmmZ, %%xmmY\n\t"
        "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t"
        "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t"
        "# addps %%xmmZ, %%xmmA\n\t"
        "# addps %%xmmA, %%xmmC\n\t"
        "# A=xmm0, B=xmm2, Z=xmm4\n\t"
        "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
        " movups 16(%%r9), %%xmm1\n\t"
        " movups %%xmm0, %%xmm4\n\t"
        " mulps %%xmm2, %%xmm0\n\t"
        " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
        " movups 16(%%r10), %%xmm3\n\t"
        " movups %%xmm1, %%xmm5\n\t"
        " addps %%xmm0, %%xmm6\n\t"
        " mulps %%xmm3, %%xmm1\n\t"
        " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t"
        " addps %%xmm1, %%xmm6\n\t"
        " mulps %%xmm4, %%xmm2\n\t"
        " movups 32(%%r9), %%xmm0\n\t"
        " addps %%xmm2, %%xmm7\n\t"
        " mulps %%xmm5, %%xmm3\n\t"
        " add $32, %%r9\n\t"
        " movups 32(%%r10), %%xmm2\n\t"
        " addps %%xmm3, %%xmm7\n\t"
        " add $32, %%r10\n\t"
        ".%=L1_test:\n\t"
        " dec %%rax\n\t"
        " jge .%=Loop1\n\t"
        " # We've handled the bulk of multiplies up to here.\n\t"
        " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
        " # If so, we've got 2 more taps to do.\n\t"
        " and $1, %%r8\n\t"
        " je .%=Leven\n\t"
        " # The count was odd, do 2 more taps.\n\t"
        " # Note that we've already got mm0/mm2 preloaded\n\t"
        " # from the main loop.\n\t"
        " movups %%xmm0, %%xmm4\n\t"
        " mulps %%xmm2, %%xmm0\n\t"
        " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
        " addps %%xmm0, %%xmm6\n\t"
        " mulps %%xmm4, %%xmm2\n\t"
        " addps %%xmm2, %%xmm7\n\t"
        ".%=Leven:\n\t"
        " # neg inversor\n\t"
        " xorps %%xmm1, %%xmm1\n\t"
        " mov $0x80000000, %%r9\n\t"
        " movd %%r9, %%xmm1\n\t"
        " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t"
        " # pfpnacc\n\t"
        " xorps %%xmm1, %%xmm6\n\t"
        " movups %%xmm6, %%xmm2\n\t"
        " unpcklps %%xmm7, %%xmm6\n\t"
        " unpckhps %%xmm7, %%xmm2\n\t"
        " movups %%xmm2, %%xmm3\n\t"
        " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t"
        " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t"
        " addps %%xmm2, %%xmm6\n\t"
        " # xmm6 = r1 i2 r3 i4\n\t"
        " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t"
        " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
        " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) to memory\n\t"
        :
        :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result)
        /* Every register the template writes is listed, plus "memory" because
           the template stores to *result, which is read again just below, and
           "cc" because xor/shr/dec/and/add modify the flags. */
        :"rax", "r8", "r9", "r10",
         "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
         "memory", "cc"
    );

    /* The assembly only covers whole pairs; add the trailing odd point here. */
    if(isodd) {
        *result += input[num_points - 1] * taps[num_points - 1];
    }

    return;
}
|
||||
|
||||
#endif /* LV_HAVE_SSE && LV_HAVE_64 */
|
||||
|
||||
|
||||
|
||||
|
||||
#ifdef LV_HAVE_SSE3
|
||||
#include <pmmintrin.h>
|
||||
|
||||
/*!
  \brief Complex dot product of two vectors (SSE3 intrinsics, unaligned loads)
  \param result Where the single complex result, sum(input[i] * taps[i]), is written
  \param input First complex vector
  \param taps Second complex vector
  \param num_points Number of complex values in input and taps
*/
static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_u_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {

    const unsigned int pair_count = num_points/2;
    const lv_32fc_t* in_ptr = input;
    const lv_32fc_t* tap_ptr = taps;
    unsigned int pair;

    /* Holds two running complex sums side by side: re0, im0, re1, im1. */
    __m128 acc = _mm_setzero_ps();

    __VOLK_ATTR_ALIGNED(16) lv_32fc_t lanes[2];

    lv_32fc_t total;
    memset(&total, 0x0, 2*sizeof(float));

    for(pair = 0; pair < pair_count; ++pair){
        const __m128 in_vec = _mm_loadu_ps((float*)in_ptr);   // ar,ai,br,bi
        const __m128 tap_vec = _mm_loadu_ps((float*)tap_ptr); // cr,ci,dr,di

        const __m128 tap_re = _mm_moveldup_ps(tap_vec); // cr,cr,dr,dr
        const __m128 tap_im = _mm_movehdup_ps(tap_vec); // ci,ci,di,di

        const __m128 direct = _mm_mul_ps(in_vec, tap_re); // ar*cr,ai*cr,br*dr,bi*dr

        const __m128 in_swapped = _mm_shuffle_ps(in_vec, in_vec, 0xB1); // ai,ar,bi,br

        const __m128 cross = _mm_mul_ps(in_swapped, tap_im); // ai*ci,ar*ci,bi*di,br*di

        // addsub yields ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di,
        // i.e. the two complex products, which are added onto the running sums.
        acc = _mm_add_ps(acc, _mm_addsub_ps(direct, cross));

        in_ptr += 2;
        tap_ptr += 2;
    }

    // Spill the two partial complex sums and fold them into one value.
    _mm_storeu_ps((float*)lanes, acc);

    total += ( lanes[0] + lanes[1] );

    // An odd num_points leaves one trailing point the paired loop skipped.
    if(num_points & 1) {
        total += input[num_points - 1] * taps[num_points - 1];
    }

    *result = total;
}
|
||||
|
||||
#endif /*LV_HAVE_SSE3*/
|
||||
|
||||
#ifdef LV_HAVE_SSE4_1
|
||||
#include <smmintrin.h>
|
||||
|
||||
/*!
  \brief Complex dot product of two vectors (SSE4.1 intrinsics, unaligned loads)
  \param result Where the single complex result, sum(input[i] * taps[i]), is written
  \param input First complex vector
  \param taps Second complex vector
  \param num_points Number of complex values in input and taps

  Four complex points are handled per loop iteration; the remaining
  (num_points & 3) points are added one by one at the end.
*/
static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_u_sse4_1(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {

    /* Only the lowest 32-bit lane carries a set sign bit; XOR-ing with it
       negates lane 0 of a float vector. */
    static const __m128i sign_flip = {0x000000000000000080000000};

    const unsigned int quad_count = num_points/4;
    unsigned int k;

    float* in_ptr = (float*)input;
    float* tap_ptr = (float*)taps;
    __m64* out_ptr = (__m64*)result;

    /* Partial sums: re*re and im*im land in lane 0, re*im and im*re in lane 1. */
    __m128 sum_rr = _mm_setzero_ps();
    __m128 sum_ii = _mm_setzero_ps();
    __m128 sum_ri = _mm_setzero_ps();
    __m128 sum_ir = _mm_setzero_ps();

    for(k = 0; k < quad_count; ++k) {
        /* Two loads per operand = four interleaved complex values each. */
        const __m128 in_lo = _mm_loadu_ps(in_ptr);
        const __m128 tap_lo = _mm_loadu_ps(tap_ptr);
        const __m128 in_hi = _mm_loadu_ps(in_ptr + 4);
        const __m128 tap_hi = _mm_loadu_ps(tap_ptr + 4);

        /* First de-interleave step. */
        const __m128 in_mix_hi = _mm_unpackhi_ps(in_lo, in_hi);
        const __m128 tap_mix_hi = _mm_unpackhi_ps(tap_lo, tap_hi);
        const __m128 in_mix_lo = _mm_unpacklo_ps(in_lo, in_hi);
        const __m128 tap_mix_lo = _mm_unpacklo_ps(tap_lo, tap_hi);

        /* Second step separates imaginary and real parts of the four points. */
        const __m128 in_im = _mm_unpackhi_ps(in_mix_lo, in_mix_hi);
        const __m128 in_re = _mm_unpacklo_ps(in_mix_lo, in_mix_hi);
        const __m128 tap_im = _mm_unpackhi_ps(tap_mix_lo, tap_mix_hi);
        const __m128 tap_re = _mm_unpacklo_ps(tap_mix_lo, tap_mix_hi);

        /* 4-wide dot products; mask 0xf1 writes lane 0, mask 0xf2 writes lane 1. */
        const __m128 rr = _mm_dp_ps(in_re, tap_re, 0xf1);
        const __m128 ii = _mm_dp_ps(in_im, tap_im, 0xf1);
        const __m128 ri = _mm_dp_ps(in_re, tap_im, 0xf2);
        const __m128 ir = _mm_dp_ps(in_im, tap_re, 0xf2);

        sum_rr = _mm_add_ps(rr, sum_rr);
        sum_ii = _mm_add_ps(ii, sum_ii);
        sum_ri = _mm_add_ps(ri, sum_ri);
        sum_ir = _mm_add_ps(ir, sum_ir);

        in_ptr += 8;
        tap_ptr += 8;
    }

    /* real = sum(re*re) - sum(im*im): negate the im*im sum, then add. */
    sum_ii = _mm_xor_ps(sum_ii, bit128_p(&sign_flip)->float_vec);

    sum_ri = _mm_add_ps(sum_ri, sum_ir);
    sum_rr = _mm_add_ps(sum_rr, sum_ii);

    /* Lane 0 now holds the real part, lane 1 the imaginary part. */
    sum_ri = _mm_add_ps(sum_ri, sum_rr);

    _mm_storel_pi(out_ptr, sum_ri);

    /* Points left over after the last complete group of four. */
    for(k = quad_count*4; k < num_points; k++) {
        *result += input[k] * taps[k];
    }
}
|
||||
|
||||
#endif /*LV_HAVE_SSE4_1*/
|
||||
|
||||
|
||||
|
||||
|
||||
#endif /*INCLUDED_volk_gnsssdr_32fc_x2_dot_prod_32fc_u_H*/
|
||||
#ifndef INCLUDED_volk_gnsssdr_32fc_x2_dot_prod_32fc_a_H
|
||||
#define INCLUDED_volk_gnsssdr_32fc_x2_dot_prod_32fc_a_H
|
||||
|
||||
#include <volk_gnsssdr/volk_gnsssdr_common.h>
|
||||
#include <volk_gnsssdr/volk_gnsssdr_complex.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
|
||||
#ifdef LV_HAVE_GENERIC
|
||||
|
||||
|
||||
/*!
  \brief Complex dot product of two vectors (portable C version)
  \param result Where the single complex result, sum(input[i] * taps[i]), is written
  \param input First complex vector
  \param taps Second complex vector
  \param num_points Number of complex values in input and taps
*/
static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_a_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {

    float * res = (float*) result;
    float * in = (float*) input;
    float * tp = (float*) taps;

    /* Number of two-point blocks. Computed directly from num_points: going
       through a byte count (num_points*8) wraps in unsigned int arithmetic
       once num_points reaches 2^29 and would silently drop data. */
    unsigned int n_2_ccomplex_blocks = num_points/2;
    unsigned int isodd = num_points & 1;

    /* Two independent {re, im} accumulators, one per point of each block. */
    float sum0[2] = {0,0};
    float sum1[2] = {0,0};
    unsigned int i = 0;

    for(i = 0; i < n_2_ccomplex_blocks; ++i) {
        sum0[0] += in[0] * tp[0] - in[1] * tp[1];
        sum0[1] += in[0] * tp[1] + in[1] * tp[0];
        sum1[0] += in[2] * tp[2] - in[3] * tp[3];
        sum1[1] += in[2] * tp[3] + in[3] * tp[2];

        in += 4;
        tp += 4;
    }

    res[0] = sum0[0] + sum1[0];
    res[1] = sum0[1] + sum1[1];

    // Cleanup if we had an odd number of points
    if(isodd) {
        *result += input[num_points - 1] * taps[num_points - 1];
    }
}
|
||||
|
||||
#endif /*LV_HAVE_GENERIC*/
|
||||
|
||||
|
||||
#if LV_HAVE_SSE && LV_HAVE_64
|
||||
|
||||
|
||||
/*!
  \brief Complex dot product of two vectors (x86-64 SSE inline assembly, aligned loads)
  \param result Where the single complex result, sum(input[i] * taps[i]), is written
  \param input First complex vector; read with movaps, so it must be 16-byte aligned
  \param taps Second complex vector; read with movaps, so it must be 16-byte aligned
  \param num_points Number of complex values in input and taps

  The assembly handles the points in blocks of two complex values (16 bytes);
  a trailing odd point is added in C afterwards.

  NOTE: the assembly preloads the next 16 bytes of input and taps before it
  knows whether they will be used, so it reads up to 16 bytes past the last
  complete block (and reads the first 16 bytes even when num_points is 0).
*/
static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_a_sse_64(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {

    /* The assembly copies the full 64-bit %rcx, so the byte count must be a
       64-bit operand; widening before the multiply also avoids wrap-around
       for very large num_points. */
    const size_t num_bytes = (size_t)num_points * 8;
    unsigned int isodd = num_points & 1;

    asm volatile
    (
        "# ccomplex_dotprod_generic (float* result, const float *input,\n\t"
        "# const float *taps, unsigned num_bytes)\n\t"
        "# float sum0 = 0;\n\t"
        "# float sum1 = 0;\n\t"
        "# float sum2 = 0;\n\t"
        "# float sum3 = 0;\n\t"
        "# do {\n\t"
        "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t"
        "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t"
        "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t"
        "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t"
        "# input += 4;\n\t"
        "# taps += 4; \n\t"
        "# } while (--n_2_ccomplex_blocks != 0);\n\t"
        "# result[0] = sum0 + sum2;\n\t"
        "# result[1] = sum1 + sum3;\n\t"
        "# TODO: prefetch and better scheduling\n\t"
        " xor %%r9, %%r9\n\t"
        " xor %%r10, %%r10\n\t"
        " movq %%rcx, %%rax\n\t"
        " movq %%rcx, %%r8\n\t"
        " movq %[rsi], %%r9\n\t"
        " movq %[rdx], %%r10\n\t"
        " xorps %%xmm6, %%xmm6 # zero accumulators\n\t"
        " movaps 0(%%r9), %%xmm0\n\t"
        " xorps %%xmm7, %%xmm7 # zero accumulators\n\t"
        " movaps 0(%%r10), %%xmm2\n\t"
        " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t"
        " shr $4, %%r8\n\t"
        " jmp .%=L1_test\n\t"
        " # 4 taps / loop\n\t"
        " # something like ?? cycles / loop\n\t"
        ".%=Loop1: \n\t"
        "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
        "# movaps (%%r9), %%xmmA\n\t"
        "# movaps (%%r10), %%xmmB\n\t"
        "# movaps %%xmmA, %%xmmZ\n\t"
        "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t"
        "# mulps %%xmmB, %%xmmA\n\t"
        "# mulps %%xmmZ, %%xmmB\n\t"
        "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
        "# xorps %%xmmPN, %%xmmA\n\t"
        "# movaps %%xmmA, %%xmmZ\n\t"
        "# unpcklps %%xmmB, %%xmmA\n\t"
        "# unpckhps %%xmmB, %%xmmZ\n\t"
        "# movaps %%xmmZ, %%xmmY\n\t"
        "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t"
        "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t"
        "# addps %%xmmZ, %%xmmA\n\t"
        "# addps %%xmmA, %%xmmC\n\t"
        "# A=xmm0, B=xmm2, Z=xmm4\n\t"
        "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
        " movaps 16(%%r9), %%xmm1\n\t"
        " movaps %%xmm0, %%xmm4\n\t"
        " mulps %%xmm2, %%xmm0\n\t"
        " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
        " movaps 16(%%r10), %%xmm3\n\t"
        " movaps %%xmm1, %%xmm5\n\t"
        " addps %%xmm0, %%xmm6\n\t"
        " mulps %%xmm3, %%xmm1\n\t"
        " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t"
        " addps %%xmm1, %%xmm6\n\t"
        " mulps %%xmm4, %%xmm2\n\t"
        " movaps 32(%%r9), %%xmm0\n\t"
        " addps %%xmm2, %%xmm7\n\t"
        " mulps %%xmm5, %%xmm3\n\t"
        " add $32, %%r9\n\t"
        " movaps 32(%%r10), %%xmm2\n\t"
        " addps %%xmm3, %%xmm7\n\t"
        " add $32, %%r10\n\t"
        ".%=L1_test:\n\t"
        " dec %%rax\n\t"
        " jge .%=Loop1\n\t"
        " # We've handled the bulk of multiplies up to here.\n\t"
        " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
        " # If so, we've got 2 more taps to do.\n\t"
        " and $1, %%r8\n\t"
        " je .%=Leven\n\t"
        " # The count was odd, do 2 more taps.\n\t"
        " # Note that we've already got mm0/mm2 preloaded\n\t"
        " # from the main loop.\n\t"
        " movaps %%xmm0, %%xmm4\n\t"
        " mulps %%xmm2, %%xmm0\n\t"
        " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
        " addps %%xmm0, %%xmm6\n\t"
        " mulps %%xmm4, %%xmm2\n\t"
        " addps %%xmm2, %%xmm7\n\t"
        ".%=Leven:\n\t"
        " # neg inversor\n\t"
        " xorps %%xmm1, %%xmm1\n\t"
        " mov $0x80000000, %%r9\n\t"
        " movd %%r9, %%xmm1\n\t"
        " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t"
        " # pfpnacc\n\t"
        " xorps %%xmm1, %%xmm6\n\t"
        " movaps %%xmm6, %%xmm2\n\t"
        " unpcklps %%xmm7, %%xmm6\n\t"
        " unpckhps %%xmm7, %%xmm2\n\t"
        " movaps %%xmm2, %%xmm3\n\t"
        " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t"
        " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t"
        " addps %%xmm2, %%xmm6\n\t"
        " # xmm6 = r1 i2 r3 i4\n\t"
        " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t"
        " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
        " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) to memory\n\t"
        :
        :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result)
        /* Every register the template writes is listed, plus "memory" because
           the template stores to *result, which is read again just below, and
           "cc" because xor/shr/dec/and/add modify the flags. */
        :"rax", "r8", "r9", "r10",
         "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
         "memory", "cc"
    );

    /* The assembly only covers whole pairs; add the trailing odd point here. */
    if(isodd) {
        *result += input[num_points - 1] * taps[num_points - 1];
    }

    return;
}
|
||||
|
||||
#endif
|
||||
|
||||
#if LV_HAVE_SSE && LV_HAVE_32
|
||||
|
||||
/*!
  \brief Complex dot product of two vectors (32-bit SSE entry point)
  \param result Where the single complex result, sum(input[i] * taps[i]), is written
  \param input First complex vector
  \param taps Second complex vector
  \param num_points Number of complex values in input and taps

  This entry point simply forwards to the generic kernel. A hand-written
  32-bit assembly body that used to follow the call was permanently disabled
  with "#if 0" (it fetched its arguments through fixed %ebp offsets instead of
  asm operands) and has been removed as dead code.
*/
static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_a_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {

    volk_gnsssdr_32fc_x2_dot_prod_32fc_a_generic(result, input, taps, num_points);
}
|
||||
|
||||
#endif /* LV_HAVE_SSE && LV_HAVE_32 */
|
||||
|
||||
#ifdef LV_HAVE_SSE3
|
||||
#include <pmmintrin.h>
|
||||
|
||||
/*!
  \brief Complex dot product of two vectors (SSE3 intrinsics, aligned loads)
  \param result Where the single complex result, sum(input[i] * taps[i]), is written
  \param input First complex vector; read with _mm_load_ps, so it must be 16-byte aligned
  \param taps Second complex vector; read with _mm_load_ps, so it must be 16-byte aligned
  \param num_points Number of complex values in input and taps
*/
static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_a_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {

    unsigned int isodd = num_points & 1;

    lv_32fc_t dotProduct;
    memset(&dotProduct, 0x0, 2*sizeof(float));

    unsigned int number = 0;
    /* Number of two-point blocks. Computed directly from num_points: going
       through a byte count (num_points*8) wraps in unsigned int arithmetic
       once num_points reaches 2^29 and would silently drop data. */
    const unsigned int halfPoints = num_points/2;

    __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;

    const lv_32fc_t* a = input;
    const lv_32fc_t* b = taps;

    dotProdVal = _mm_setzero_ps();

    for(;number < halfPoints; number++){

        x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
        y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di

        yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
        yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di

        tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr

        x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br

        tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di

        z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di

        dotProdVal = _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together

        a += 2;
        b += 2;
    }

    __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2];

    _mm_store_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector

    // Fold the two partial complex sums into one value
    dotProduct += ( dotProductVector[0] + dotProductVector[1] );

    // An odd num_points leaves one trailing point the paired loop skipped
    if(isodd) {
        dotProduct += input[num_points - 1] * taps[num_points - 1];
    }

    *result = dotProduct;
}
|
||||
|
||||
#endif /*LV_HAVE_SSE3*/
|
||||
|
||||
#ifdef LV_HAVE_SSE4_1
|
||||
#include <smmintrin.h>
|
||||
|
||||
/*!
  \brief Complex dot product of two vectors (SSE4.1 intrinsics, aligned loads)
  \param result Where the single complex result, sum(input[i] * taps[i]), is written
  \param input First complex vector; read with _mm_load_ps, so it must be 16-byte aligned
  \param taps Second complex vector; read with _mm_load_ps, so it must be 16-byte aligned
  \param num_points Number of complex values in input and taps

  Four complex points are handled per loop iteration; the remaining
  (num_points & 3) points are added one by one at the end.
*/
static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_a_sse4_1(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {

    /* Only the lowest 32-bit lane carries a set sign bit; XOR-ing with it
       negates lane 0 of a float vector. */
    static const __m128i sign_flip = {0x000000000000000080000000};

    const unsigned int quad_count = num_points/4;
    unsigned int k;

    float* in_ptr = (float*)input;
    float* tap_ptr = (float*)taps;
    __m64* out_ptr = (__m64*)result;

    /* Partial sums: re*re and im*im land in lane 0, re*im and im*re in lane 1. */
    __m128 sum_rr = _mm_setzero_ps();
    __m128 sum_ii = _mm_setzero_ps();
    __m128 sum_ri = _mm_setzero_ps();
    __m128 sum_ir = _mm_setzero_ps();

    for(k = 0; k < quad_count; ++k) {
        /* Two loads per operand = four interleaved complex values each. */
        const __m128 in_lo = _mm_load_ps(in_ptr);
        const __m128 tap_lo = _mm_load_ps(tap_ptr);
        const __m128 in_hi = _mm_load_ps(in_ptr + 4);
        const __m128 tap_hi = _mm_load_ps(tap_ptr + 4);

        /* First de-interleave step. */
        const __m128 in_mix_hi = _mm_unpackhi_ps(in_lo, in_hi);
        const __m128 tap_mix_hi = _mm_unpackhi_ps(tap_lo, tap_hi);
        const __m128 in_mix_lo = _mm_unpacklo_ps(in_lo, in_hi);
        const __m128 tap_mix_lo = _mm_unpacklo_ps(tap_lo, tap_hi);

        /* Second step separates imaginary and real parts of the four points. */
        const __m128 in_im = _mm_unpackhi_ps(in_mix_lo, in_mix_hi);
        const __m128 in_re = _mm_unpacklo_ps(in_mix_lo, in_mix_hi);
        const __m128 tap_im = _mm_unpackhi_ps(tap_mix_lo, tap_mix_hi);
        const __m128 tap_re = _mm_unpacklo_ps(tap_mix_lo, tap_mix_hi);

        /* 4-wide dot products; mask 0xf1 writes lane 0, mask 0xf2 writes lane 1. */
        const __m128 rr = _mm_dp_ps(in_re, tap_re, 0xf1);
        const __m128 ii = _mm_dp_ps(in_im, tap_im, 0xf1);
        const __m128 ri = _mm_dp_ps(in_re, tap_im, 0xf2);
        const __m128 ir = _mm_dp_ps(in_im, tap_re, 0xf2);

        sum_rr = _mm_add_ps(rr, sum_rr);
        sum_ii = _mm_add_ps(ii, sum_ii);
        sum_ri = _mm_add_ps(ri, sum_ri);
        sum_ir = _mm_add_ps(ir, sum_ir);

        in_ptr += 8;
        tap_ptr += 8;
    }

    /* real = sum(re*re) - sum(im*im): negate the im*im sum, then add. */
    sum_ii = _mm_xor_ps(sum_ii, bit128_p(&sign_flip)->float_vec);

    sum_ri = _mm_add_ps(sum_ri, sum_ir);
    sum_rr = _mm_add_ps(sum_rr, sum_ii);

    /* Lane 0 now holds the real part, lane 1 the imaginary part. */
    sum_ri = _mm_add_ps(sum_ri, sum_rr);

    _mm_storel_pi(out_ptr, sum_ri);

    /* Points left over after the last complete group of four. */
    for(k = quad_count*4; k < num_points; k++) {
        *result += input[k] * taps[k];
    }
}
|
||||
|
||||
#endif /*LV_HAVE_SSE4_1*/
|
||||
|
||||
#endif /*INCLUDED_volk_gnsssdr_32fc_x2_dot_prod_32fc_a_H*/
|
@ -1,170 +0,0 @@
|
||||
#ifndef INCLUDED_volk_gnsssdr_32fc_x2_multiply_32fc_u_H
|
||||
#define INCLUDED_volk_gnsssdr_32fc_x2_multiply_32fc_u_H
|
||||
|
||||
#include <inttypes.h>
|
||||
#include <stdio.h>
|
||||
#include <volk_gnsssdr/volk_gnsssdr_complex.h>
|
||||
#include <float.h>
|
||||
|
||||
#ifdef LV_HAVE_SSE3
|
||||
#include <pmmintrin.h>
|
||||
/*!
|
||||
\brief Multiplies the two input complex vectors and stores their results in the third vector
|
||||
\param cVector The vector where the results will be stored
|
||||
\param aVector One of the vectors to be multiplied
|
||||
\param bVector One of the vectors to be multiplied
|
||||
\param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
|
||||
*/
|
||||
static inline void volk_gnsssdr_32fc_x2_multiply_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
|
||||
unsigned int number = 0;
|
||||
const unsigned int halfPoints = num_points / 2;
|
||||
|
||||
__m128 x, y, yl, yh, z, tmp1, tmp2;
|
||||
lv_32fc_t* c = cVector;
|
||||
const lv_32fc_t* a = aVector;
|
||||
const lv_32fc_t* b = bVector;
|
||||
|
||||
for(;number < halfPoints; number++){
|
||||
|
||||
x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
|
||||
y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
|
||||
|
||||
yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
|
||||
yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
|
||||
|
||||
tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
||||
|
||||
x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
|
||||
|
||||
tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
|
||||
|
||||
z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
|
||||
|
||||
_mm_storeu_ps((float*)c,z); // Store the results back into the C container
|
||||
|
||||
a += 2;
|
||||
b += 2;
|
||||
c += 2;
|
||||
}
|
||||
|
||||
if((num_points % 2) != 0) {
|
||||
*c = (*a) * (*b);
|
||||
}
|
||||
}
|
||||
#endif /* LV_HAVE_SSE3 */
|
||||
|
||||
#ifdef LV_HAVE_GENERIC
|
||||
/*!
|
||||
\brief Multiplies the two input complex vectors and stores their results in the third vector
|
||||
\param cVector The vector where the results will be stored
|
||||
\param aVector One of the vectors to be multiplied
|
||||
\param bVector One of the vectors to be multiplied
|
||||
\param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
|
||||
*/
|
||||
static inline void volk_gnsssdr_32fc_x2_multiply_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
|
||||
lv_32fc_t* cPtr = cVector;
|
||||
const lv_32fc_t* aPtr = aVector;
|
||||
const lv_32fc_t* bPtr= bVector;
|
||||
unsigned int number = 0;
|
||||
|
||||
for(number = 0; number < num_points; number++){
|
||||
*cPtr++ = (*aPtr++) * (*bPtr++);
|
||||
}
|
||||
}
|
||||
#endif /* LV_HAVE_GENERIC */
|
||||
|
||||
|
||||
#endif /* INCLUDED_volk_gnsssdr_32fc_x2_multiply_32fc_u_H */
|
||||
#ifndef INCLUDED_volk_gnsssdr_32fc_x2_multiply_32fc_a_H
|
||||
#define INCLUDED_volk_gnsssdr_32fc_x2_multiply_32fc_a_H
|
||||
|
||||
#include <inttypes.h>
|
||||
#include <stdio.h>
|
||||
#include <volk_gnsssdr/volk_gnsssdr_complex.h>
|
||||
#include <float.h>
|
||||
|
||||
#ifdef LV_HAVE_SSE3
|
||||
#include <pmmintrin.h>
|
||||
/*!
|
||||
\brief Multiplies the two input complex vectors and stores their results in the third vector
|
||||
\param cVector The vector where the results will be stored
|
||||
\param aVector One of the vectors to be multiplied
|
||||
\param bVector One of the vectors to be multiplied
|
||||
\param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
|
||||
*/
|
||||
static inline void volk_gnsssdr_32fc_x2_multiply_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
|
||||
unsigned int number = 0;
|
||||
const unsigned int halfPoints = num_points / 2;
|
||||
|
||||
__m128 x, y, yl, yh, z, tmp1, tmp2;
|
||||
lv_32fc_t* c = cVector;
|
||||
const lv_32fc_t* a = aVector;
|
||||
const lv_32fc_t* b = bVector;
|
||||
for(;number < halfPoints; number++){
|
||||
|
||||
x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
|
||||
y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
|
||||
|
||||
yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
|
||||
yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
|
||||
|
||||
tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
||||
|
||||
x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
|
||||
|
||||
tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
|
||||
|
||||
z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
|
||||
|
||||
_mm_store_ps((float*)c,z); // Store the results back into the C container
|
||||
|
||||
a += 2;
|
||||
b += 2;
|
||||
c += 2;
|
||||
}
|
||||
|
||||
if((num_points % 2) != 0) {
|
||||
*c = (*a) * (*b);
|
||||
}
|
||||
}
|
||||
#endif /* LV_HAVE_SSE3 */
|
||||
|
||||
#ifdef LV_HAVE_GENERIC
|
||||
/*!
|
||||
\brief Multiplies the two input complex vectors and stores their results in the third vector
|
||||
\param cVector The vector where the results will be stored
|
||||
\param aVector One of the vectors to be multiplied
|
||||
\param bVector One of the vectors to be multiplied
|
||||
\param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
|
||||
*/
|
||||
static inline void volk_gnsssdr_32fc_x2_multiply_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
|
||||
lv_32fc_t* cPtr = cVector;
|
||||
const lv_32fc_t* aPtr = aVector;
|
||||
const lv_32fc_t* bPtr= bVector;
|
||||
unsigned int number = 0;
|
||||
|
||||
for(number = 0; number < num_points; number++){
|
||||
*cPtr++ = (*aPtr++) * (*bPtr++);
|
||||
}
|
||||
}
|
||||
#endif /* LV_HAVE_GENERIC */
|
||||
|
||||
#ifdef LV_HAVE_ORC
|
||||
/*!
  \brief Multiplies the two input complex vectors and stores their results in the third vector (ORC back end)
  \param cVector The vector where the results will be stored
  \param aVector One of the vectors to be multiplied
  \param bVector One of the vectors to be multiplied
  \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector

  The implementation is only declared here; its definition lives outside this header.
*/
extern void volk_gnsssdr_32fc_x2_multiply_32fc_a_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points);
|
||||
/* Thin wrapper: forwards all four arguments unchanged to the ORC implementation
   declared above. NOTE(review): this wrapper is named "_u_orc" although it sits in
   the "_a_H" header and calls "_a_orc_impl" -- confirm which name the dispatcher expects. */
static inline void volk_gnsssdr_32fc_x2_multiply_32fc_u_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
|
||||
volk_gnsssdr_32fc_x2_multiply_32fc_a_orc_impl(cVector, aVector, bVector, num_points);
|
||||
}
|
||||
#endif /* LV_HAVE_ORC */
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
#endif /* INCLUDED_volk_gnsssdr_32fc_x2_multiply_32fc_a_H */
|
@ -1,3 +1,49 @@
|
||||
/*!
|
||||
* \file volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3.h
|
||||
* \brief Volk protokernel: performs the carrier wipe-off mixing and the Early, Prompt and Late correlation with 64 bits vectors
|
||||
* \authors <ul>
|
||||
* <li>Javier Arribas, 2011. jarribas(at)cttc.es
|
||||
* <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
|
||||
* </ul>
|
||||
*
|
||||
* Volk protokernel that performs the carrier wipe-off mixing and the
|
||||
* Early, Prompt and Late correlation with 64 bits vectors (32 bits the
|
||||
* real part and 32 bits the imaginary part):
|
||||
* - The carrier wipe-off is done by multiplying the input signal by the
|
||||
* carrier (multiplication of 64 bits vectors). It returns the input
|
||||
* signal in base band (BB)
|
||||
* - Early values are calculated by multiplying the input signal in BB by the
|
||||
* early code (multiplication of 64 bits vectors), accumulating the results
|
||||
* - Prompt values are calculated by multiplying the input signal in BB by the
|
||||
* prompt code (multiplication of 64 bits vectors), accumulating the results
|
||||
* - Late values are calculated by multiplying the input signal in BB by the
|
||||
* late code (multiplication of 64 bits vectors), accumulating the results
|
||||
*
|
||||
* -------------------------------------------------------------------------
|
||||
*
|
||||
* Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
|
||||
*
|
||||
* GNSS-SDR is a software defined Global Navigation
|
||||
* Satellite Systems receiver
|
||||
*
|
||||
* This file is part of GNSS-SDR.
|
||||
*
|
||||
* GNSS-SDR is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* GNSS-SDR is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* -------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
#ifndef INCLUDED_gnsssdr_volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_u_H
|
||||
#define INCLUDED_gnsssdr_volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_u_H
|
||||
|
||||
|
@ -24,17 +24,6 @@
|
||||
#include <volk_gnsssdr/volk_gnsssdr.h>
|
||||
#include <boost/test/unit_test.hpp>
|
||||
|
||||
//VOLK PROTOKERNELS OBTAINED FROM THE GNURADIO BASE
|
||||
VOLK_RUN_TESTS(volk_gnsssdr_32fc_x2_multiply_32fc, 1e-4, 0, 20462, 1);
|
||||
VOLK_RUN_TESTS(volk_gnsssdr_32fc_x2_dot_prod_32fc, 1e-4, 0, 204603, 1);
|
||||
VOLK_RUN_TESTS(volk_gnsssdr_32fc_s32fc_multiply_32fc, 1e-4, 0, 20462, 1);
|
||||
VOLK_RUN_TESTS(volk_gnsssdr_32fc_conjugate_32fc, 1e-4, 0, 20462, 1);
|
||||
VOLK_RUN_TESTS(volk_gnsssdr_32f_x2_add_32f, 1e-4, 0, 20462, 1);
|
||||
VOLK_RUN_TESTS(volk_gnsssdr_32f_index_max_16u, 3, 0, 20462, 1);
|
||||
VOLK_RUN_TESTS(volk_gnsssdr_32f_accumulator_s32f, 1e-4, 0, 20462, 1);
|
||||
VOLK_RUN_TESTS(volk_gnsssdr_32fc_magnitude_squared_32f, 1e-4, 0, 20462, 1);
|
||||
VOLK_RUN_TESTS(volk_gnsssdr_32f_s32f_convert_16i, 3, 0, 20462, 1);
|
||||
|
||||
//GNSS-SDR PROTO-KERNELS
|
||||
VOLK_RUN_TESTS(volk_gnsssdr_8ic_x2_multiply_8ic, 1e-4, 0, 20462, 1);
|
||||
VOLK_RUN_TESTS(volk_gnsssdr_8u_x2_multiply_8u, 1e-4, 0, 20462, 1);
|
||||
@ -52,7 +41,6 @@ VOLK_RUN_TESTS(volk_gnsssdr_64f_accumulator_64f, 3, 0, 20462, 1);
|
||||
VOLK_RUN_TESTS(volk_gnsssdr_32fc_convert_16ic, 3, 0, 20462, 1);
|
||||
VOLK_RUN_TESTS(volk_gnsssdr_32fc_s32f_convert_8ic, 3, 0, 20462, 1);
|
||||
VOLK_RUN_TESTS(volk_gnsssdr_32fc_convert_8ic, 3, 0, 20462, 1);
|
||||
VOLK_RUN_TESTS(volk_gnsssdr_16i_s32f_convert_32f, 3, 0, 20462, 1);
|
||||
|
||||
VOLK_RUN_TESTS(volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3, 1e-4, 0, 20462, 1);
|
||||
VOLK_RUN_TESTS(volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3, 1e-4, 0, 20462, 1);
|
||||
|
@ -244,7 +244,6 @@ void galileo_volk_e1_dll_pll_veml_tracking_cc::update_local_code()
|
||||
{
|
||||
double tcode_half_chips;
|
||||
float rem_code_phase_half_chips;
|
||||
int associated_chip_index;
|
||||
int code_length_half_chips = static_cast<int>(Galileo_E1_B_CODE_LENGTH_CHIPS) * 2;
|
||||
double code_phase_step_chips;
|
||||
double code_phase_step_half_chips;
|
||||
|
Loading…
x
Reference in New Issue
Block a user