1
0
mirror of https://github.com/gnss-sdr/gnss-sdr synced 2025-01-29 02:14:51 +00:00

Merge branch 'next' of https://github.com/gnss-sdr/gnss-sdr into next

This commit is contained in:
Carles Fernandez 2017-08-21 12:06:03 +02:00
commit 265caeda33
10 changed files with 1001 additions and 5 deletions

View File

@ -164,6 +164,58 @@ static inline void volk_gnsssdr_32fc_convert_16ic_u_sse(lv_16sc_t* outputVector,
}
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
static inline void volk_gnsssdr_32fc_convert_16ic_u_avx2(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
{
const unsigned int avx2_iters = num_points / 8;
float* inputVectorPtr = (float*)inputVector;
int16_t* outputVectorPtr = (int16_t*)outputVector;
float aux;
unsigned int i;
const float min_val = (float)SHRT_MIN; ///todo Something off here, compiler does not perform right cast
const float max_val = (float)SHRT_MAX;
__m256 inputVal1, inputVal2;
__m256i intInputVal1, intInputVal2;
__m256 ret1, ret2;
const __m256 vmin_val = _mm256_set1_ps(min_val);
const __m256 vmax_val = _mm256_set1_ps(max_val);
for(i = 0; i < avx2_iters; i++)
{
inputVal1 = _mm256_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 8;
inputVal2 = _mm256_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 8;
__VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 16);
// Clip
ret1 = _mm256_max_ps(_mm256_min_ps(inputVal1, vmax_val), vmin_val);
ret2 = _mm256_max_ps(_mm256_min_ps(inputVal2, vmax_val), vmin_val);
intInputVal1 = _mm256_cvtps_epi32(ret1);
intInputVal2 = _mm256_cvtps_epi32(ret2);
intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
_mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal1);
outputVectorPtr += 16;
}
for(i = avx2_iters * 16; i < num_points * 2; i++)
{
aux = *inputVectorPtr++;
if(aux > max_val)
aux = max_val;
else if(aux < min_val)
aux = min_val;
*outputVectorPtr++ = (int16_t)rintf(aux);
}
}
#endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>
@ -269,6 +321,59 @@ static inline void volk_gnsssdr_32fc_convert_16ic_a_sse(lv_16sc_t* outputVector,
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
static inline void volk_gnsssdr_32fc_convert_16ic_a_avx2(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
{
const unsigned int avx2_iters = num_points / 8;
float* inputVectorPtr = (float*)inputVector;
int16_t* outputVectorPtr = (int16_t*)outputVector;
float aux;
unsigned int i;
const float min_val = (float)SHRT_MIN; ///todo Something off here, compiler does not perform right cast
const float max_val = (float)SHRT_MAX;
__m256 inputVal1, inputVal2;
__m256i intInputVal1, intInputVal2;
__m256 ret1, ret2;
const __m256 vmin_val = _mm256_set1_ps(min_val);
const __m256 vmax_val = _mm256_set1_ps(max_val);
for(i = 0; i < avx2_iters; i++)
{
inputVal1 = _mm256_load_ps((float*)inputVectorPtr); inputVectorPtr += 8;
inputVal2 = _mm256_load_ps((float*)inputVectorPtr); inputVectorPtr += 8;
__VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 16);
// Clip
ret1 = _mm256_max_ps(_mm256_min_ps(inputVal1, vmax_val), vmin_val);
ret2 = _mm256_max_ps(_mm256_min_ps(inputVal2, vmax_val), vmin_val);
intInputVal1 = _mm256_cvtps_epi32(ret1);
intInputVal2 = _mm256_cvtps_epi32(ret2);
intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
_mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1);
outputVectorPtr += 16;
}
for(i = avx2_iters * 16; i < num_points * 2; i++)
{
aux = *inputVectorPtr++;
if(aux > max_val)
aux = max_val;
else if(aux < min_val)
aux = min_val;
*outputVectorPtr++ = (int16_t)rintf(aux);
}
}
#endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_NEON
#include <arm_neon.h>

View File

@ -85,6 +85,146 @@ static inline void volk_gnsssdr_32fc_convert_8ic_generic(lv_8sc_t* outputVector,
#endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
static inline void volk_gnsssdr_32fc_convert_8ic_u_avx2(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
{
const unsigned int avx2_iters = num_points / 16;
float* inputVectorPtr = (float*)inputVector;
int8_t* outputVectorPtr = (int8_t*)outputVector;
const float min_val = (float)SCHAR_MIN;
const float max_val = (float)SCHAR_MAX;
float aux;
unsigned int i;
__m256 inputVal1, inputVal2, inputVal3, inputVal4;
__m256i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
__m256i int8InputVal;
__m256 ret1, ret2, ret3, ret4;
const __m256 vmin_val = _mm256_set1_ps(min_val);
const __m256 vmax_val = _mm256_set1_ps(max_val);
for(i = 0; i < avx2_iters; i++)
{
inputVal1 = _mm256_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 8;
inputVal2 = _mm256_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 8;
inputVal3 = _mm256_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 8;
inputVal4 = _mm256_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 8;
__VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 32);
inputVal1 = _mm256_mul_ps(inputVal1, vmax_val);
inputVal2 = _mm256_mul_ps(inputVal2, vmax_val);
inputVal3 = _mm256_mul_ps(inputVal3, vmax_val);
inputVal4 = _mm256_mul_ps(inputVal4, vmax_val);
// Clip
ret1 = _mm256_max_ps(_mm256_min_ps(inputVal1, vmax_val), vmin_val);
ret2 = _mm256_max_ps(_mm256_min_ps(inputVal2, vmax_val), vmin_val);
ret3 = _mm256_max_ps(_mm256_min_ps(inputVal3, vmax_val), vmin_val);
ret4 = _mm256_max_ps(_mm256_min_ps(inputVal4, vmax_val), vmin_val);
intInputVal1 = _mm256_cvtps_epi32(ret1);
intInputVal2 = _mm256_cvtps_epi32(ret2);
intInputVal3 = _mm256_cvtps_epi32(ret3);
intInputVal4 = _mm256_cvtps_epi32(ret4);
intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
intInputVal2 = _mm256_packs_epi32(intInputVal3, intInputVal4);
intInputVal2 = _mm256_permute4x64_epi64(intInputVal2, 0b11011000);
int8InputVal = _mm256_packs_epi16(intInputVal1, intInputVal2);
int8InputVal = _mm256_permute4x64_epi64(int8InputVal, 0b11011000);
_mm256_storeu_si256((__m256i*)outputVectorPtr, int8InputVal);
outputVectorPtr += 32;
}
for(i = avx2_iters * 32; i < num_points * 2; i++)
{
aux = *inputVectorPtr++ * max_val;
if(aux > max_val)
aux = max_val;
else if(aux < min_val)
aux = min_val;
*outputVectorPtr++ = (int8_t)rintf(aux);
}
}
#endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
static inline void volk_gnsssdr_32fc_convert_8ic_a_avx2(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
{
const unsigned int avx2_iters = num_points / 16;
float* inputVectorPtr = (float*)inputVector;
int8_t* outputVectorPtr = (int8_t*)outputVector;
const float min_val = (float)SCHAR_MIN;
const float max_val = (float)SCHAR_MAX;
float aux;
unsigned int i;
__m256 inputVal1, inputVal2, inputVal3, inputVal4;
__m256i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
__m256i int8InputVal;
__m256 ret1, ret2, ret3, ret4;
const __m256 vmin_val = _mm256_set1_ps(min_val);
const __m256 vmax_val = _mm256_set1_ps(max_val);
for(i = 0; i < avx2_iters; i++)
{
inputVal1 = _mm256_load_ps((float*)inputVectorPtr); inputVectorPtr += 8;
inputVal2 = _mm256_load_ps((float*)inputVectorPtr); inputVectorPtr += 8;
inputVal3 = _mm256_load_ps((float*)inputVectorPtr); inputVectorPtr += 8;
inputVal4 = _mm256_load_ps((float*)inputVectorPtr); inputVectorPtr += 8;
__VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 32);
inputVal1 = _mm256_mul_ps(inputVal1, vmax_val);
inputVal2 = _mm256_mul_ps(inputVal2, vmax_val);
inputVal3 = _mm256_mul_ps(inputVal3, vmax_val);
inputVal4 = _mm256_mul_ps(inputVal4, vmax_val);
// Clip
ret1 = _mm256_max_ps(_mm256_min_ps(inputVal1, vmax_val), vmin_val);
ret2 = _mm256_max_ps(_mm256_min_ps(inputVal2, vmax_val), vmin_val);
ret3 = _mm256_max_ps(_mm256_min_ps(inputVal3, vmax_val), vmin_val);
ret4 = _mm256_max_ps(_mm256_min_ps(inputVal4, vmax_val), vmin_val);
intInputVal1 = _mm256_cvtps_epi32(ret1);
intInputVal2 = _mm256_cvtps_epi32(ret2);
intInputVal3 = _mm256_cvtps_epi32(ret3);
intInputVal4 = _mm256_cvtps_epi32(ret4);
intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
intInputVal2 = _mm256_packs_epi32(intInputVal3, intInputVal4);
intInputVal2 = _mm256_permute4x64_epi64(intInputVal2, 0b11011000);
int8InputVal = _mm256_packs_epi16(intInputVal1, intInputVal2);
int8InputVal = _mm256_permute4x64_epi64(int8InputVal, 0b11011000);
_mm256_store_si256((__m256i*)outputVectorPtr, int8InputVal);
outputVectorPtr += 32;
}
for(i = avx2_iters * 32; i < num_points * 2; i++)
{
aux = *inputVectorPtr++ * max_val;
if(aux > max_val)
aux = max_val;
else if(aux < min_val)
aux = min_val;
*outputVectorPtr++ = (int8_t)rintf(aux);
}
}
#endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>

View File

@ -71,7 +71,7 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_generic(lv_32fc_t* r
}
#endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_SSE3
static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_a_sse3(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points)
@ -248,6 +248,64 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_u_avx(lv_32fc_t* res
#endif
#ifdef LV_HAVE_AVX2
static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_a_avx2(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points)
{
float code_phase_step_chips = -0.6;
int code_length_chips = 1023;
int num_out_vectors = 3;
float rem_code_phase_chips = -0.234;
unsigned int n;
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
for(n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
}
volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx2(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
memcpy((lv_32fc_t*)result, (lv_32fc_t*)result_aux[0], sizeof(lv_32fc_t) * num_points);
for(n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
volk_gnsssdr_free(result_aux);
}
#endif
#ifdef LV_HAVE_AVX2
static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_u_avx2(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points)
{
float code_phase_step_chips = -0.6;
int code_length_chips = 1023;
int num_out_vectors = 3;
float rem_code_phase_chips = -0.234;
unsigned int n;
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
for(n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
}
volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx2(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
memcpy((lv_32fc_t*)result, (lv_32fc_t*)result_aux[0], sizeof(lv_32fc_t) * num_points);
for(n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
volk_gnsssdr_free(result_aux);
}
#endif
#ifdef LV_HAVE_NEON
static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_neon(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points)
{

View File

@ -160,7 +160,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_sse3(lv_32fc_t** res
}
}
#endif
#endif
#ifdef LV_HAVE_SSE3
@ -295,7 +295,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_sse4_1(lv_32fc_t** r
}
}
#endif
#endif
#ifdef LV_HAVE_SSE4_1
@ -518,6 +518,162 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx(lv_32fc_t** resu
#endif
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx2(lv_32fc_t** result, const lv_32fc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, float* shifts_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_points)
{
lv_32fc_t** _result = result;
const unsigned int avx_iters = num_points / 8;
int current_correlator_tap;
unsigned int n;
unsigned int k;
const __m256 eights = _mm256_set1_ps(8.0f);
const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips);
const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips);
__VOLK_ATTR_ALIGNED(32) int local_code_chip_index[8];
int local_code_chip_index_;
const __m256 zeros = _mm256_setzero_ps();
const __m256 code_length_chips_reg_f = _mm256_set1_ps((float)code_length_chips);
const __m256 n0 = _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f);
__m256i local_code_chip_index_reg, i;
__m256 aux, aux2, aux3, shifts_chips_reg, c, cTrunc, base, negatives, indexn;
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
{
shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]);
aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
indexn = n0;
for(n = 0; n < avx_iters; n++)
{
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0);
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3);
aux = _mm256_fmadd_ps(code_phase_step_chips_reg, indexn, aux2);
//aux = _mm256_add_ps(aux, aux2);
// floor
aux = _mm256_floor_ps(aux);
// fmod
c = _mm256_div_ps(aux, code_length_chips_reg_f);
//_mm_fmsub_ps(c, code_length_chips_reg_f, aux)
i = _mm256_cvttps_epi32(c);
cTrunc = _mm256_cvtepi32_ps(i);
base = _mm256_fnmadd_ps(cTrunc, code_length_chips_reg_f, aux);
local_code_chip_index_reg = _mm256_cvttps_epi32(base);
// no negatives
c = _mm256_cvtepi32_ps(local_code_chip_index_reg);
negatives = _mm256_cmp_ps(c, zeros, 0x01 );
aux3 = _mm256_and_ps(code_length_chips_reg_f, negatives);
aux = _mm256_add_ps(c, aux3);
local_code_chip_index_reg = _mm256_cvttps_epi32(aux);
_mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg);
for(k = 0; k < 8; ++k)
{
_result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]];
}
indexn = _mm256_add_ps(indexn, eights);
}
}
_mm256_zeroupper();
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
{
for(n = avx_iters * 8; n < num_points; n++)
{
// resample code for current tap
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
//Take into account that in multitap correlators, the shifts can be negative!
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1) ;
local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
_result[current_correlator_tap][n] = local_code[local_code_chip_index_];
}
}
}
#endif
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx2(lv_32fc_t** result, const lv_32fc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, float* shifts_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_points)
{
lv_32fc_t** _result = result;
const unsigned int avx_iters = num_points / 8;
int current_correlator_tap;
unsigned int n;
unsigned int k;
const __m256 eights = _mm256_set1_ps(8.0f);
const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips);
const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips);
__VOLK_ATTR_ALIGNED(32) int local_code_chip_index[8];
int local_code_chip_index_;
const __m256 zeros = _mm256_setzero_ps();
const __m256 code_length_chips_reg_f = _mm256_set1_ps((float)code_length_chips);
const __m256 n0 = _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f);
__m256i local_code_chip_index_reg, i;
__m256 aux, aux2, aux3, shifts_chips_reg, c, cTrunc, base, negatives, indexn;
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
{
shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]);
aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
indexn = n0;
for(n = 0; n < avx_iters; n++)
{
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0);
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3);
aux = _mm256_fmadd_ps(code_phase_step_chips_reg, indexn, aux2);
//aux = _mm256_mul_ps(code_phase_step_chips_reg, indexn);
//aux = _mm256_add_ps(aux, aux2);
// floor
aux = _mm256_floor_ps(aux);
// fmod
c = _mm256_div_ps(aux, code_length_chips_reg_f);
i = _mm256_cvttps_epi32(c);
cTrunc = _mm256_cvtepi32_ps(i);
base = _mm256_fnmadd_ps(cTrunc, code_length_chips_reg_f, aux);
local_code_chip_index_reg = _mm256_cvttps_epi32(base);
// no negatives
c = _mm256_cvtepi32_ps(local_code_chip_index_reg);
negatives = _mm256_cmp_ps(c, zeros, 0x01 );
aux3 = _mm256_and_ps(code_length_chips_reg_f, negatives);
aux = _mm256_add_ps(c, aux3);
local_code_chip_index_reg = _mm256_cvttps_epi32(aux);
_mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg);
for(k = 0; k < 8; ++k)
{
_result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]];
}
indexn = _mm256_add_ps(indexn, eights);
}
}
_mm256_zeroupper();
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
{
for(n = avx_iters * 8; n < num_points; n++)
{
// resample code for current tap
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
//Take into account that in multitap correlators, the shifts can be negative!
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1) ;
local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
_result[current_correlator_tap][n] = local_code[local_code_chip_index_];
}
}
}
#endif
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
@ -604,4 +760,3 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_neon(lv_32fc_t** resul
#endif /*INCLUDED_volk_gnsssdr_32fc_xn_resampler_32fc_xn_H*/

View File

@ -152,6 +152,83 @@ static inline void volk_gnsssdr_8i_accumulator_s8i_a_sse3(char* result, const ch
#endif /* LV_HAVE_SSE3 */
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
static inline void volk_gnsssdr_8i_accumulator_s8i_a_avx2(char* result, const char* inputBuffer, unsigned int num_points)
{
char returnValue = 0;
const unsigned int sse_iters = num_points / 32;
unsigned int number;
unsigned int i;
const char* aPtr = inputBuffer;
__VOLK_ATTR_ALIGNED(32) char tempBuffer[32];
__m256i accumulator = _mm256_setzero_si256();
__m256i aVal = _mm256_setzero_si256();
for(number = 0; number < sse_iters; number++)
{
aVal = _mm256_load_si256((__m256i*)aPtr);
accumulator = _mm256_add_epi8(accumulator, aVal);
aPtr += 32;
}
_mm256_store_si256((__m256i*)tempBuffer,accumulator);
for(i = 0; i < 32; ++i)
{
returnValue += tempBuffer[i];
}
for(i = 0; i < (num_points % 32); ++i)
{
returnValue += (*aPtr++);
}
*result = returnValue;
}
#endif /* LV_HAVE_SSE3 */
#ifdef LV_HAVE_AVX2
#include <pmmintrin.h>
static inline void volk_gnsssdr_8i_accumulator_s8i_u_avx2(char* result, const char* inputBuffer, unsigned int num_points)
{
char returnValue = 0;
const unsigned int sse_iters = num_points / 32;
unsigned int number;
unsigned int i;
const char* aPtr = inputBuffer;
__VOLK_ATTR_ALIGNED(32) char tempBuffer[32];
__m256i accumulator = _mm256_setzero_si256();
__m256i aVal = _mm256_setzero_si256();
for(number = 0; number < sse_iters; number++)
{
aVal = _mm256_lddqu_si256((__m256i*)aPtr);
accumulator = _mm256_add_epi8(accumulator, aVal);
aPtr += 32;
}
_mm256_storeu_si256((__m256i*)tempBuffer, accumulator);
for(i = 0; i < 32; ++i)
{
returnValue += tempBuffer[i];
}
for(i = 0; i < (num_points % 32); ++i)
{
returnValue += (*aPtr++);
}
*result = returnValue;
}
#endif /* LV_HAVE_SSE3 */
#ifdef LV_HAVE_ORC
extern void volk_gnsssdr_8i_accumulator_s8i_a_orc_impl(short* result, const char* inputBuffer, unsigned int num_points);
@ -169,4 +246,3 @@ static inline void volk_gnsssdr_8i_accumulator_s8i_u_orc(char* result, const cha
#endif /* LV_HAVE_ORC */
#endif /* INCLUDED_volk_gnsssdr_8i_accumulator_s8i_H */

View File

@ -58,6 +58,71 @@
#include <volk_gnsssdr/volk_gnsssdr_common.h>
#ifdef LV_HAVE_AVX2
#include<immintrin.h>
static inline void volk_gnsssdr_8i_index_max_16u_u_avx2(unsigned int* target, const char* src0, unsigned int num_points)
{
if(num_points > 0)
{
const unsigned int avx2_iters = num_points / 32;
unsigned int number;
unsigned int i;
char* basePtr = (char*)src0;
char* inputPtr = (char*)src0;
char max = src0[0];
unsigned int index = 0;
unsigned int mask;
__VOLK_ATTR_ALIGNED(32) char currentValuesBuffer[32];
__m256i maxValues, compareResults, currentValues;
maxValues = _mm256_set1_epi8(max);
for(number = 0; number < avx2_iters; number++)
{
currentValues = _mm256_loadu_si256((__m256i*)inputPtr);
compareResults = _mm256_cmpgt_epi8(maxValues, currentValues);
mask = _mm256_movemask_epi8(compareResults);
if (mask != 0xFFFFFFFF)
{
_mm256_storeu_si256((__m256i*)&currentValuesBuffer, currentValues);
mask = ~mask;
i = 0;
while (mask > 0)
{
if ((mask & 1) == 1)
{
if(currentValuesBuffer[i] > max)
{
index = inputPtr - basePtr + i;
max = currentValuesBuffer[i];
}
}
i++;
mask >>= 1;
}
maxValues = _mm256_set1_epi8(max);
}
inputPtr += 32;
}
for(i = 0; i<(num_points % 32); ++i)
{
if(src0[i] > max)
{
index = i;
max = src0[i];
}
}
target[0] = index;
}
}
#endif /*LV_HAVE_AVX2*/
#ifdef LV_HAVE_AVX
#include <immintrin.h>
@ -271,6 +336,70 @@ static inline void volk_gnsssdr_8i_index_max_16u_generic(unsigned int* target, c
#endif /*LV_HAVE_GENERIC*/
#ifdef LV_HAVE_AVX2
#include<immintrin.h>
static inline void volk_gnsssdr_8i_index_max_16u_a_avx2(unsigned int* target, const char* src0, unsigned int num_points)
{
if(num_points > 0)
{
const unsigned int avx2_iters = num_points / 32;
unsigned int number;
unsigned int i;
char* basePtr = (char*)src0;
char* inputPtr = (char*)src0;
char max = src0[0];
unsigned int index = 0;
unsigned int mask;
__VOLK_ATTR_ALIGNED(32) char currentValuesBuffer[32];
__m256i maxValues, compareResults, currentValues;
maxValues = _mm256_set1_epi8(max);
for(number = 0; number < avx2_iters; number++)
{
currentValues = _mm256_load_si256((__m256i*)inputPtr);
compareResults = _mm256_cmpgt_epi8(maxValues, currentValues);
mask = _mm256_movemask_epi8(compareResults);
if (mask != 0xFFFFFFFF)
{
_mm256_store_si256((__m256i*)&currentValuesBuffer, currentValues);
mask = ~mask;
i = 0;
while (mask > 0)
{
if ((mask & 1) == 1)
{
if(currentValuesBuffer[i] > max)
{
index = inputPtr - basePtr + i;
max = currentValuesBuffer[i];
}
}
i++;
mask >>= 1;
}
maxValues = _mm256_set1_epi8(max);
}
inputPtr += 32;
}
for(i = 0; i<(num_points % 32); ++i)
{
if(src0[i] > max)
{
index = i;
max = src0[i];
}
}
target[0] = index;
}
}
#endif /*LV_HAVE_AVX2*/
#ifdef LV_HAVE_AVX
#include <immintrin.h>

View File

@ -58,6 +58,55 @@
#include <volk_gnsssdr/volk_gnsssdr_common.h>
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
static inline void volk_gnsssdr_8i_max_s8i_u_avx2(char* target, const char* src0, unsigned int num_points)
{
if(num_points > 0)
{
const unsigned int avx_iters = num_points / 32;
unsigned int number;
unsigned int i;
char* inputPtr = (char*)src0;
char max = src0[0];
__VOLK_ATTR_ALIGNED(32) char maxValuesBuffer[32];
__m256i maxValues, compareResults, currentValues;
maxValues = _mm256_set1_epi8(max);
for(number = 0; number < avx_iters; number++)
{
currentValues = _mm256_loadu_si256((__m256i*)inputPtr);
compareResults = _mm256_max_epi8(maxValues, currentValues);
maxValues = compareResults;
inputPtr += 32;
}
_mm256_storeu_si256((__m256i*)maxValuesBuffer, maxValues);
for(i = 0; i < 32; ++i)
{
if(maxValuesBuffer[i] > max)
{
max = maxValuesBuffer[i];
}
}
for(i = avx_iters * 32; i < num_points; ++i)
{
if(src0[i] > max)
{
max = src0[i];
}
}
target[0] = max;
}
}
#endif /*LV_HAVE_SSE4_1*/
#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>
@ -238,6 +287,55 @@ static inline void volk_gnsssdr_8i_max_s8i_a_sse4_1(char* target, const char* sr
#endif /*LV_HAVE_SSE4_1*/
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
static inline void volk_gnsssdr_8i_max_s8i_a_avx2(char* target, const char* src0, unsigned int num_points)
{
if(num_points > 0)
{
const unsigned int avx_iters = num_points / 32;
unsigned int number;
unsigned int i;
char* inputPtr = (char*)src0;
char max = src0[0];
__VOLK_ATTR_ALIGNED(32) char maxValuesBuffer[32];
__m256i maxValues, compareResults, currentValues;
maxValues = _mm256_set1_epi8(max);
for(number = 0; number < avx_iters; number++)
{
currentValues = _mm256_load_si256((__m256i*)inputPtr);
compareResults = _mm256_max_epi8(maxValues, currentValues);
maxValues = compareResults; //_mm256_blendv_epi8(currentValues, maxValues, compareResults);
inputPtr += 32;
}
_mm256_store_si256((__m256i*)maxValuesBuffer, maxValues);
for(i = 0; i < 32; ++i)
{
if(maxValuesBuffer[i] > max)
{
max = maxValuesBuffer[i];
}
}
for(i = avx_iters * 32; i < num_points; ++i)
{
if(src0[i] > max)
{
max = src0[i];
}
}
target[0] = max;
}
}
#endif /*LV_HAVE_SSE4_1*/
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>

View File

@ -94,6 +94,42 @@ static inline void volk_gnsssdr_8i_x2_add_8i_u_sse2(char* cVector, const char* a
#endif /* LV_HAVE_SSE2 */
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
static inline void volk_gnsssdr_8i_x2_add_8i_u_avx2(char* cVector, const char* aVector, const char* bVector, unsigned int num_points)
{
const unsigned int avx_iters = num_points / 32;
unsigned int number;
unsigned int i;
char* cPtr = cVector;
const char* aPtr = aVector;
const char* bPtr = bVector;
__m256i aVal, bVal, cVal;
for(number = 0; number < avx_iters; number++)
{
aVal = _mm256_loadu_si256((__m256i*)aPtr);
bVal = _mm256_loadu_si256((__m256i*)bPtr);
cVal = _mm256_add_epi8(aVal, bVal);
_mm256_storeu_si256((__m256i*)cPtr, cVal); // Store the results back into the C container
aPtr += 32;
bPtr += 32;
cPtr += 32;
}
for(i = avx_iters * 32; i < num_points; ++i)
{
*cPtr++ = (*aPtr++) + (*bPtr++);
}
}
#endif /* LV_HAVE_SSE2 */
#ifdef LV_HAVE_GENERIC
static inline void volk_gnsssdr_8i_x2_add_8i_generic(char* cVector, const char* aVector, const char* bVector, unsigned int num_points)
@ -147,6 +183,42 @@ static inline void volk_gnsssdr_8i_x2_add_8i_a_sse2(char* cVector, const char* a
#endif /* LV_HAVE_SSE2 */
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
static inline void volk_gnsssdr_8i_x2_add_8i_a_avx2(char* cVector, const char* aVector, const char* bVector, unsigned int num_points)
{
const unsigned int avx_iters = num_points / 32;
unsigned int number;
unsigned int i;
char* cPtr = cVector;
const char* aPtr = aVector;
const char* bPtr = bVector;
__m256i aVal, bVal, cVal;
for(number = 0; number < avx_iters; number++)
{
aVal = _mm256_load_si256((__m256i*)aPtr);
bVal = _mm256_load_si256((__m256i*)bPtr);
cVal = _mm256_add_epi8(aVal, bVal);
_mm256_store_si256((__m256i*)cPtr, cVal); // Store the results back into the C container
aPtr += 32;
bPtr += 32;
cPtr += 32;
}
for(i = avx_iters * 32; i < num_points; ++i)
{
*cPtr++ = (*aPtr++) + (*bPtr++);
}
}
#endif /* LV_HAVE_SSE2 */
#ifdef LV_HAVE_ORC
extern void volk_gnsssdr_8i_x2_add_8i_a_orc_impl(char* cVector, const char* aVector, const char* bVector, unsigned int num_points);

View File

@ -59,6 +59,37 @@
#include <volk_gnsssdr/volk_gnsssdr_complex.h>
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
static inline void volk_gnsssdr_8ic_conjugate_8ic_u_avx2(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points)
{
const unsigned int avx2_iters = num_points / 16;
unsigned int i;
lv_8sc_t* c = cVector;
const lv_8sc_t* a = aVector;
__m256i tmp;
__m256i conjugator = _mm256_setr_epi8(1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1);
for (i = 0; i < avx2_iters; ++i)
{
tmp = _mm256_loadu_si256((__m256i*)a);
tmp = _mm256_sign_epi8(tmp, conjugator);
_mm256_storeu_si256((__m256i*)c, tmp);
a += 16;
c += 16;
}
for (i = avx2_iters * 16; i < num_points; ++i)
{
*c++ = lv_conj(*a++);
}
}
#endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_AVX
#include <immintrin.h>
@ -217,6 +248,37 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_a_avx(lv_8sc_t* cVector, const
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
static inline void volk_gnsssdr_8ic_conjugate_8ic_a_avx2(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points)
{
const unsigned int avx2_iters = num_points / 16;
unsigned int i;
lv_8sc_t* c = cVector;
const lv_8sc_t* a = aVector;
__m256i tmp;
__m256i conjugator = _mm256_setr_epi8(1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1);
for (i = 0; i < avx2_iters; ++i)
{
tmp = _mm256_load_si256((__m256i*)a);
tmp = _mm256_sign_epi8(tmp, conjugator);
_mm256_store_si256((__m256i*)c, tmp);
a += 16;
c += 16;
}
for (i = avx2_iters * 16; i < num_points; ++i)
{
*c++ = lv_conj(*a++);
}
}
#endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_SSSE3
#include <tmmintrin.h>

View File

@ -58,6 +58,56 @@
#ifndef INCLUDED_volk_gnsssdr_8u_x2_multiply_8u_H
#define INCLUDED_volk_gnsssdr_8u_x2_multiply_8u_H
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
static inline void volk_gnsssdr_8u_x2_multiply_8u_u_avx2(unsigned char* cChar, const unsigned char* aChar, const unsigned char* bChar, unsigned int num_points)
{
const unsigned int avx2_iters = num_points / 32;
unsigned int number;
unsigned int i;
__m256i x, y, x1, x2, y1, y2, mult1, x1_mult_y1, x2_mult_y2, tmp, tmp1, tmp2, totalc;
unsigned char* c = cChar;
const unsigned char* a = aChar;
const unsigned char* b = bChar;
for(number = 0; number < avx2_iters; number++)
{
x = _mm256_loadu_si256((__m256i*)a);
y = _mm256_loadu_si256((__m256i*)b);
mult1 = _mm256_set_epi8(0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF);
x1 = _mm256_srli_si256(x, 1);
x1 = _mm256_and_si256(x1, mult1);
x2 = _mm256_and_si256(x, mult1);
y1 = _mm256_srli_si256(y, 1);
y1 = _mm256_and_si256(y1, mult1);
y2 = _mm256_and_si256(y, mult1);
x1_mult_y1 = _mm256_mullo_epi16(x1, y1);
x2_mult_y2 = _mm256_mullo_epi16(x2, y2);
tmp = _mm256_and_si256(x1_mult_y1, mult1);
tmp1 = _mm256_slli_si256(tmp, 1);
tmp2 = _mm256_and_si256(x2_mult_y2, mult1);
totalc = _mm256_or_si256(tmp1, tmp2);
_mm256_storeu_si256((__m256i*)c, totalc);
a += 32;
b += 32;
c += 32;
}
for (i = avx2_iters * 32; i < num_points ; ++i)
{
*c++ = (*a++) * (*b++);
}
}
#endif /* LV_HAVE_SSE3 */
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
@ -176,6 +226,57 @@ static inline void volk_gnsssdr_8u_x2_multiply_8u_a_sse3(unsigned char* cChar, c
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
static inline void volk_gnsssdr_8u_x2_multiply_8u_a_avx2(unsigned char* cChar, const unsigned char* aChar, const unsigned char* bChar, unsigned int num_points)
{
const unsigned int avx2_iters = num_points / 32;
unsigned int number;
unsigned int i;
__m256i x, y, x1, x2, y1, y2, mult1, x1_mult_y1, x2_mult_y2, tmp, tmp1, tmp2, totalc;
unsigned char* c = cChar;
const unsigned char* a = aChar;
const unsigned char* b = bChar;
for(number = 0; number < avx2_iters; number++)
{
x = _mm256_load_si256((__m256i*)a);
y = _mm256_load_si256((__m256i*)b);
mult1 = _mm256_set_epi8(0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF);
x1 = _mm256_srli_si256(x, 1);
x1 = _mm256_and_si256(x1, mult1);
x2 = _mm256_and_si256(x, mult1);
y1 = _mm256_srli_si256(y, 1);
y1 = _mm256_and_si256(y1, mult1);
y2 = _mm256_and_si256(y, mult1);
x1_mult_y1 = _mm256_mullo_epi16(x1, y1);
x2_mult_y2 = _mm256_mullo_epi16(x2, y2);
tmp = _mm256_and_si256(x1_mult_y1, mult1);
tmp1 = _mm256_slli_si256(tmp, 1);
tmp2 = _mm256_and_si256(x2_mult_y2, mult1);
totalc = _mm256_or_si256(tmp1, tmp2);
_mm256_store_si256((__m256i*)c, totalc);
a += 32;
b += 32;
c += 32;
}
for (i = avx2_iters * 32; i < num_points ; ++i)
{
*c++ = (*a++) * (*b++);
}
}
#endif /* LV_HAVE_SSE3 */
#ifdef LV_HAVE_ORC
extern void volk_gnsssdr_8u_x2_multiply_8u_a_orc_impl(unsigned char* cVector, const unsigned char* aVector, const unsigned char* bVector, unsigned int num_points);