mirror of
https://github.com/gnss-sdr/gnss-sdr
synced 2025-04-06 18:57:02 +00:00
Merge branch 'next' of https://github.com/gnss-sdr/gnss-sdr into next
This commit is contained in:
commit
8a11105375
@ -1,5 +1,5 @@
|
||||
FIND_PACKAGE(PkgConfig)
|
||||
PKG_CHECK_MODULES(PC_ORC "orc-0.4 > 0.4.11")
|
||||
PKG_CHECK_MODULES(PC_ORC "orc-0.4 > 0.4.22")
|
||||
|
||||
FIND_PROGRAM(ORCC_EXECUTABLE orcc
|
||||
HINTS ${PC_ORC_TOOLSDIR}
|
||||
|
@ -1,5 +1,5 @@
|
||||
FIND_PACKAGE(PkgConfig)
|
||||
PKG_CHECK_MODULES(PC_ORC "orc-0.4 > 0.4.11")
|
||||
PKG_CHECK_MODULES(PC_ORC "orc-0.4 > 0.4.22")
|
||||
|
||||
|
||||
|
||||
|
@ -88,6 +88,7 @@
|
||||
<param>12</param>
|
||||
</check>
|
||||
<flag compiler="gnu">-mfma</flag>
|
||||
<flag compiler="msvc">/arch:AVX2</flag>
|
||||
<alignment>32</alignment>
|
||||
</arch>
|
||||
|
||||
|
@ -14,23 +14,23 @@
|
||||
</machine>
|
||||
|
||||
<machine name="sse3">
|
||||
<archs>generic 32|64 mmx sse sse2 sse3 orc|</archs>
|
||||
<archs>generic 32|64| mmx| sse sse2 sse3 orc|</archs>
|
||||
</machine>
|
||||
|
||||
<machine name="ssse3">
|
||||
<archs>generic 32|64 mmx sse sse2 sse3 ssse3 orc|</archs>
|
||||
<archs>generic 32|64| mmx| sse sse2 sse3 ssse3 orc|</archs>
|
||||
</machine>
|
||||
|
||||
<machine name="sse4_a">
|
||||
<archs>generic 32|64 mmx sse sse2 sse3 sse4_a popcount orc|</archs>
|
||||
<archs>generic 32|64| mmx| sse sse2 sse3 sse4_a popcount orc|</archs>
|
||||
</machine>
|
||||
|
||||
<machine name="sse4_1">
|
||||
<archs>generic 32|64 mmx sse sse2 sse3 ssse3 sse4_1 orc|</archs>
|
||||
<archs>generic 32|64| mmx| sse sse2 sse3 ssse3 sse4_1 orc|</archs>
|
||||
</machine>
|
||||
|
||||
<machine name="sse4_2">
|
||||
<archs>generic 32|64 mmx sse sse2 sse3 ssse3 sse4_1 sse4_2 popcount orc|</archs>
|
||||
<archs>generic 32|64| mmx| sse sse2 sse3 ssse3 sse4_1 sse4_2 popcount orc|</archs>
|
||||
</machine>
|
||||
|
||||
<!-- trailing | bar means generate without either for MSVC -->
|
||||
|
@ -100,7 +100,7 @@ static inline void volk_gnsssdr_16ic_resampler_16ic_a_sse2(lv_16sc_t* result, co
|
||||
|
||||
lv_16sc_t* _result = result;
|
||||
|
||||
__attribute__((aligned(16))) int local_code_chip_index[4];
|
||||
__VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4];
|
||||
__m128 _rem_code_phase, _code_phase_step_chips;
|
||||
__m128i _code_length_chips, _code_length_chips_minus1;
|
||||
__m128 _code_phase_out, _code_phase_out_with_offset;
|
||||
@ -108,13 +108,13 @@ static inline void volk_gnsssdr_16ic_resampler_16ic_a_sse2(lv_16sc_t* result, co
|
||||
|
||||
_rem_code_phase = _mm_load1_ps(&rem_code_phase_chips); //load float to all four float values in m128 register
|
||||
_code_phase_step_chips = _mm_load1_ps(&code_phase_step_chips); //load float to all four float values in m128 register
|
||||
__attribute__((aligned(16))) int four_times_code_length_chips_minus1[4];
|
||||
__VOLK_ATTR_ALIGNED(16) int four_times_code_length_chips_minus1[4];
|
||||
four_times_code_length_chips_minus1[0] = code_length_chips-1;
|
||||
four_times_code_length_chips_minus1[1] = code_length_chips-1;
|
||||
four_times_code_length_chips_minus1[2] = code_length_chips-1;
|
||||
four_times_code_length_chips_minus1[3] = code_length_chips-1;
|
||||
|
||||
__attribute__((aligned(16))) int four_times_code_length_chips[4];
|
||||
__VOLK_ATTR_ALIGNED(16) int four_times_code_length_chips[4];
|
||||
four_times_code_length_chips[0] = code_length_chips;
|
||||
four_times_code_length_chips[1] = code_length_chips;
|
||||
four_times_code_length_chips[2] = code_length_chips;
|
||||
@ -127,9 +127,9 @@ static inline void volk_gnsssdr_16ic_resampler_16ic_a_sse2(lv_16sc_t* result, co
|
||||
|
||||
__m128i zero = _mm_setzero_si128();
|
||||
|
||||
__attribute__((aligned(16))) float init_idx_float[4] = { 0.0f, 1.0f, 2.0f, 3.0f };
|
||||
__VOLK_ATTR_ALIGNED(16) float init_idx_float[4] = { 0.0f, 1.0f, 2.0f, 3.0f };
|
||||
__m128 _4output_index = _mm_load_ps(init_idx_float);
|
||||
__attribute__((aligned(16))) float init_4constant_float[4] = { 4.0f, 4.0f, 4.0f, 4.0f };
|
||||
__VOLK_ATTR_ALIGNED(16) float init_4constant_float[4] = { 4.0f, 4.0f, 4.0f, 4.0f };
|
||||
__m128 _4constant_float = _mm_load_ps(init_4constant_float);
|
||||
|
||||
|
||||
@ -183,7 +183,7 @@ static inline void volk_gnsssdr_16ic_resampler_16ic_u_sse2(lv_16sc_t* result, co
|
||||
|
||||
lv_16sc_t* _result = result;
|
||||
|
||||
__attribute__((aligned(16))) int local_code_chip_index[4];
|
||||
__VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4];
|
||||
__m128 _rem_code_phase, _code_phase_step_chips;
|
||||
__m128i _code_length_chips, _code_length_chips_minus1;
|
||||
__m128 _code_phase_out, _code_phase_out_with_offset;
|
||||
@ -191,13 +191,13 @@ static inline void volk_gnsssdr_16ic_resampler_16ic_u_sse2(lv_16sc_t* result, co
|
||||
|
||||
_rem_code_phase = _mm_load1_ps(&rem_code_phase_chips); //load float to all four float values in m128 register
|
||||
_code_phase_step_chips = _mm_load1_ps(&code_phase_step_chips); //load float to all four float values in m128 register
|
||||
__attribute__((aligned(16))) int four_times_code_length_chips_minus1[4];
|
||||
__VOLK_ATTR_ALIGNED(16) int four_times_code_length_chips_minus1[4];
|
||||
four_times_code_length_chips_minus1[0] = code_length_chips-1;
|
||||
four_times_code_length_chips_minus1[1] = code_length_chips-1;
|
||||
four_times_code_length_chips_minus1[2] = code_length_chips-1;
|
||||
four_times_code_length_chips_minus1[3] = code_length_chips-1;
|
||||
|
||||
__attribute__((aligned(16))) int four_times_code_length_chips[4];
|
||||
__VOLK_ATTR_ALIGNED(16) int four_times_code_length_chips[4];
|
||||
four_times_code_length_chips[0] = code_length_chips;
|
||||
four_times_code_length_chips[1] = code_length_chips;
|
||||
four_times_code_length_chips[2] = code_length_chips;
|
||||
@ -210,9 +210,9 @@ static inline void volk_gnsssdr_16ic_resampler_16ic_u_sse2(lv_16sc_t* result, co
|
||||
|
||||
__m128i zero = _mm_setzero_si128();
|
||||
|
||||
__attribute__((aligned(16))) float init_idx_float[4] = { 0.0f, 1.0f, 2.0f, 3.0f };
|
||||
__VOLK_ATTR_ALIGNED(16) float init_idx_float[4] = { 0.0f, 1.0f, 2.0f, 3.0f };
|
||||
__m128 _4output_index = _mm_loadu_ps(init_idx_float);
|
||||
__attribute__((aligned(16))) float init_4constant_float[4] = { 4.0f, 4.0f, 4.0f, 4.0f };
|
||||
__VOLK_ATTR_ALIGNED(16) float init_4constant_float[4] = { 4.0f, 4.0f, 4.0f, 4.0f };
|
||||
__m128 _4constant_float = _mm_loadu_ps(init_4constant_float);
|
||||
|
||||
for(number = 0; number < quarterPoints; number++)
|
||||
@ -265,7 +265,7 @@ static inline void volk_gnsssdr_16ic_resampler_16ic_neon(lv_16sc_t* result, cons
|
||||
|
||||
lv_16sc_t* _result = result;
|
||||
|
||||
__attribute__((aligned(16))) int local_code_chip_index[4];
|
||||
__VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4];
|
||||
float32x4_t _rem_code_phase, _code_phase_step_chips;
|
||||
int32x4_t _code_length_chips, _code_length_chips_minus1;
|
||||
float32x4_t _code_phase_out, _code_phase_out_with_offset;
|
||||
@ -274,13 +274,13 @@ static inline void volk_gnsssdr_16ic_resampler_16ic_neon(lv_16sc_t* result, cons
|
||||
|
||||
_rem_code_phase = vld1q_dup_f32(&rem_code_phase_chips); //load float to all four float values in m128 register
|
||||
_code_phase_step_chips = vld1q_dup_f32(&code_phase_step_chips); //load float to all four float values in m128 register
|
||||
__attribute__((aligned(16))) int four_times_code_length_chips_minus1[4];
|
||||
__VOLK_ATTR_ALIGNED(16) int four_times_code_length_chips_minus1[4];
|
||||
four_times_code_length_chips_minus1[0] = code_length_chips - 1;
|
||||
four_times_code_length_chips_minus1[1] = code_length_chips - 1;
|
||||
four_times_code_length_chips_minus1[2] = code_length_chips - 1;
|
||||
four_times_code_length_chips_minus1[3] = code_length_chips - 1;
|
||||
|
||||
__attribute__((aligned(16))) int four_times_code_length_chips[4];
|
||||
__VOLK_ATTR_ALIGNED(16) int four_times_code_length_chips[4];
|
||||
four_times_code_length_chips[0] = code_length_chips;
|
||||
four_times_code_length_chips[1] = code_length_chips;
|
||||
four_times_code_length_chips[2] = code_length_chips;
|
||||
@ -293,9 +293,9 @@ static inline void volk_gnsssdr_16ic_resampler_16ic_neon(lv_16sc_t* result, cons
|
||||
uint32x4_t negative_indexes, overflow_indexes;
|
||||
int32x4_t zero = vmovq_n_s32(0);
|
||||
|
||||
__attribute__((aligned(16))) float init_idx_float[4] = { 0.0f, 1.0f, 2.0f, 3.0f };
|
||||
__VOLK_ATTR_ALIGNED(16) float init_idx_float[4] = { 0.0f, 1.0f, 2.0f, 3.0f };
|
||||
float32x4_t _4output_index = vld1q_f32(init_idx_float);
|
||||
__attribute__((aligned(16))) float init_4constant_float[4] = { 4.0f, 4.0f, 4.0f, 4.0f };
|
||||
__VOLK_ATTR_ALIGNED(16) float init_4constant_float[4] = { 4.0f, 4.0f, 4.0f, 4.0f };
|
||||
float32x4_t _4constant_float = vld1q_f32(init_4constant_float);
|
||||
|
||||
for(number = 0; number < quarterPoints; number++)
|
||||
|
@ -0,0 +1,282 @@
|
||||
/*!
|
||||
* \file volk_gnsssdr_16ic_resamplerxnpuppet2_16ic.h
|
||||
* \brief VOLK_GNSSSDR puppet for the multiple 16-bit complex vector resampler kernel.
|
||||
* \authors <ul>
|
||||
* <li> Carles Fernandez Prades 2016 cfernandez at cttc dot cat
|
||||
* </ul>
|
||||
*
|
||||
* VOLK_GNSSSDR puppet for integrating the multiple resampler into the test system
|
||||
*
|
||||
* -------------------------------------------------------------------------
|
||||
*
|
||||
* Copyright (C) 2010-2015 (see AUTHORS file for a list of contributors)
|
||||
*
|
||||
* GNSS-SDR is a software defined Global Navigation
|
||||
* Satellite Systems receiver
|
||||
*
|
||||
* This file is part of GNSS-SDR.
|
||||
*
|
||||
* GNSS-SDR is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* GNSS-SDR is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* -------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
#ifndef INCLUDED_volk_gnsssdr_16ic_resamplerxnpuppet2_16ic_H
|
||||
#define INCLUDED_volk_gnsssdr_16ic_resamplerxnpuppet2_16ic_H
|
||||
|
||||
#include "volk_gnsssdr/volk_gnsssdr_16ic_xn_resampler2_16ic_xn.h"
|
||||
#include <volk_gnsssdr/volk_gnsssdr_malloc.h>
|
||||
#include <volk_gnsssdr/volk_gnsssdr_complex.h>
|
||||
#include <volk_gnsssdr/volk_gnsssdr.h>
|
||||
#include <string.h>
|
||||
|
||||
#ifdef LV_HAVE_GENERIC
|
||||
/*!
 * Puppet around volk_gnsssdr_16ic_xn_resampler2_16ic_xn_generic: runs the
 * multi-output resampler with a fixed three-output configuration and copies
 * the first output vector (num_points samples) into \p result.
 */
static inline void volk_gnsssdr_16ic_resamplerxnpuppet2_16ic_generic(lv_16sc_t* result, const lv_16sc_t* local_code, unsigned int num_points)
{
    // Fixed resampling scenario handed to the kernel.
    int n_taps = 3;
    int chips_per_code = 2046;
    float chip_step = 0.6;
    float residual_phase = -0.234;
    float tap_shifts[3] = { -0.1, 0.0, 0.1 };
    unsigned int tap;

    // Table of output pointers, then one num_points-sample buffer per output vector.
    lv_16sc_t** outputs = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * n_taps, volk_gnsssdr_get_alignment());
    tap = 0;
    while (tap < n_taps)
        {
            outputs[tap] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
            tap++;
        }

    volk_gnsssdr_16ic_xn_resampler2_16ic_xn_generic(outputs, local_code, residual_phase, chip_step, tap_shifts, chips_per_code, n_taps, num_points);

    // The puppet signature exposes a single output, so only the first vector is handed back.
    memcpy(result, outputs[0], sizeof(lv_16sc_t) * num_points);

    // Release the per-output buffers first, then the pointer table itself.
    for (tap = 0; tap < n_taps; tap++) volk_gnsssdr_free(outputs[tap]);
    volk_gnsssdr_free(outputs);
}
|
||||
|
||||
#endif /* LV_HAVE_GENERIC */
|
||||
|
||||
|
||||
#ifdef LV_HAVE_SSE3
|
||||
/*!
 * Puppet around volk_gnsssdr_16ic_xn_resampler2_16ic_xn_a_sse3: runs the
 * multi-output resampler with a fixed three-output configuration and copies
 * the first output vector (num_points samples) into \p result.
 */
static inline void volk_gnsssdr_16ic_resamplerxnpuppet2_16ic_a_sse3(lv_16sc_t* result, const lv_16sc_t* local_code, unsigned int num_points)
{
    // Fixed resampling scenario handed to the kernel.
    int n_taps = 3;
    int chips_per_code = 2046;
    float chip_step = 0.6;
    float residual_phase = -0.234;
    float tap_shifts[3] = { -0.1, 0.0, 0.1 };
    unsigned int tap;

    // Table of output pointers, then one num_points-sample buffer per output vector.
    lv_16sc_t** outputs = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * n_taps, volk_gnsssdr_get_alignment());
    tap = 0;
    while (tap < n_taps)
        {
            outputs[tap] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
            tap++;
        }

    volk_gnsssdr_16ic_xn_resampler2_16ic_xn_a_sse3(outputs, local_code, residual_phase, chip_step, tap_shifts, chips_per_code, n_taps, num_points);

    // The puppet signature exposes a single output, so only the first vector is handed back.
    memcpy(result, outputs[0], sizeof(lv_16sc_t) * num_points);

    // Release the per-output buffers first, then the pointer table itself.
    for (tap = 0; tap < n_taps; tap++) volk_gnsssdr_free(outputs[tap]);
    volk_gnsssdr_free(outputs);
}
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef LV_HAVE_SSE3
|
||||
/*!
 * Puppet around volk_gnsssdr_16ic_xn_resampler2_16ic_xn_u_sse3: runs the
 * multi-output resampler with a fixed three-output configuration and copies
 * the first output vector (num_points samples) into \p result.
 */
static inline void volk_gnsssdr_16ic_resamplerxnpuppet2_16ic_u_sse3(lv_16sc_t* result, const lv_16sc_t* local_code, unsigned int num_points)
{
    // Fixed resampling scenario handed to the kernel.
    int n_taps = 3;
    int chips_per_code = 2046;
    float chip_step = 0.6;
    float residual_phase = -0.234;
    float tap_shifts[3] = { -0.1, 0.0, 0.1 };
    unsigned int tap;

    // Table of output pointers, then one num_points-sample buffer per output vector.
    lv_16sc_t** outputs = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * n_taps, volk_gnsssdr_get_alignment());
    tap = 0;
    while (tap < n_taps)
        {
            outputs[tap] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
            tap++;
        }

    volk_gnsssdr_16ic_xn_resampler2_16ic_xn_u_sse3(outputs, local_code, residual_phase, chip_step, tap_shifts, chips_per_code, n_taps, num_points);

    // The puppet signature exposes a single output, so only the first vector is handed back.
    memcpy(result, outputs[0], sizeof(lv_16sc_t) * num_points);

    // Release the per-output buffers first, then the pointer table itself.
    for (tap = 0; tap < n_taps; tap++) volk_gnsssdr_free(outputs[tap]);
    volk_gnsssdr_free(outputs);
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef LV_HAVE_SSE4_1
|
||||
/*!
 * Puppet around volk_gnsssdr_16ic_xn_resampler2_16ic_xn_u_sse4_1: runs the
 * multi-output resampler with a fixed three-output configuration and copies
 * the first output vector (num_points samples) into \p result.
 */
static inline void volk_gnsssdr_16ic_resamplerxnpuppet2_16ic_u_sse4_1(lv_16sc_t* result, const lv_16sc_t* local_code, unsigned int num_points)
{
    // Fixed resampling scenario handed to the kernel.
    int n_taps = 3;
    int chips_per_code = 2046;
    float chip_step = 0.6;
    float residual_phase = -0.234;
    float tap_shifts[3] = { -0.1, 0.0, 0.1 };
    unsigned int tap;

    // Table of output pointers, then one num_points-sample buffer per output vector.
    lv_16sc_t** outputs = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * n_taps, volk_gnsssdr_get_alignment());
    tap = 0;
    while (tap < n_taps)
        {
            outputs[tap] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
            tap++;
        }

    volk_gnsssdr_16ic_xn_resampler2_16ic_xn_u_sse4_1(outputs, local_code, residual_phase, chip_step, tap_shifts, chips_per_code, n_taps, num_points);

    // The puppet signature exposes a single output, so only the first vector is handed back.
    memcpy(result, outputs[0], sizeof(lv_16sc_t) * num_points);

    // Release the per-output buffers first, then the pointer table itself.
    for (tap = 0; tap < n_taps; tap++) volk_gnsssdr_free(outputs[tap]);
    volk_gnsssdr_free(outputs);
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef LV_HAVE_SSE4_1
|
||||
/*!
 * Puppet around volk_gnsssdr_16ic_xn_resampler2_16ic_xn_a_sse4_1: runs the
 * multi-output resampler with a fixed three-output configuration and copies
 * the first output vector (num_points samples) into \p result.
 */
static inline void volk_gnsssdr_16ic_resamplerxnpuppet2_16ic_a_sse4_1(lv_16sc_t* result, const lv_16sc_t* local_code, unsigned int num_points)
{
    // Fixed resampling scenario handed to the kernel.
    int n_taps = 3;
    int chips_per_code = 2046;
    float chip_step = 0.6;
    float residual_phase = -0.234;
    float tap_shifts[3] = { -0.1, 0.0, 0.1 };
    unsigned int tap;

    // Table of output pointers, then one num_points-sample buffer per output vector.
    lv_16sc_t** outputs = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * n_taps, volk_gnsssdr_get_alignment());
    tap = 0;
    while (tap < n_taps)
        {
            outputs[tap] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
            tap++;
        }

    volk_gnsssdr_16ic_xn_resampler2_16ic_xn_a_sse4_1(outputs, local_code, residual_phase, chip_step, tap_shifts, chips_per_code, n_taps, num_points);

    // The puppet signature exposes a single output, so only the first vector is handed back.
    memcpy(result, outputs[0], sizeof(lv_16sc_t) * num_points);

    // Release the per-output buffers first, then the pointer table itself.
    for (tap = 0; tap < n_taps; tap++) volk_gnsssdr_free(outputs[tap]);
    volk_gnsssdr_free(outputs);
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef LV_HAVE_AVX
|
||||
/*!
 * Puppet around volk_gnsssdr_16ic_xn_resampler2_16ic_xn_u_avx: runs the
 * multi-output resampler with a fixed three-output configuration and copies
 * the first output vector (num_points samples) into \p result.
 */
static inline void volk_gnsssdr_16ic_resamplerxnpuppet2_16ic_u_avx(lv_16sc_t* result, const lv_16sc_t* local_code, unsigned int num_points)
{
    // Fixed resampling scenario handed to the kernel.
    int n_taps = 3;
    int chips_per_code = 2046;
    float chip_step = 0.6;
    float residual_phase = -0.234;
    float tap_shifts[3] = { -0.1, 0.0, 0.1 };
    unsigned int tap;

    // Table of output pointers, then one num_points-sample buffer per output vector.
    lv_16sc_t** outputs = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * n_taps, volk_gnsssdr_get_alignment());
    tap = 0;
    while (tap < n_taps)
        {
            outputs[tap] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
            tap++;
        }

    volk_gnsssdr_16ic_xn_resampler2_16ic_xn_u_avx(outputs, local_code, residual_phase, chip_step, tap_shifts, chips_per_code, n_taps, num_points);

    // The puppet signature exposes a single output, so only the first vector is handed back.
    memcpy(result, outputs[0], sizeof(lv_16sc_t) * num_points);

    // Release the per-output buffers first, then the pointer table itself.
    for (tap = 0; tap < n_taps; tap++) volk_gnsssdr_free(outputs[tap]);
    volk_gnsssdr_free(outputs);
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef LV_HAVE_AVX
|
||||
/*!
 * Puppet around volk_gnsssdr_16ic_xn_resampler2_16ic_xn_a_avx: runs the
 * multi-output resampler with a fixed three-output configuration and copies
 * the first output vector (num_points samples) into \p result.
 */
static inline void volk_gnsssdr_16ic_resamplerxnpuppet2_16ic_a_avx(lv_16sc_t* result, const lv_16sc_t* local_code, unsigned int num_points)
{
    // Fixed resampling scenario handed to the kernel.
    int n_taps = 3;
    int chips_per_code = 2046;
    float chip_step = 0.6;
    float residual_phase = -0.234;
    float tap_shifts[3] = { -0.1, 0.0, 0.1 };
    unsigned int tap;

    // Table of output pointers, then one num_points-sample buffer per output vector.
    lv_16sc_t** outputs = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * n_taps, volk_gnsssdr_get_alignment());
    tap = 0;
    while (tap < n_taps)
        {
            outputs[tap] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
            tap++;
        }

    volk_gnsssdr_16ic_xn_resampler2_16ic_xn_a_avx(outputs, local_code, residual_phase, chip_step, tap_shifts, chips_per_code, n_taps, num_points);

    // The puppet signature exposes a single output, so only the first vector is handed back.
    memcpy(result, outputs[0], sizeof(lv_16sc_t) * num_points);

    // Release the per-output buffers first, then the pointer table itself.
    for (tap = 0; tap < n_taps; tap++) volk_gnsssdr_free(outputs[tap]);
    volk_gnsssdr_free(outputs);
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef LV_HAVE_NEON
|
||||
/*!
 * Puppet around volk_gnsssdr_16ic_xn_resampler2_16ic_xn_neon: runs the
 * multi-output resampler with a fixed three-output configuration and copies
 * the first output vector (num_points samples) into \p result.
 */
static inline void volk_gnsssdr_16ic_resamplerxnpuppet2_16ic_neon(lv_16sc_t* result, const lv_16sc_t* local_code, unsigned int num_points)
{
    // Fixed resampling scenario handed to the kernel.
    int n_taps = 3;
    int chips_per_code = 2046;
    float chip_step = 0.6;
    float residual_phase = -0.234;
    float tap_shifts[3] = { -0.1, 0.0, 0.1 };
    unsigned int tap;

    // Table of output pointers, then one num_points-sample buffer per output vector.
    lv_16sc_t** outputs = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * n_taps, volk_gnsssdr_get_alignment());
    tap = 0;
    while (tap < n_taps)
        {
            outputs[tap] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
            tap++;
        }

    volk_gnsssdr_16ic_xn_resampler2_16ic_xn_neon(outputs, local_code, residual_phase, chip_step, tap_shifts, chips_per_code, n_taps, num_points);

    // The puppet signature exposes a single output, so only the first vector is handed back.
    memcpy(result, outputs[0], sizeof(lv_16sc_t) * num_points);

    // Release the per-output buffers first, then the pointer table itself.
    for (tap = 0; tap < n_taps; tap++) volk_gnsssdr_free(outputs[tap]);
    volk_gnsssdr_free(outputs);
}
|
||||
|
||||
#endif
|
||||
|
||||
#endif // INCLUDED_volk_gnsssdr_16ic_resamplerxnpuppet2_16ic_H
|
@ -141,11 +141,11 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3(lv_16sc_t* out
|
||||
const unsigned int sse_iters = num_points / 4;
|
||||
__m128 a, b, two_phase_acc_reg, two_phase_inc_reg;
|
||||
__m128i c1, c2, result;
|
||||
__attribute__((aligned(16))) lv_32fc_t two_phase_inc[2];
|
||||
__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_inc[2];
|
||||
two_phase_inc[0] = phase_inc * phase_inc;
|
||||
two_phase_inc[1] = phase_inc * phase_inc;
|
||||
two_phase_inc_reg = _mm_load_ps((float*) two_phase_inc);
|
||||
__attribute__((aligned(16))) lv_32fc_t two_phase_acc[2];
|
||||
__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_acc[2];
|
||||
two_phase_acc[0] = (*phase);
|
||||
two_phase_acc[1] = (*phase) * phase_inc;
|
||||
two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc);
|
||||
@ -243,11 +243,11 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3_reload(lv_16sc
|
||||
const unsigned int ROTATOR_RELOAD = 512;
|
||||
__m128 a, b, two_phase_acc_reg, two_phase_inc_reg;
|
||||
__m128i c1, c2, result;
|
||||
__attribute__((aligned(16))) lv_32fc_t two_phase_inc[2];
|
||||
__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_inc[2];
|
||||
two_phase_inc[0] = phase_inc * phase_inc;
|
||||
two_phase_inc[1] = phase_inc * phase_inc;
|
||||
two_phase_inc_reg = _mm_load_ps((float*) two_phase_inc);
|
||||
__attribute__((aligned(16))) lv_32fc_t two_phase_acc[2];
|
||||
__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_acc[2];
|
||||
two_phase_acc[0] = (*phase);
|
||||
two_phase_acc[1] = (*phase) * phase_inc;
|
||||
two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc);
|
||||
@ -393,11 +393,11 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3(lv_16sc_t* out
|
||||
const unsigned int sse_iters = num_points / 4;
|
||||
__m128 a, b, two_phase_acc_reg, two_phase_inc_reg;
|
||||
__m128i c1, c2, result;
|
||||
__attribute__((aligned(16))) lv_32fc_t two_phase_inc[2];
|
||||
__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_inc[2];
|
||||
two_phase_inc[0] = phase_inc * phase_inc;
|
||||
two_phase_inc[1] = phase_inc * phase_inc;
|
||||
two_phase_inc_reg = _mm_load_ps((float*) two_phase_inc);
|
||||
__attribute__((aligned(16))) lv_32fc_t two_phase_acc[2];
|
||||
__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_acc[2];
|
||||
two_phase_acc[0] = (*phase);
|
||||
two_phase_acc[1] = (*phase) * phase_inc;
|
||||
two_phase_acc_reg = _mm_load_ps((float*) two_phase_acc);
|
||||
@ -494,11 +494,11 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3_reload(lv_16sc
|
||||
unsigned int ROTATOR_RELOAD = 512;
|
||||
__m128 a, b, two_phase_acc_reg, two_phase_inc_reg;
|
||||
__m128i c1, c2, result;
|
||||
__attribute__((aligned(16))) lv_32fc_t two_phase_inc[2];
|
||||
__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_inc[2];
|
||||
two_phase_inc[0] = phase_inc * phase_inc;
|
||||
two_phase_inc[1] = phase_inc * phase_inc;
|
||||
two_phase_inc_reg = _mm_load_ps((float*) two_phase_inc);
|
||||
__attribute__((aligned(16))) lv_32fc_t two_phase_acc[2];
|
||||
__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_acc[2];
|
||||
two_phase_acc[0] = (*phase);
|
||||
two_phase_acc[1] = (*phase) * phase_inc;
|
||||
two_phase_acc_reg = _mm_load_ps((float*) two_phase_acc);
|
||||
|
@ -203,11 +203,11 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc_
|
||||
// phase rotation registers
|
||||
__m128 pa, pb, two_phase_acc_reg, two_phase_inc_reg;
|
||||
__m128i pc1, pc2;
|
||||
__attribute__((aligned(16))) lv_32fc_t two_phase_inc[2];
|
||||
__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_inc[2];
|
||||
two_phase_inc[0] = phase_inc * phase_inc;
|
||||
two_phase_inc[1] = phase_inc * phase_inc;
|
||||
two_phase_inc_reg = _mm_load_ps((float*) two_phase_inc);
|
||||
__attribute__((aligned(16))) lv_32fc_t two_phase_acc[2];
|
||||
__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_acc[2];
|
||||
two_phase_acc[0] = (*phase);
|
||||
two_phase_acc[1] = (*phase) * phase_inc;
|
||||
two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc);
|
||||
@ -376,11 +376,11 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(l
|
||||
// phase rotation registers
|
||||
__m128 pa, pb, two_phase_acc_reg, two_phase_inc_reg;
|
||||
__m128i pc1, pc2;
|
||||
__attribute__((aligned(16))) lv_32fc_t two_phase_inc[2];
|
||||
__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_inc[2];
|
||||
two_phase_inc[0] = phase_inc * phase_inc;
|
||||
two_phase_inc[1] = phase_inc * phase_inc;
|
||||
two_phase_inc_reg = _mm_load_ps((float*) two_phase_inc);
|
||||
__attribute__((aligned(16))) lv_32fc_t two_phase_acc[2];
|
||||
__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_acc[2];
|
||||
two_phase_acc[0] = (*phase);
|
||||
two_phase_acc[1] = (*phase) * phase_inc;
|
||||
two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc);
|
||||
@ -619,11 +619,11 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc_
|
||||
// phase rotation registers
|
||||
__m128 pa, pb, two_phase_acc_reg, two_phase_inc_reg;
|
||||
__m128i pc1, pc2;
|
||||
__attribute__((aligned(16))) lv_32fc_t two_phase_inc[2];
|
||||
__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_inc[2];
|
||||
two_phase_inc[0] = phase_inc * phase_inc;
|
||||
two_phase_inc[1] = phase_inc * phase_inc;
|
||||
two_phase_inc_reg = _mm_loadu_ps((float*) two_phase_inc);
|
||||
__attribute__((aligned(16))) lv_32fc_t two_phase_acc[2];
|
||||
__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_acc[2];
|
||||
two_phase_acc[0] = (*phase);
|
||||
two_phase_acc[1] = (*phase) * phase_inc;
|
||||
two_phase_acc_reg = _mm_loadu_ps((float*)two_phase_acc);
|
||||
@ -780,11 +780,11 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc_
|
||||
|
||||
__m128 a, b, two_phase_acc_reg, two_phase_inc_reg;
|
||||
__m128i c1, c2, result1, result2;
|
||||
__attribute__((aligned(16))) lv_32fc_t two_phase_inc[2];
|
||||
__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_inc[2];
|
||||
two_phase_inc[0] = phase_inc * phase_inc;
|
||||
two_phase_inc[1] = phase_inc * phase_inc;
|
||||
two_phase_inc_reg = _mm_load_ps((float*) two_phase_inc);
|
||||
__attribute__((aligned(16))) lv_32fc_t two_phase_acc[2];
|
||||
__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_acc[2];
|
||||
two_phase_acc[0] = (*phase);
|
||||
two_phase_acc[1] = (*phase) * phase_inc;
|
||||
two_phase_acc_reg = _mm_load_ps((float*) two_phase_acc);
|
||||
@ -985,11 +985,11 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l
|
||||
|
||||
__m128 a, b, two_phase_acc_reg, two_phase_inc_reg;
|
||||
__m128i c1, c2, result1, result2;
|
||||
__attribute__((aligned(16))) lv_32fc_t two_phase_inc[2];
|
||||
__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_inc[2];
|
||||
two_phase_inc[0] = phase_inc * phase_inc;
|
||||
two_phase_inc[1] = phase_inc * phase_inc;
|
||||
two_phase_inc_reg = _mm_load_ps((float*) two_phase_inc);
|
||||
__attribute__((aligned(16))) lv_32fc_t two_phase_acc[2];
|
||||
__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_acc[2];
|
||||
two_phase_acc[0] = (*phase);
|
||||
two_phase_acc[1] = (*phase) * phase_inc;
|
||||
two_phase_acc_reg = _mm_load_ps((float*) two_phase_acc);
|
||||
|
@ -0,0 +1,583 @@
|
||||
/*!
|
||||
* \file volk_gnsssdr_16ic_xn_resampler2_16ic_xn.h
|
||||
* \brief VOLK_GNSSSDR kernel: Resamples N 16 bits integer short complex vectors using zero hold resample algorithm.
|
||||
* \authors <ul>
|
||||
* <li> Javier Arribas, 2015. jarribas(at)cttc.es
|
||||
* </ul>
|
||||
*
|
||||
* VOLK_GNSSSDR kernel that resamples N 16 bits integer short complex vectors using zero hold resample algorithm.
|
||||
* It resamples a single GNSS local code signal replica into N vectors fractional-resampled and fractional-delayed
|
||||
* (i.e. it creates the Early, Prompt, and Late code replicas)
|
||||
*
|
||||
* -------------------------------------------------------------------------
|
||||
*
|
||||
* Copyright (C) 2010-2015 (see AUTHORS file for a list of contributors)
|
||||
*
|
||||
* GNSS-SDR is a software defined Global Navigation
|
||||
* Satellite Systems receiver
|
||||
*
|
||||
* This file is part of GNSS-SDR.
|
||||
*
|
||||
* GNSS-SDR is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* GNSS-SDR is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
|
||||
*
|
||||
* -------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
/*!
|
||||
* \page volk_gnsssdr_16ic_xn_resampler2_16ic_xn
|
||||
*
|
||||
* \b Overview
|
||||
*
|
||||
* Resamples a complex vector (16-bit integer each component), providing \p num_out_vectors outputs.
|
||||
*
|
||||
* <b>Dispatcher Prototype</b>
|
||||
* \code
|
||||
* void volk_gnsssdr_16ic_xn_resampler2_16ic_xn(lv_16sc_t** result, const lv_16sc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, float* shifts_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_output_samples)
|
||||
* \endcode
|
||||
*
|
||||
* \b Inputs
|
||||
* \li local_code: Vector containing the local code replica to be resampled.
|
||||
* \li rem_code_phase_chips: Remnant code phase [chips].
|
||||
* \li code_phase_step_chips: Phase increment per sample [chips/sample].
|
||||
* \li shifts_chips: Vector of floats that defines the spacing (in chips) between the replicas of \p local_code
|
||||
* \li code_length_chips: Code length in chips.
|
||||
* \li num_out_vectors: Number of output vectors.
|
||||
* \li num_output_samples: The number of data values to be in the resampled vector.
|
||||
*
|
||||
* \b Outputs
|
||||
* \li result: Pointer to a vector of pointers where the results will be stored.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef INCLUDED_volk_gnsssdr_16ic_xn_resampler2_16ic_xn_H
|
||||
#define INCLUDED_volk_gnsssdr_16ic_xn_resampler2_16ic_xn_H
|
||||
|
||||
#include <math.h>
|
||||
#include <volk_gnsssdr/volk_gnsssdr_common.h>
|
||||
#include <volk_gnsssdr/volk_gnsssdr_complex.h>
|
||||
|
||||
|
||||
#ifdef LV_HAVE_GENERIC
|
||||
|
||||
/*!
 * Generic (plain C) implementation of the multi-output resampler.
 *
 * For every output vector t in [0, num_out_vectors) and every sample n in
 * [0, num_output_samples), the chip index
 *   floor(code_phase_step_chips * n + shifts_chips[t] - rem_code_phase_chips)
 * is wrapped into [0, code_length_chips) and the corresponding element of
 * \p local_code is copied to result[t][n].
 *
 * \param result                 Array of num_out_vectors output pointers, each with room for num_output_samples items.
 * \param local_code             Code to be resampled; indexed from 0 to code_length_chips - 1.
 * \param rem_code_phase_chips   Offset subtracted from every chip position.
 * \param code_phase_step_chips  Chip increment per output sample.
 * \param shifts_chips           Per-output offset added to the chip position (num_out_vectors entries).
 * \param code_length_chips      Number of chips in local_code; does nothing if it is 0 or exceeds INT_MAX.
 * \param num_out_vectors        Number of output vectors.
 * \param num_output_samples     Number of samples written to each output vector.
 */
static inline void volk_gnsssdr_16ic_xn_resampler2_16ic_xn_generic(lv_16sc_t** result, const lv_16sc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, float* shifts_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_output_samples)
{
    // Work with a signed code length: applying '%' directly against the unsigned
    // parameter converts a negative chip index to unsigned first, which wraps it
    // to an unrelated chip and makes the negative-index correction unreachable.
    const int code_length = (int)code_length_chips;
    int local_code_chip_index;
    int current_correlator_tap;
    unsigned int n;

    // Nothing can be indexed with an empty code (also avoids a modulo by zero).
    if (code_length <= 0) return;

    for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
        {
            for (n = 0; n < num_output_samples; n++)
                {
                    // resample code for current tap
                    local_code_chip_index = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
                    // Signed remainder keeps the sign of the dividend: result is in (-code_length, code_length).
                    local_code_chip_index %= code_length;
                    // Take into account that in multitap correlators, the shifts can be negative!
                    if (local_code_chip_index < 0) local_code_chip_index += code_length;
                    result[current_correlator_tap][n] = local_code[local_code_chip_index];
                }
        }
}
|
||||
|
||||
#endif /*LV_HAVE_GENERIC*/
|
||||
|
||||
|
||||
#ifdef LV_HAVE_SSE4_1
|
||||
#include <smmintrin.h>
|
||||
/*
 * SSE4.1 implementation ("_a_" variant).
 *
 * For every tap t and output sample n, computes
 *   idx = floor(code_phase_step_chips * n + shifts_chips[t] - rem_code_phase_chips)
 * wrapped into [0, code_length_chips), and writes result[t][n] = local_code[idx].
 *
 * Indices are computed four at a time in SSE registers and stored to a local,
 * 16-byte aligned scratch array; the gather from local_code and the writes to
 * result are done with scalar code. The last num_output_samples % 4 samples
 * of each tap are handled by a scalar tail loop.
 */
static inline void volk_gnsssdr_16ic_xn_resampler2_16ic_xn_a_sse4_1(lv_16sc_t** result, const lv_16sc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, float* shifts_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_output_samples)
{
    lv_16sc_t** _result = result;
    const unsigned int quarterPoints = num_output_samples / 4;
    // Signed copy of the code length for the scalar tail: `int % unsigned`
    // would convert a negative index to unsigned before taking the remainder.
    const int code_length = (int)code_length_chips;

    const __m128 fours = _mm_set1_ps(4.0f);
    const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
    const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);

    __VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4];
    int local_code_chip_index_;

    const __m128i zeros = _mm_setzero_si128();
    const __m128 code_length_chips_reg_f = _mm_set_ps1((float)code_length_chips);
    const __m128i code_length_chips_reg_i = _mm_set1_epi32((int)code_length_chips);
    __m128i local_code_chip_index_reg, aux_i, negatives, i;
    __m128 aux, aux2, shifts_chips_reg, c, cTrunc, base;

    for (int current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
        {
            shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]);
            // per-tap constant offset: shift - remnant code phase
            aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
            // sample positions n = 0, 1, 2, 3 (lane 0 holds the lowest n)
            __m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
            for (unsigned int n = 0; n < quarterPoints; n++)
                {
                    aux = _mm_mul_ps(code_phase_step_chips_reg, indexn);
                    aux = _mm_add_ps(aux, aux2);
                    // floor
                    aux = _mm_floor_ps(aux);

                    // fmod: aux - trunc(aux / length) * length, result in (-length, length)
                    c = _mm_div_ps(aux, code_length_chips_reg_f);
                    i = _mm_cvttps_epi32(c);
                    cTrunc = _mm_cvtepi32_ps(i);
                    base = _mm_mul_ps(cTrunc, code_length_chips_reg_f);
                    local_code_chip_index_reg = _mm_cvtps_epi32(_mm_sub_ps(aux, base));

                    // add the code length to the lanes that came out negative
                    negatives = _mm_cmplt_epi32(local_code_chip_index_reg, zeros);
                    aux_i = _mm_and_si128(code_length_chips_reg_i, negatives);
                    local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i);
                    _mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg);
                    for (unsigned int k = 0; k < 4; ++k)
                        {
                            _result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
                        }
                    indexn = _mm_add_ps(indexn, fours);
                }
            // scalar tail for the samples that do not fill a group of four
            for (unsigned int n = quarterPoints * 4; n < num_output_samples; n++)
                {
                    // resample code for current tap
                    local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
                    local_code_chip_index_ = local_code_chip_index_ % code_length;
                    // Take into account that in multitap correlators, the shifts can be negative!
                    if (local_code_chip_index_ < 0) local_code_chip_index_ += code_length;
                    _result[current_correlator_tap][n] = local_code[local_code_chip_index_];
                }
        }
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef LV_HAVE_SSE4_1
|
||||
#include <smmintrin.h>
|
||||
/*
 * SSE4.1 implementation ("_u_" variant).
 *
 * For every tap t and output sample n, computes
 *   idx = floor(code_phase_step_chips * n + shifts_chips[t] - rem_code_phase_chips)
 * wrapped into [0, code_length_chips), and writes result[t][n] = local_code[idx].
 *
 * The only SIMD memory access is an aligned store to a local, 16-byte aligned
 * scratch array; local_code and result are accessed with scalar code, so no
 * alignment of the caller's buffers is required by this body. The last
 * num_output_samples % 4 samples of each tap are handled by a scalar tail loop.
 */
static inline void volk_gnsssdr_16ic_xn_resampler2_16ic_xn_u_sse4_1(lv_16sc_t** result, const lv_16sc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, float* shifts_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_output_samples)
{
    lv_16sc_t** _result = result;
    const unsigned int quarterPoints = num_output_samples / 4;
    // Signed copy of the code length for the scalar tail: `int % unsigned`
    // would convert a negative index to unsigned before taking the remainder.
    const int code_length = (int)code_length_chips;

    const __m128 fours = _mm_set1_ps(4.0f);
    const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
    const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);

    __VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4];
    int local_code_chip_index_;

    const __m128i zeros = _mm_setzero_si128();
    const __m128 code_length_chips_reg_f = _mm_set_ps1((float)code_length_chips);
    const __m128i code_length_chips_reg_i = _mm_set1_epi32((int)code_length_chips);
    __m128i local_code_chip_index_reg, aux_i, negatives, i;
    __m128 aux, aux2, shifts_chips_reg, c, cTrunc, base;

    for (int current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
        {
            shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]);
            // per-tap constant offset: shift - remnant code phase
            aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
            // sample positions n = 0, 1, 2, 3 (lane 0 holds the lowest n)
            __m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
            for (unsigned int n = 0; n < quarterPoints; n++)
                {
                    aux = _mm_mul_ps(code_phase_step_chips_reg, indexn);
                    aux = _mm_add_ps(aux, aux2);
                    // floor
                    aux = _mm_floor_ps(aux);

                    // fmod: aux - trunc(aux / length) * length, result in (-length, length)
                    c = _mm_div_ps(aux, code_length_chips_reg_f);
                    i = _mm_cvttps_epi32(c);
                    cTrunc = _mm_cvtepi32_ps(i);
                    base = _mm_mul_ps(cTrunc, code_length_chips_reg_f);
                    local_code_chip_index_reg = _mm_cvtps_epi32(_mm_sub_ps(aux, base));

                    // add the code length to the lanes that came out negative
                    negatives = _mm_cmplt_epi32(local_code_chip_index_reg, zeros);
                    aux_i = _mm_and_si128(code_length_chips_reg_i, negatives);
                    local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i);
                    _mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg);
                    for (unsigned int k = 0; k < 4; ++k)
                        {
                            _result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
                        }
                    indexn = _mm_add_ps(indexn, fours);
                }
            // scalar tail for the samples that do not fill a group of four
            for (unsigned int n = quarterPoints * 4; n < num_output_samples; n++)
                {
                    // resample code for current tap
                    local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
                    local_code_chip_index_ = local_code_chip_index_ % code_length;
                    // Take into account that in multitap correlators, the shifts can be negative!
                    if (local_code_chip_index_ < 0) local_code_chip_index_ += code_length;
                    _result[current_correlator_tap][n] = local_code[local_code_chip_index_];
                }
        }
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef LV_HAVE_SSE3
|
||||
#include <pmmintrin.h>
|
||||
/*
 * SSE3 implementation ("_a_" variant).
 *
 * For every tap t and output sample n, computes
 *   idx = floor(code_phase_step_chips * n + shifts_chips[t] - rem_code_phase_chips)
 * wrapped into [0, code_length_chips), and writes result[t][n] = local_code[idx].
 *
 * No floor instruction is used here; floor is emulated by truncating and
 * subtracting 1.0 wherever the truncated value is greater than the input.
 * Indices are computed four at a time and stored to a local, 16-byte aligned
 * scratch array; the gather from local_code and the writes to result are
 * scalar. The last num_output_samples % 4 samples of each tap are handled by
 * a scalar tail loop.
 */
static inline void volk_gnsssdr_16ic_xn_resampler2_16ic_xn_a_sse3(lv_16sc_t** result, const lv_16sc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, float* shifts_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_output_samples)
{
    lv_16sc_t** _result = result;
    const unsigned int quarterPoints = num_output_samples / 4;
    // Signed copy of the code length for the scalar tail: `int % unsigned`
    // would convert a negative index to unsigned before taking the remainder.
    const int code_length = (int)code_length_chips;

    const __m128 ones = _mm_set1_ps(1.0f);
    const __m128 fours = _mm_set1_ps(4.0f);
    const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
    const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);

    __VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4];
    int local_code_chip_index_;

    const __m128i zeros = _mm_setzero_si128();
    const __m128 code_length_chips_reg_f = _mm_set_ps1((float)code_length_chips);
    const __m128i code_length_chips_reg_i = _mm_set1_epi32((int)code_length_chips);
    __m128i local_code_chip_index_reg, aux_i, negatives, i;
    __m128 aux, aux2, shifts_chips_reg, fi, igx, j, c, cTrunc, base;

    for (int current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
        {
            shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]);
            // per-tap constant offset: shift - remnant code phase
            aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
            // sample positions n = 0, 1, 2, 3 (lane 0 holds the lowest n)
            __m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
            for (unsigned int n = 0; n < quarterPoints; n++)
                {
                    //__builtin_prefetch(&_result[current_correlator_tap][4 * n] + 8, 1, 0);
                    aux = _mm_mul_ps(code_phase_step_chips_reg, indexn);
                    aux = _mm_add_ps(aux, aux2);
                    // floor: truncate, then subtract 1.0 where truncation rounded up
                    i = _mm_cvttps_epi32(aux);
                    fi = _mm_cvtepi32_ps(i);
                    igx = _mm_cmpgt_ps(fi, aux);
                    j = _mm_and_ps(igx, ones);
                    aux = _mm_sub_ps(fi, j);
                    // fmod: aux - trunc(aux / length) * length, result in (-length, length)
                    c = _mm_div_ps(aux, code_length_chips_reg_f);
                    i = _mm_cvttps_epi32(c);
                    cTrunc = _mm_cvtepi32_ps(i);
                    base = _mm_mul_ps(cTrunc, code_length_chips_reg_f);
                    local_code_chip_index_reg = _mm_cvtps_epi32(_mm_sub_ps(aux, base));

                    // add the code length to the lanes that came out negative
                    negatives = _mm_cmplt_epi32(local_code_chip_index_reg, zeros);
                    aux_i = _mm_and_si128(code_length_chips_reg_i, negatives);
                    local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i);
                    _mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg);
                    for (unsigned int k = 0; k < 4; ++k)
                        {
                            _result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
                        }
                    indexn = _mm_add_ps(indexn, fours);
                }
            // scalar tail for the samples that do not fill a group of four
            for (unsigned int n = quarterPoints * 4; n < num_output_samples; n++)
                {
                    //__builtin_prefetch(&_result[current_correlator_tap][n] + 2, 1, 0);
                    // resample code for current tap
                    local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
                    local_code_chip_index_ = local_code_chip_index_ % code_length;
                    // Take into account that in multitap correlators, the shifts can be negative!
                    if (local_code_chip_index_ < 0) local_code_chip_index_ += code_length;
                    _result[current_correlator_tap][n] = local_code[local_code_chip_index_];
                }
        }
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef LV_HAVE_SSE3
|
||||
#include <pmmintrin.h>
|
||||
/*
 * SSE3 implementation ("_u_" variant).
 *
 * For every tap t and output sample n, computes
 *   idx = floor(code_phase_step_chips * n + shifts_chips[t] - rem_code_phase_chips)
 * wrapped into [0, code_length_chips), and writes result[t][n] = local_code[idx].
 *
 * No floor instruction is used here; floor is emulated by truncating and
 * subtracting 1.0 wherever the truncated value is greater than the input.
 * The only SIMD memory access is an aligned store to a local, 16-byte aligned
 * scratch array; local_code and result are accessed with scalar code. The
 * last num_output_samples % 4 samples of each tap are handled by a scalar
 * tail loop.
 */
static inline void volk_gnsssdr_16ic_xn_resampler2_16ic_xn_u_sse3(lv_16sc_t** result, const lv_16sc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, float* shifts_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_output_samples)
{
    lv_16sc_t** _result = result;
    const unsigned int quarterPoints = num_output_samples / 4;
    // Signed copy of the code length for the scalar tail: `int % unsigned`
    // would convert a negative index to unsigned before taking the remainder.
    const int code_length = (int)code_length_chips;

    const __m128 ones = _mm_set1_ps(1.0f);
    const __m128 fours = _mm_set1_ps(4.0f);
    const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
    const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);

    __VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4];
    int local_code_chip_index_;

    const __m128i zeros = _mm_setzero_si128();
    const __m128 code_length_chips_reg_f = _mm_set_ps1((float)code_length_chips);
    const __m128i code_length_chips_reg_i = _mm_set1_epi32((int)code_length_chips);
    __m128i local_code_chip_index_reg, aux_i, negatives, i;
    __m128 aux, aux2, shifts_chips_reg, fi, igx, j, c, cTrunc, base;

    for (int current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
        {
            shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]);
            // per-tap constant offset: shift - remnant code phase
            aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
            // sample positions n = 0, 1, 2, 3 (lane 0 holds the lowest n)
            __m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
            for (unsigned int n = 0; n < quarterPoints; n++)
                {
                    //__builtin_prefetch(&_result[current_correlator_tap][4 * n + 3], 1, 0);
                    //__builtin_prefetch(&local_code_chip_index[4]);
                    aux = _mm_mul_ps(code_phase_step_chips_reg, indexn);
                    aux = _mm_add_ps(aux, aux2);
                    // floor: truncate, then subtract 1.0 where truncation rounded up
                    i = _mm_cvttps_epi32(aux);
                    fi = _mm_cvtepi32_ps(i);
                    igx = _mm_cmpgt_ps(fi, aux);
                    j = _mm_and_ps(igx, ones);
                    aux = _mm_sub_ps(fi, j);
                    // fmod: aux - trunc(aux / length) * length, result in (-length, length)
                    c = _mm_div_ps(aux, code_length_chips_reg_f);
                    i = _mm_cvttps_epi32(c);
                    cTrunc = _mm_cvtepi32_ps(i);
                    base = _mm_mul_ps(cTrunc, code_length_chips_reg_f);
                    local_code_chip_index_reg = _mm_cvtps_epi32(_mm_sub_ps(aux, base));

                    // add the code length to the lanes that came out negative
                    negatives = _mm_cmplt_epi32(local_code_chip_index_reg, zeros);
                    aux_i = _mm_and_si128(code_length_chips_reg_i, negatives);
                    local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i);
                    _mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg);
                    for (unsigned int k = 0; k < 4; ++k)
                        {
                            _result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
                        }
                    indexn = _mm_add_ps(indexn, fours);
                }
            // scalar tail for the samples that do not fill a group of four
            for (unsigned int n = quarterPoints * 4; n < num_output_samples; n++)
                {
                    //__builtin_prefetch(&_result[current_correlator_tap][n], 1, 0);
                    // resample code for current tap
                    local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
                    local_code_chip_index_ = local_code_chip_index_ % code_length;
                    // Take into account that in multitap correlators, the shifts can be negative!
                    if (local_code_chip_index_ < 0) local_code_chip_index_ += code_length;
                    _result[current_correlator_tap][n] = local_code[local_code_chip_index_];
                }
        }
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef LV_HAVE_AVX
|
||||
#include <immintrin.h>
|
||||
/*
 * AVX implementation ("_a_" variant).
 *
 * For every tap t and output sample n, computes
 *   idx = floor(code_phase_step_chips * n + shifts_chips[t] - rem_code_phase_chips)
 * wrapped into [0, code_length_chips), and writes result[t][n] = local_code[idx].
 *
 * Indices are computed eight at a time in AVX registers and stored to a
 * local, 32-byte aligned scratch array; the gather from local_code and the
 * writes to result are scalar. After all taps have been processed with AVX,
 * a second pass handles the last num_output_samples % 8 samples of each tap
 * with scalar code.
 */
static inline void volk_gnsssdr_16ic_xn_resampler2_16ic_xn_a_avx(lv_16sc_t** result, const lv_16sc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, float* shifts_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_output_samples)
{
    lv_16sc_t** _result = result;
    const unsigned int avx_iters = num_output_samples / 8;
    // Signed copy of the code length for the scalar tail: `int % unsigned`
    // would convert a negative index to unsigned before taking the remainder.
    const int code_length = (int)code_length_chips;

    const __m256 eights = _mm256_set1_ps(8.0f);
    const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips);
    const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips);

    __VOLK_ATTR_ALIGNED(32) int local_code_chip_index[8];
    int local_code_chip_index_;

    const __m256 zeros = _mm256_setzero_ps();
    const __m256 code_length_chips_reg_f = _mm256_set1_ps((float)code_length_chips);

    __m256i local_code_chip_index_reg, i;
    __m256 aux, aux2, shifts_chips_reg, c, cTrunc, base, negatives, wrap_correction;

    for (int current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
        {
            shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]);
            // per-tap constant offset (shift - remnant code phase); it must stay
            // untouched for the whole inner loop
            aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
            // sample positions n = 0..7 (lane 0 holds the lowest n)
            __m256 indexn = _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f);
            for (unsigned int n = 0; n < avx_iters; n++)
                {
                    __builtin_prefetch(&_result[current_correlator_tap][8 * n + 7], 1, 0);
                    __builtin_prefetch(&local_code_chip_index[8]);
                    aux = _mm256_mul_ps(code_phase_step_chips_reg, indexn);
                    aux = _mm256_add_ps(aux, aux2);
                    // floor
                    aux = _mm256_floor_ps(aux);

                    // fmod: aux - trunc(aux / length) * length, result in (-length, length)
                    c = _mm256_div_ps(aux, code_length_chips_reg_f);
                    i = _mm256_cvttps_epi32(c);
                    cTrunc = _mm256_cvtepi32_ps(i);
                    base = _mm256_mul_ps(cTrunc, code_length_chips_reg_f);
                    aux = _mm256_sub_ps(aux, base);

                    // Add the code length to the lanes that came out negative
                    // (predicate 0x01 is "less than, ordered"). The correction is
                    // kept in its own register: writing it into aux2 would destroy
                    // the per-tap offset used by the following iterations.
                    negatives = _mm256_cmp_ps(aux, zeros, 0x01);
                    wrap_correction = _mm256_and_ps(code_length_chips_reg_f, negatives);
                    local_code_chip_index_reg = _mm256_cvtps_epi32(_mm256_add_ps(aux, wrap_correction));
                    _mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg);
                    for (unsigned int k = 0; k < 8; ++k)
                        {
                            _result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]];
                        }
                    indexn = _mm256_add_ps(indexn, eights);
                }
        }
    _mm256_zeroupper();

    // scalar tail for the samples that do not fill a group of eight
    for (int current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
        {
            for (unsigned int n = avx_iters * 8; n < num_output_samples; n++)
                {
                    // resample code for current tap
                    __builtin_prefetch(&_result[current_correlator_tap][n], 1, 0);
                    local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
                    local_code_chip_index_ = local_code_chip_index_ % code_length;
                    // Take into account that in multitap correlators, the shifts can be negative!
                    if (local_code_chip_index_ < 0) local_code_chip_index_ += code_length;
                    _result[current_correlator_tap][n] = local_code[local_code_chip_index_];
                }
        }
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef LV_HAVE_AVX
|
||||
#include <immintrin.h>
|
||||
/*
 * AVX implementation ("_u_" variant).
 *
 * For every tap t and output sample n, computes
 *   idx = floor(code_phase_step_chips * n + shifts_chips[t] - rem_code_phase_chips)
 * wrapped into [0, code_length_chips), and writes result[t][n] = local_code[idx].
 *
 * The only SIMD memory access is an aligned store to a local, 32-byte aligned
 * scratch array; local_code and result are accessed with scalar code. After
 * all taps have been processed with AVX, a second pass handles the last
 * num_output_samples % 8 samples of each tap with scalar code.
 */
static inline void volk_gnsssdr_16ic_xn_resampler2_16ic_xn_u_avx(lv_16sc_t** result, const lv_16sc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, float* shifts_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_output_samples)
{
    lv_16sc_t** _result = result;
    const unsigned int avx_iters = num_output_samples / 8;
    // Signed copy of the code length for the scalar tail: `int % unsigned`
    // would convert a negative index to unsigned before taking the remainder.
    const int code_length = (int)code_length_chips;

    const __m256 eights = _mm256_set1_ps(8.0f);
    const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips);
    const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips);

    __VOLK_ATTR_ALIGNED(32) int local_code_chip_index[8];
    int local_code_chip_index_;

    const __m256 zeros = _mm256_setzero_ps();
    const __m256 code_length_chips_reg_f = _mm256_set1_ps((float)code_length_chips);

    __m256i local_code_chip_index_reg, i;
    __m256 aux, aux2, shifts_chips_reg, c, cTrunc, base, negatives, wrap_correction;

    for (int current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
        {
            shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]);
            // per-tap constant offset (shift - remnant code phase); it must stay
            // untouched for the whole inner loop
            aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
            // sample positions n = 0..7 (lane 0 holds the lowest n)
            __m256 indexn = _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f);
            for (unsigned int n = 0; n < avx_iters; n++)
                {
                    aux = _mm256_mul_ps(code_phase_step_chips_reg, indexn);
                    aux = _mm256_add_ps(aux, aux2);
                    // floor
                    aux = _mm256_floor_ps(aux);

                    // fmod: aux - trunc(aux / length) * length, result in (-length, length)
                    c = _mm256_div_ps(aux, code_length_chips_reg_f);
                    i = _mm256_cvttps_epi32(c);
                    cTrunc = _mm256_cvtepi32_ps(i);
                    base = _mm256_mul_ps(cTrunc, code_length_chips_reg_f);
                    aux = _mm256_sub_ps(aux, base);

                    // Add the code length to the lanes that came out negative
                    // (predicate 0x01 is "less than, ordered"). The correction is
                    // kept in its own register: writing it into aux2 would destroy
                    // the per-tap offset used by the following iterations.
                    negatives = _mm256_cmp_ps(aux, zeros, 0x01);
                    wrap_correction = _mm256_and_ps(code_length_chips_reg_f, negatives);
                    local_code_chip_index_reg = _mm256_cvtps_epi32(_mm256_add_ps(aux, wrap_correction));

                    _mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg);
                    for (unsigned int k = 0; k < 8; ++k)
                        {
                            _result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]];
                        }
                    indexn = _mm256_add_ps(indexn, eights);
                }
        }
    _mm256_zeroupper();

    // scalar tail for the samples that do not fill a group of eight
    for (int current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
        {
            for (unsigned int n = avx_iters * 8; n < num_output_samples; n++)
                {
                    // resample code for current tap
                    local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
                    local_code_chip_index_ = local_code_chip_index_ % code_length;
                    // Take into account that in multitap correlators, the shifts can be negative!
                    if (local_code_chip_index_ < 0) local_code_chip_index_ += code_length;
                    _result[current_correlator_tap][n] = local_code[local_code_chip_index_];
                }
        }
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef LV_HAVE_NEON
|
||||
#include <arm_neon.h>
|
||||
/*
 * NEON implementation.
 *
 * For every tap t and output sample n, computes
 *   idx = floor(code_phase_step_chips * n + shifts_chips[t] - rem_code_phase_chips)
 * wrapped into [0, code_length_chips), and writes result[t][n] = local_code[idx].
 *
 * Indices are computed four at a time in NEON registers and stored to a local
 * scratch array; the gather from local_code and the writes to result are
 * scalar. Floor is emulated (truncate, then subtract 1.0 where the truncated
 * value is greater than the input) and the division by the code length is
 * replaced by a multiplication with a refined reciprocal estimate. The last
 * num_output_samples % 4 samples of each tap are handled by a scalar tail loop.
 */
static inline void volk_gnsssdr_16ic_xn_resampler2_16ic_xn_neon(lv_16sc_t** result, const lv_16sc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, float* shifts_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_output_samples)
{
    lv_16sc_t** _result = result;
    const unsigned int neon_iters = num_output_samples / 4;
    // Signed copy of the code length for the scalar tail: `int % unsigned`
    // would convert a negative index to unsigned before taking the remainder.
    const int32_t code_length = (int32_t)code_length_chips;
    const int32x4_t ones = vdupq_n_s32(1);
    const float32x4_t fours = vdupq_n_f32(4.0f);
    const float32x4_t rem_code_phase_chips_reg = vdupq_n_f32(rem_code_phase_chips);
    const float32x4_t code_phase_step_chips_reg = vdupq_n_f32(code_phase_step_chips);

    __VOLK_ATTR_ALIGNED(16) int32_t local_code_chip_index[4];
    int32_t local_code_chip_index_;
    const int32x4_t zeros = vdupq_n_s32(0);
    const float32x4_t code_length_chips_reg_f = vdupq_n_f32((float)code_length_chips);

    const int32x4_t code_length_chips_reg_i = vdupq_n_s32((int32_t)code_length_chips);
    int32x4_t local_code_chip_index_reg, aux_i, negatives, i;
    float32x4_t aux, aux2, shifts_chips_reg, fi, c, j, cTrunc, base, indexn, reciprocal;
    __VOLK_ATTR_ALIGNED(16) const float vec[4] = { 0.0f, 1.0f, 2.0f, 3.0f };
    uint32x4_t igx;

    // 1 / code_length: estimate followed by two Newton-Raphson steps.
    // It depends only on the code length, so it is computed once up front.
    reciprocal = vrecpeq_f32(code_length_chips_reg_f);
    reciprocal = vmulq_f32(vrecpsq_f32(code_length_chips_reg_f, reciprocal), reciprocal);
    reciprocal = vmulq_f32(vrecpsq_f32(code_length_chips_reg_f, reciprocal), reciprocal);  // this refinement is required!

    for (int current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
        {
            shifts_chips_reg = vdupq_n_f32((float)shifts_chips[current_correlator_tap]);
            // per-tap constant offset: shift - remnant code phase
            aux2 = vsubq_f32(shifts_chips_reg, rem_code_phase_chips_reg);
            // sample positions n = 0, 1, 2, 3
            indexn = vld1q_f32(vec);
            for (unsigned int n = 0; n < neon_iters; n++)
                {
                    __builtin_prefetch(&_result[current_correlator_tap][4 * n + 3], 1, 0);
                    __builtin_prefetch(&local_code_chip_index[4]);
                    aux = vmulq_f32(code_phase_step_chips_reg, indexn);
                    aux = vaddq_f32(aux, aux2);
                    // floor: truncate, then subtract 1.0 where truncation rounded up
                    i = vcvtq_s32_f32(aux);
                    fi = vcvtq_f32_s32(i);
                    igx = vcgtq_f32(fi, aux);
                    // The mask ANDed with integer 1 must be *converted* to float
                    // to obtain 1.0f; reinterpreting the bits of integer 1 as a
                    // float would give a denormal and the subtraction would be a
                    // no-op, i.e. truncation instead of floor for negative values.
                    j = vcvtq_f32_s32(vandq_s32(vreinterpretq_s32_u32(igx), ones));
                    aux = vsubq_f32(fi, j);

                    // fmod: aux - trunc(aux / length) * length, result in (-length, length)
                    c = vmulq_f32(aux, reciprocal);
                    i = vcvtq_s32_f32(c);
                    cTrunc = vcvtq_f32_s32(i);
                    base = vmulq_f32(cTrunc, code_length_chips_reg_f);
                    aux = vsubq_f32(aux, base);
                    local_code_chip_index_reg = vcvtq_s32_f32(aux);

                    // add the code length to the lanes that came out negative
                    negatives = vreinterpretq_s32_u32(vcltq_s32(local_code_chip_index_reg, zeros));
                    aux_i = vandq_s32(code_length_chips_reg_i, negatives);
                    local_code_chip_index_reg = vaddq_s32(local_code_chip_index_reg, aux_i);

                    vst1q_s32(local_code_chip_index, local_code_chip_index_reg);
                    for (unsigned int k = 0; k < 4; ++k)
                        {
                            _result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
                        }
                    indexn = vaddq_f32(indexn, fours);
                }
            // scalar tail for the samples that do not fill a group of four
            for (unsigned int n = neon_iters * 4; n < num_output_samples; n++)
                {
                    __builtin_prefetch(&_result[current_correlator_tap][n], 1, 0);
                    // resample code for current tap
                    local_code_chip_index_ = (int32_t)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
                    local_code_chip_index_ = local_code_chip_index_ % code_length;
                    // Take into account that in multitap correlators, the shifts can be negative!
                    if (local_code_chip_index_ < 0) local_code_chip_index_ += code_length;
                    _result[current_correlator_tap][n] = local_code[local_code_chip_index_];
                }
        }
}
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
#endif /*INCLUDED_volk_gnsssdr_16ic_xn_resampler2_16ic_xn_H*/
|
||||
|
@ -66,15 +66,9 @@
|
||||
#include <volk_gnsssdr/volk_gnsssdr_common.h>
|
||||
#include <volk_gnsssdr/volk_gnsssdr_complex.h>
|
||||
|
||||
//#pragma STDC FENV_ACCESS ON
|
||||
|
||||
#ifdef LV_HAVE_GENERIC
|
||||
|
||||
//int round_int( float r ) {
|
||||
// return (r > 0.0) ? (r + 0.5) : (r - 0.5);
|
||||
//}
|
||||
|
||||
|
||||
static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_generic(lv_16sc_t** result, const lv_16sc_t* local_code, float* rem_code_phase_chips, float code_phase_step_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_output_samples)
|
||||
{
|
||||
int local_code_chip_index;
|
||||
@ -91,7 +85,6 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_generic(lv_16sc_t** re
|
||||
result[current_vector][n] = local_code[local_code_chip_index];
|
||||
}
|
||||
}
|
||||
//std::cout<<std::endl;
|
||||
}
|
||||
|
||||
#endif /*LV_HAVE_GENERIC*/
|
||||
@ -102,25 +95,25 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_generic(lv_16sc_t** re
|
||||
|
||||
static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_sse2(lv_16sc_t** result, const lv_16sc_t* local_code, float* rem_code_phase_chips ,float code_phase_step_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_output_samples)
|
||||
{
|
||||
_MM_SET_ROUNDING_MODE (_MM_ROUND_NEAREST);//_MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO
|
||||
_MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);//_MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO
|
||||
unsigned int number;
|
||||
const unsigned int quarterPoints = num_output_samples / 4;
|
||||
|
||||
lv_16sc_t** _result = result;
|
||||
__attribute__((aligned(16))) int local_code_chip_index[4];
|
||||
__VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4];
|
||||
float tmp_rem_code_phase_chips;
|
||||
__m128 _rem_code_phase,_code_phase_step_chips;
|
||||
__m128i _code_length_chips,_code_length_chips_minus1;
|
||||
__m128 _code_phase_out,_code_phase_out_with_offset;
|
||||
|
||||
_code_phase_step_chips = _mm_load1_ps(&code_phase_step_chips); //load float to all four float values in m128 register
|
||||
__attribute__((aligned(16))) int four_times_code_length_chips_minus1[4];
|
||||
__VOLK_ATTR_ALIGNED(16) int four_times_code_length_chips_minus1[4];
|
||||
four_times_code_length_chips_minus1[0] = code_length_chips - 1;
|
||||
four_times_code_length_chips_minus1[1] = code_length_chips - 1;
|
||||
four_times_code_length_chips_minus1[2] = code_length_chips - 1;
|
||||
four_times_code_length_chips_minus1[3] = code_length_chips - 1;
|
||||
|
||||
__attribute__((aligned(16))) int four_times_code_length_chips[4];
|
||||
__VOLK_ATTR_ALIGNED(16) int four_times_code_length_chips[4];
|
||||
four_times_code_length_chips[0] = code_length_chips;
|
||||
four_times_code_length_chips[1] = code_length_chips;
|
||||
four_times_code_length_chips[2] = code_length_chips;
|
||||
@ -133,9 +126,9 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_sse2(lv_16sc_t** res
|
||||
|
||||
__m128i zero = _mm_setzero_si128();
|
||||
|
||||
__attribute__((aligned(16))) float init_idx_float[4] = { 0.0f, 1.0f, 2.0f, 3.0f };
|
||||
__VOLK_ATTR_ALIGNED(16) float init_idx_float[4] = { 0.0f, 1.0f, 2.0f, 3.0f };
|
||||
__m128 _4output_index = _mm_load_ps(init_idx_float);
|
||||
__attribute__((aligned(16))) float init_4constant_float[4] = { 4.0f, 4.0f, 4.0f, 4.0f };
|
||||
__VOLK_ATTR_ALIGNED(16) float init_4constant_float[4] = { 4.0f, 4.0f, 4.0f, 4.0f };
|
||||
__m128 _4constant_float = _mm_load_ps(init_4constant_float);
|
||||
|
||||
int current_vector = 0;
|
||||
@ -195,25 +188,25 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_sse2(lv_16sc_t** res
|
||||
|
||||
static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_sse2(lv_16sc_t** result, const lv_16sc_t* local_code, float* rem_code_phase_chips ,float code_phase_step_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_output_samples)
|
||||
{
|
||||
_MM_SET_ROUNDING_MODE (_MM_ROUND_NEAREST);//_MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO
|
||||
_MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);//_MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO
|
||||
unsigned int number;
|
||||
const unsigned int quarterPoints = num_output_samples / 4;
|
||||
|
||||
lv_16sc_t** _result = result;
|
||||
__attribute__((aligned(16))) int local_code_chip_index[4];
|
||||
__VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4];
|
||||
float tmp_rem_code_phase_chips;
|
||||
__m128 _rem_code_phase,_code_phase_step_chips;
|
||||
__m128i _code_length_chips,_code_length_chips_minus1;
|
||||
__m128 _code_phase_out,_code_phase_out_with_offset;
|
||||
|
||||
_code_phase_step_chips = _mm_load1_ps(&code_phase_step_chips); //load float to all four float values in m128 register
|
||||
__attribute__((aligned(16))) int four_times_code_length_chips_minus1[4];
|
||||
__VOLK_ATTR_ALIGNED(16) int four_times_code_length_chips_minus1[4];
|
||||
four_times_code_length_chips_minus1[0] = code_length_chips - 1;
|
||||
four_times_code_length_chips_minus1[1] = code_length_chips - 1;
|
||||
four_times_code_length_chips_minus1[2] = code_length_chips - 1;
|
||||
four_times_code_length_chips_minus1[3] = code_length_chips - 1;
|
||||
|
||||
__attribute__((aligned(16))) int four_times_code_length_chips[4];
|
||||
__VOLK_ATTR_ALIGNED(16) int four_times_code_length_chips[4];
|
||||
four_times_code_length_chips[0] = code_length_chips;
|
||||
four_times_code_length_chips[1] = code_length_chips;
|
||||
four_times_code_length_chips[2] = code_length_chips;
|
||||
@ -226,9 +219,9 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_sse2(lv_16sc_t** res
|
||||
|
||||
__m128i zero = _mm_setzero_si128();
|
||||
|
||||
__attribute__((aligned(16))) float init_idx_float[4] = { 0.0f, 1.0f, 2.0f, 3.0f };
|
||||
__VOLK_ATTR_ALIGNED(16) float init_idx_float[4] = { 0.0f, 1.0f, 2.0f, 3.0f };
|
||||
__m128 _4output_index = _mm_loadu_ps(init_idx_float);
|
||||
__attribute__((aligned(16))) float init_4constant_float[4] = { 4.0f, 4.0f, 4.0f, 4.0f };
|
||||
__VOLK_ATTR_ALIGNED(16) float init_4constant_float[4] = { 4.0f, 4.0f, 4.0f, 4.0f };
|
||||
__m128 _4constant_float = _mm_loadu_ps(init_4constant_float);
|
||||
|
||||
int current_vector = 0;
|
||||
@ -294,7 +287,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_neon(lv_16sc_t** resul
|
||||
float32x4_t half = vdupq_n_f32(0.5f);
|
||||
|
||||
lv_16sc_t** _result = result;
|
||||
__attribute__((aligned(16))) int local_code_chip_index[4];
|
||||
__VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4];
|
||||
float tmp_rem_code_phase_chips;
|
||||
float32x4_t _rem_code_phase, _code_phase_step_chips;
|
||||
int32x4_t _code_length_chips, _code_length_chips_minus1;
|
||||
@ -302,13 +295,13 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_neon(lv_16sc_t** resul
|
||||
float32x4_t sign, PlusHalf, Round;
|
||||
|
||||
_code_phase_step_chips = vld1q_dup_f32(&code_phase_step_chips); //load float to all four float values in float32x4_t register
|
||||
__attribute__((aligned(16))) int four_times_code_length_chips_minus1[4];
|
||||
__VOLK_ATTR_ALIGNED(16) int four_times_code_length_chips_minus1[4];
|
||||
four_times_code_length_chips_minus1[0] = code_length_chips - 1;
|
||||
four_times_code_length_chips_minus1[1] = code_length_chips - 1;
|
||||
four_times_code_length_chips_minus1[2] = code_length_chips - 1;
|
||||
four_times_code_length_chips_minus1[3] = code_length_chips - 1;
|
||||
|
||||
__attribute__((aligned(16))) int four_times_code_length_chips[4];
|
||||
__VOLK_ATTR_ALIGNED(16) int four_times_code_length_chips[4];
|
||||
four_times_code_length_chips[0] = code_length_chips;
|
||||
four_times_code_length_chips[1] = code_length_chips;
|
||||
four_times_code_length_chips[2] = code_length_chips;
|
||||
@ -321,9 +314,9 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_neon(lv_16sc_t** resul
|
||||
uint32x4_t negative_indexes, overflow_indexes;
|
||||
int32x4_t zero = vmovq_n_s32(0);
|
||||
|
||||
__attribute__((aligned(16))) float init_idx_float[4] = { 0.0f, 1.0f, 2.0f, 3.0f };
|
||||
__VOLK_ATTR_ALIGNED(16) float init_idx_float[4] = { 0.0f, 1.0f, 2.0f, 3.0f };
|
||||
float32x4_t _4output_index = vld1q_f32(init_idx_float);
|
||||
__attribute__((aligned(16))) float init_4constant_float[4] = { 4.0f, 4.0f, 4.0f, 4.0f };
|
||||
__VOLK_ATTR_ALIGNED(16) float init_4constant_float[4] = { 4.0f, 4.0f, 4.0f, 4.0f };
|
||||
float32x4_t _4constant_float = vld1q_f32(init_4constant_float);
|
||||
|
||||
int current_vector = 0;
|
||||
|
@ -268,26 +268,26 @@ static inline void volk_gnsssdr_32f_sincos_32fc_a_sse2(lv_32fc_t* out, const flo
|
||||
__m128i emm0, emm2, emm4;
|
||||
|
||||
/* declare some SSE constants */
|
||||
static const int _ps_inv_sign_mask[4] __attribute__((aligned(16))) = { ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000 };
|
||||
static const int _ps_sign_mask[4] __attribute__((aligned(16))) = { (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000 };
|
||||
__VOLK_ATTR_ALIGNED(16) static const int _ps_inv_sign_mask[4] = { ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000 };
|
||||
__VOLK_ATTR_ALIGNED(16) static const int _ps_sign_mask[4] = { (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000 };
|
||||
|
||||
static const float _ps_cephes_FOPI[4] __attribute__((aligned(16))) = { 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516 };
|
||||
static const int _pi32_1[4] __attribute__((aligned(16))) = { 1, 1, 1, 1 };
|
||||
static const int _pi32_inv1[4] __attribute__((aligned(16))) = { ~1, ~1, ~1, ~1 };
|
||||
static const int _pi32_2[4] __attribute__((aligned(16))) = { 2, 2, 2, 2};
|
||||
static const int _pi32_4[4] __attribute__((aligned(16))) = { 4, 4, 4, 4};
|
||||
__VOLK_ATTR_ALIGNED(16) static const float _ps_cephes_FOPI[4] = { 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516 };
|
||||
__VOLK_ATTR_ALIGNED(16) static const int _pi32_1[4] = { 1, 1, 1, 1 };
|
||||
__VOLK_ATTR_ALIGNED(16) static const int _pi32_inv1[4] = { ~1, ~1, ~1, ~1 };
|
||||
__VOLK_ATTR_ALIGNED(16) static const int _pi32_2[4] = { 2, 2, 2, 2};
|
||||
__VOLK_ATTR_ALIGNED(16) static const int _pi32_4[4] = { 4, 4, 4, 4};
|
||||
|
||||
static const float _ps_minus_cephes_DP1[4] __attribute__((aligned(16))) = { -0.78515625, -0.78515625, -0.78515625, -0.78515625 };
|
||||
static const float _ps_minus_cephes_DP2[4] __attribute__((aligned(16))) = { -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4 };
|
||||
static const float _ps_minus_cephes_DP3[4] __attribute__((aligned(16))) = { -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8 };
|
||||
static const float _ps_coscof_p0[4] __attribute__((aligned(16))) = { 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005 };
|
||||
static const float _ps_coscof_p1[4] __attribute__((aligned(16))) = { -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003 };
|
||||
static const float _ps_coscof_p2[4] __attribute__((aligned(16))) = { 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002 };
|
||||
static const float _ps_sincof_p0[4] __attribute__((aligned(16))) = { -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4 };
|
||||
static const float _ps_sincof_p1[4] __attribute__((aligned(16))) = { 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3 };
|
||||
static const float _ps_sincof_p2[4] __attribute__((aligned(16))) = { -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1 };
|
||||
static const float _ps_0p5[4] __attribute__((aligned(16))) = { 0.5f, 0.5f, 0.5f, 0.5f };
|
||||
static const float _ps_1[4] __attribute__((aligned(16))) = { 1.0f, 1.0f, 1.0f, 1.0f };
|
||||
__VOLK_ATTR_ALIGNED(16) static const float _ps_minus_cephes_DP1[4] = { -0.78515625, -0.78515625, -0.78515625, -0.78515625 };
|
||||
__VOLK_ATTR_ALIGNED(16) static const float _ps_minus_cephes_DP2[4] = { -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4 };
|
||||
__VOLK_ATTR_ALIGNED(16) static const float _ps_minus_cephes_DP3[4] = { -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8 };
|
||||
__VOLK_ATTR_ALIGNED(16) static const float _ps_coscof_p0[4] = { 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005 };
|
||||
__VOLK_ATTR_ALIGNED(16) static const float _ps_coscof_p1[4] = { -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003 };
|
||||
__VOLK_ATTR_ALIGNED(16) static const float _ps_coscof_p2[4] = { 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002 };
|
||||
__VOLK_ATTR_ALIGNED(16) static const float _ps_sincof_p0[4] = { -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4 };
|
||||
__VOLK_ATTR_ALIGNED(16) static const float _ps_sincof_p1[4] = { 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3 };
|
||||
__VOLK_ATTR_ALIGNED(16) static const float _ps_sincof_p2[4] = { -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1 };
|
||||
__VOLK_ATTR_ALIGNED(16) static const float _ps_0p5[4] = { 0.5f, 0.5f, 0.5f, 0.5f };
|
||||
__VOLK_ATTR_ALIGNED(16) static const float _ps_1[4] = { 1.0f, 1.0f, 1.0f, 1.0f };
|
||||
|
||||
for(;number < sse_iters; number++)
|
||||
{
|
||||
@ -421,26 +421,26 @@ static inline void volk_gnsssdr_32f_sincos_32fc_u_sse2(lv_32fc_t* out, const flo
|
||||
__m128i emm0, emm2, emm4;
|
||||
|
||||
/* declare some SSE constants */
|
||||
static const int _ps_inv_sign_mask[4] __attribute__((aligned(16))) = { ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000 };
|
||||
static const int _ps_sign_mask[4] __attribute__((aligned(16))) = { (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000 };
|
||||
__VOLK_ATTR_ALIGNED(16) static const int _ps_inv_sign_mask[4] = { ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000 };
|
||||
__VOLK_ATTR_ALIGNED(16) static const int _ps_sign_mask[4] = { (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000 };
|
||||
|
||||
static const float _ps_cephes_FOPI[4] __attribute__((aligned(16))) = { 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516 };
|
||||
static const int _pi32_1[4] __attribute__((aligned(16))) = { 1, 1, 1, 1 };
|
||||
static const int _pi32_inv1[4] __attribute__((aligned(16))) = { ~1, ~1, ~1, ~1 };
|
||||
static const int _pi32_2[4] __attribute__((aligned(16))) = { 2, 2, 2, 2};
|
||||
static const int _pi32_4[4] __attribute__((aligned(16))) = { 4, 4, 4, 4};
|
||||
__VOLK_ATTR_ALIGNED(16) static const float _ps_cephes_FOPI[4] = { 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516 };
|
||||
__VOLK_ATTR_ALIGNED(16) static const int _pi32_1[4] = { 1, 1, 1, 1 };
|
||||
__VOLK_ATTR_ALIGNED(16) static const int _pi32_inv1[4] = { ~1, ~1, ~1, ~1 };
|
||||
__VOLK_ATTR_ALIGNED(16) static const int _pi32_2[4] = { 2, 2, 2, 2};
|
||||
__VOLK_ATTR_ALIGNED(16) static const int _pi32_4[4] = { 4, 4, 4, 4};
|
||||
|
||||
static const float _ps_minus_cephes_DP1[4] __attribute__((aligned(16))) = { -0.78515625, -0.78515625, -0.78515625, -0.78515625 };
|
||||
static const float _ps_minus_cephes_DP2[4] __attribute__((aligned(16))) = { -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4 };
|
||||
static const float _ps_minus_cephes_DP3[4] __attribute__((aligned(16))) = { -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8 };
|
||||
static const float _ps_coscof_p0[4] __attribute__((aligned(16))) = { 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005 };
|
||||
static const float _ps_coscof_p1[4] __attribute__((aligned(16))) = { -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003 };
|
||||
static const float _ps_coscof_p2[4] __attribute__((aligned(16))) = { 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002 };
|
||||
static const float _ps_sincof_p0[4] __attribute__((aligned(16))) = { -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4 };
|
||||
static const float _ps_sincof_p1[4] __attribute__((aligned(16))) = { 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3 };
|
||||
static const float _ps_sincof_p2[4] __attribute__((aligned(16))) = { -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1 };
|
||||
static const float _ps_0p5[4] __attribute__((aligned(16))) = { 0.5f, 0.5f, 0.5f, 0.5f };
|
||||
static const float _ps_1[4] __attribute__((aligned(16))) = { 1.0f, 1.0f, 1.0f, 1.0f };
|
||||
__VOLK_ATTR_ALIGNED(16) static const float _ps_minus_cephes_DP1[4] = { -0.78515625, -0.78515625, -0.78515625, -0.78515625 };
|
||||
__VOLK_ATTR_ALIGNED(16) static const float _ps_minus_cephes_DP2[4] = { -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4 };
|
||||
__VOLK_ATTR_ALIGNED(16) static const float _ps_minus_cephes_DP3[4] = { -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8 };
|
||||
__VOLK_ATTR_ALIGNED(16) static const float _ps_coscof_p0[4] = { 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005 };
|
||||
__VOLK_ATTR_ALIGNED(16) static const float _ps_coscof_p1[4] = { -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003 };
|
||||
__VOLK_ATTR_ALIGNED(16) static const float _ps_coscof_p2[4] = { 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002 };
|
||||
__VOLK_ATTR_ALIGNED(16) static const float _ps_sincof_p0[4] = { -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4 };
|
||||
__VOLK_ATTR_ALIGNED(16) static const float _ps_sincof_p1[4] = { 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3 };
|
||||
__VOLK_ATTR_ALIGNED(16) static const float _ps_sincof_p2[4] = { -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1 };
|
||||
__VOLK_ATTR_ALIGNED(16) static const float _ps_0p5[4] = { 0.5f, 0.5f, 0.5f, 0.5f };
|
||||
__VOLK_ATTR_ALIGNED(16) static const float _ps_1[4] = { 1.0f, 1.0f, 1.0f, 1.0f };
|
||||
|
||||
for(;number < sse_iters; number++)
|
||||
{
|
||||
|
@ -116,7 +116,7 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_u_sse3(lv_32fc_t* re
|
||||
result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
|
||||
}
|
||||
|
||||
volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_sse3(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
|
||||
volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_sse3(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
|
||||
|
||||
memcpy((lv_32fc_t*)result, (lv_32fc_t*)result_aux[0], sizeof(lv_32fc_t) * num_points);
|
||||
|
||||
@ -146,7 +146,7 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_u_sse4_1(lv_32fc_t*
|
||||
result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
|
||||
}
|
||||
|
||||
volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_sse4_1(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
|
||||
volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_sse4_1(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
|
||||
|
||||
memcpy((lv_32fc_t*)result, (lv_32fc_t*)result_aux[0], sizeof(lv_32fc_t) * num_points);
|
||||
|
||||
@ -233,7 +233,36 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_u_avx(lv_32fc_t* res
|
||||
result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
|
||||
}
|
||||
|
||||
volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
|
||||
volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
|
||||
|
||||
memcpy((lv_32fc_t*)result, (lv_32fc_t*)result_aux[0], sizeof(lv_32fc_t) * num_points);
|
||||
|
||||
for(unsigned int n = 0; n < num_out_vectors; n++)
|
||||
{
|
||||
volk_gnsssdr_free(result_aux[n]);
|
||||
}
|
||||
volk_gnsssdr_free(result_aux);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef LV_HAVE_NEON
|
||||
static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_neon(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points)
|
||||
{
|
||||
float code_phase_step_chips = 0.6;
|
||||
int code_length_chips = 2046;
|
||||
int num_out_vectors = 3;
|
||||
float rem_code_phase_chips = -0.234;
|
||||
|
||||
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
|
||||
|
||||
lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
|
||||
for(unsigned int n = 0; n < num_out_vectors; n++)
|
||||
{
|
||||
result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
|
||||
}
|
||||
|
||||
volk_gnsssdr_32fc_xn_resampler_32fc_xn_neon(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
|
||||
|
||||
memcpy((lv_32fc_t*)result, (lv_32fc_t*)result_aux[0], sizeof(lv_32fc_t) * num_points);
|
||||
|
||||
|
@ -183,11 +183,11 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_sse3(lv_32fc_
|
||||
// phase rotation registers
|
||||
__m128 a, two_phase_acc_reg, two_phase_inc_reg, yl, yh, tmp1, tmp1p, tmp2, tmp2p, z1;
|
||||
|
||||
__attribute__((aligned(16))) lv_32fc_t two_phase_inc[2];
|
||||
__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_inc[2];
|
||||
two_phase_inc[0] = phase_inc * phase_inc;
|
||||
two_phase_inc[1] = phase_inc * phase_inc;
|
||||
two_phase_inc_reg = _mm_load_ps((float*) two_phase_inc);
|
||||
__attribute__((aligned(16))) lv_32fc_t two_phase_acc[2];
|
||||
__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_acc[2];
|
||||
two_phase_acc[0] = (*phase);
|
||||
two_phase_acc[1] = (*phase) * phase_inc;
|
||||
two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc);
|
||||
@ -289,11 +289,11 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_sse3(lv_32fc_
|
||||
// phase rotation registers
|
||||
__m128 a, two_phase_acc_reg, two_phase_inc_reg, yl, yh, tmp1, tmp1p, tmp2, tmp2p, z1;
|
||||
|
||||
__attribute__((aligned(16))) lv_32fc_t two_phase_inc[2];
|
||||
__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_inc[2];
|
||||
two_phase_inc[0] = phase_inc * phase_inc;
|
||||
two_phase_inc[1] = phase_inc * phase_inc;
|
||||
two_phase_inc_reg = _mm_load_ps((float*) two_phase_inc);
|
||||
__attribute__((aligned(16))) lv_32fc_t two_phase_acc[2];
|
||||
__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_acc[2];
|
||||
two_phase_acc[0] = (*phase);
|
||||
two_phase_acc[1] = (*phase) * phase_inc;
|
||||
two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc);
|
||||
|
@ -1,12 +1,12 @@
|
||||
/*!
|
||||
* \file volk_gnsssdr_16ic_xn_resampler_16ic_xn.h
|
||||
* \brief VOLK_GNSSSDR kernel: Resamples N 16 bits integer short complex vectors using zero hold resample algorithm.
|
||||
* \file volk_gnsssdr_32fc_xn_resampler_32fc_xn.h
|
||||
* \brief VOLK_GNSSSDR kernel: Resamples N complex 32-bit float vectors using zero hold resample algorithm.
|
||||
* \authors <ul>
|
||||
* <li> Javier Arribas, 2015. jarribas(at)cttc.es
|
||||
* </ul>
|
||||
*
|
||||
* VOLK_GNSSSDR kernel that esamples N 16 bits integer short complex vectors using zero hold resample algorithm.
|
||||
* It is optimized to resample a sigle GNSS local code signal replica into N vectors fractional-resampled and fractional-delayed
|
||||
* VOLK_GNSSSDR kernel that resamples N complex 32-bit float vectors using zero hold resample algorithm.
|
||||
* It is optimized to resample a single GNSS local code signal replica into N vectors fractional-resampled and fractional-delayed
|
||||
* (i.e. it creates the Early, Prompt, and Late code replicas)
|
||||
*
|
||||
* -------------------------------------------------------------------------
|
||||
@ -35,24 +35,25 @@
|
||||
*/
|
||||
|
||||
/*!
|
||||
* \page volk_gnsssdr_16ic_xn_resampler_16ic_xn
|
||||
* \page volk_gnsssdr_32fc_xn_resampler_32fc_xn
|
||||
*
|
||||
* \b Overview
|
||||
*
|
||||
* Resamples a complex vector (16-bit integer each component), providing \p num_out_vectors outputs.
|
||||
* Resamples a complex vector (32-bit float each component), providing \p num_out_vectors outputs.
|
||||
*
|
||||
* <b>Dispatcher Prototype</b>
|
||||
* \code
|
||||
* void volk_gnsssdr_16ic_xn_resampler_16ic_xn(lv_16sc_t** result, const lv_16sc_t* local_code, float* rem_code_phase_chips, float code_phase_step_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_output_samples)
|
||||
* void volk_gnsssdr_32fc_xn_resampler_32fc_xn(lv_32fc_t** result, const lv_32fc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, float* shifts_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_points)
|
||||
* \endcode
|
||||
*
|
||||
* \b Inputs
|
||||
* \li local_code: Vector containing the local code replica to be resampled.
|
||||
* \li rem_code_phase_chips: Remnant code phase [chips].
|
||||
* \li code_phase_step_chips: Phase increment per sample [chips/sample].
|
||||
* \li shifts_chips: Vector of floats that defines the spacing (in chips) between the replicas of \p local_code
|
||||
* \li code_length_chips: Code length in chips.
|
||||
* \li num_out_vectors: Number of output vectors.
|
||||
* \li num_output_samples: The number of data values to be in the resampled vector.
|
||||
* \li num_points: The number of data values to be in the resampled vector.
|
||||
*
|
||||
* \b Outputs
|
||||
* \li result: Pointer to a vector of pointers where the results will be stored.
|
||||
@ -66,17 +67,16 @@
|
||||
#include <volk_gnsssdr/volk_gnsssdr_common.h>
|
||||
#include <volk_gnsssdr/volk_gnsssdr_complex.h>
|
||||
|
||||
//#pragma STDC FENV_ACCESS ON
|
||||
|
||||
|
||||
#ifdef LV_HAVE_GENERIC
|
||||
|
||||
|
||||
static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_generic(lv_32fc_t** result, const lv_32fc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, float* shifts_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_output_samples)
|
||||
static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_generic(lv_32fc_t** result, const lv_32fc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, float* shifts_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_points)
|
||||
{
|
||||
int local_code_chip_index;
|
||||
for (int current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
|
||||
{
|
||||
for (int n = 0; n < num_output_samples; n++)
|
||||
for (int n = 0; n < num_points; n++)
|
||||
{
|
||||
// resample code for current tap
|
||||
local_code_chip_index = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
|
||||
@ -93,17 +93,17 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_generic(lv_32fc_t** re
|
||||
|
||||
#ifdef LV_HAVE_SSE3
|
||||
#include <pmmintrin.h>
|
||||
static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_sse3(lv_32fc_t** result, const lv_32fc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, float* shifts_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_output_samples)
|
||||
static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_sse3(lv_32fc_t** result, const lv_32fc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, float* shifts_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_points)
|
||||
{
|
||||
lv_32fc_t** _result = result;
|
||||
const unsigned int quarterPoints = num_output_samples / 4;
|
||||
const unsigned int quarterPoints = num_points / 4;
|
||||
|
||||
const __m128 ones = _mm_set1_ps(1.0f);
|
||||
const __m128 fours = _mm_set1_ps(4.0f);
|
||||
const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
|
||||
const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);
|
||||
|
||||
__attribute__((aligned(16))) int local_code_chip_index[4];
|
||||
__VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4];
|
||||
int local_code_chip_index_;
|
||||
|
||||
const __m128i zeros = _mm_setzero_si128();
|
||||
@ -144,7 +144,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_sse3(lv_32fc_t** res
|
||||
}
|
||||
indexn = _mm_add_ps(indexn, fours);
|
||||
}
|
||||
for(unsigned int n = quarterPoints * 4; n < num_output_samples; n++)
|
||||
for(unsigned int n = quarterPoints * 4; n < num_points; n++)
|
||||
{
|
||||
// resample code for current tap
|
||||
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
|
||||
@ -153,23 +153,91 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_sse3(lv_32fc_t** res
|
||||
if (local_code_chip_index_ < 0) local_code_chip_index_ += code_length_chips;
|
||||
_result[current_correlator_tap][n] = local_code[local_code_chip_index_];
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef LV_HAVE_SSE3
|
||||
#include <pmmintrin.h>
|
||||
/*!
 * \brief Resamples \p local_code into \p num_out_vectors output vectors (SSE3 kernel, "u" variant).
 *
 * For every output vector (correlator tap) t and every sample n in [0, num_points):
 *
 *   idx = floor(code_phase_step_chips * n + shifts_chips[t] - rem_code_phase_chips)
 *   result[t][n] = local_code[idx wrapped into [0, code_length_chips)]
 *
 * Samples are processed four at a time with SSE; the remaining (num_points % 4)
 * samples are handled by a scalar tail loop that applies the same wrapping rule.
 *
 * \param result                 Array of \p num_out_vectors pointers to the output vectors.
 * \param local_code             Code replica that is being resampled.
 * \param rem_code_phase_chips   Remnant code phase, subtracted from every index [chips].
 * \param code_phase_step_chips  Phase increment per output sample [chips/sample].
 * \param shifts_chips           Per-output-vector offset added to the index [chips].
 * \param code_length_chips      Length of \p local_code, used as the wrap-around modulus.
 * \param num_out_vectors        Number of output vectors.
 * \param num_points             Number of samples written to each output vector.
 */
static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_sse3(lv_32fc_t** result, const lv_32fc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, float* shifts_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_points)
{
    lv_32fc_t** _result = result;
    const unsigned int quarterPoints = num_points / 4;

    const __m128 ones = _mm_set1_ps(1.0f);
    const __m128 fours = _mm_set1_ps(4.0f);
    const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
    const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);

    // Scratch buffer for the four indices computed per SIMD iteration. It is a
    // 16-byte aligned local, so the aligned store below does not depend on the
    // alignment of the caller's buffers.
    __VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4];
    int local_code_chip_index_;

    const __m128i zeros = _mm_setzero_si128();
    const __m128 code_length_chips_reg_f = _mm_set_ps1((float)code_length_chips);
    const __m128i code_length_chips_reg_i = _mm_set1_epi32((int)code_length_chips);
    __m128i local_code_chip_index_reg, aux_i, negatives, i;
    __m128 aux, aux2, shifts_chips_reg, fi, igx, j, c, cTrunc, base;

    for (int current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
        {
            shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]);
            // Constant part of the index for this tap: shift - remnant phase
            aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
            __m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
            for (unsigned int n = 0; n < quarterPoints; n++)
                {
                    aux = _mm_mul_ps(code_phase_step_chips_reg, indexn);
                    aux = _mm_add_ps(aux, aux2);
                    // floor: truncate toward zero, then subtract 1 where truncation rounded up
                    i = _mm_cvttps_epi32(aux);
                    fi = _mm_cvtepi32_ps(i);
                    igx = _mm_cmpgt_ps(fi, aux);
                    j = _mm_and_ps(igx, ones);
                    aux = _mm_sub_ps(fi, j);
                    // fmod: aux - trunc(aux / length) * length (result keeps the sign of aux)
                    c = _mm_div_ps(aux, code_length_chips_reg_f);
                    i = _mm_cvttps_epi32(c);
                    cTrunc = _mm_cvtepi32_ps(i);
                    base = _mm_mul_ps(cTrunc, code_length_chips_reg_f);
                    local_code_chip_index_reg = _mm_cvtps_epi32(_mm_sub_ps(aux, base));

                    // Negative remainders are wrapped by adding the code length
                    negatives = _mm_cmplt_epi32(local_code_chip_index_reg, zeros);
                    aux_i = _mm_and_si128(code_length_chips_reg_i, negatives);
                    local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i);
                    _mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg);
                    for (unsigned int k = 0; k < 4; ++k)
                        {
                            _result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
                        }
                    indexn = _mm_add_ps(indexn, fours);
                }
            for (unsigned int n = quarterPoints * 4; n < num_points; n++)
                {
                    // resample code for current tap
                    local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
                    // The remainder must be taken in signed arithmetic: with an unsigned
                    // right-hand operand a negative index would be converted to unsigned
                    // first, yielding a wrong (non-negative) chip index and disabling
                    // the wrap-around correction below.
                    local_code_chip_index_ = local_code_chip_index_ % (int)code_length_chips;
                    //Take into account that in multitap correlators, the shifts can be negative!
                    if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips;
                    _result[current_correlator_tap][n] = local_code[local_code_chip_index_];
                }
        }
}
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef LV_HAVE_SSE4_1
|
||||
#include <smmintrin.h>
|
||||
static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_sse4_1(lv_32fc_t** result, const lv_32fc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, float* shifts_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_output_samples)
|
||||
static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_sse4_1(lv_32fc_t** result, const lv_32fc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, float* shifts_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_points)
|
||||
{
|
||||
lv_32fc_t** _result = result;
|
||||
const unsigned int quarterPoints = num_output_samples / 4;
|
||||
const unsigned int quarterPoints = num_points / 4;
|
||||
|
||||
const __m128 fours = _mm_set1_ps(4.0f);
|
||||
const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
|
||||
const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);
|
||||
|
||||
__attribute__((aligned(16))) int local_code_chip_index[4];
|
||||
__VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4];
|
||||
int local_code_chip_index_;
|
||||
|
||||
const __m128i zeros = _mm_setzero_si128();
|
||||
@ -207,7 +275,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_sse4_1(lv_32fc_t** r
|
||||
}
|
||||
indexn = _mm_add_ps(indexn, fours);
|
||||
}
|
||||
for(unsigned int n = quarterPoints * 4; n < num_output_samples; n++)
|
||||
for(unsigned int n = quarterPoints * 4; n < num_points; n++)
|
||||
{
|
||||
// resample code for current tap
|
||||
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
|
||||
@ -216,24 +284,88 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_sse4_1(lv_32fc_t** r
|
||||
if (local_code_chip_index_ < 0) local_code_chip_index_ += code_length_chips;
|
||||
_result[current_correlator_tap][n] = local_code[local_code_chip_index_];
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef LV_HAVE_SSE4_1
|
||||
#include <smmintrin.h>
|
||||
/*!
 * \brief Resamples \p local_code into \p num_out_vectors output vectors (SSE4.1 kernel, "u" variant).
 *
 * For every output vector (correlator tap) t and every sample n in [0, num_points):
 *
 *   idx = floor(code_phase_step_chips * n + shifts_chips[t] - rem_code_phase_chips)
 *   result[t][n] = local_code[idx wrapped into [0, code_length_chips)]
 *
 * Samples are processed four at a time with SSE (using _mm_floor_ps for the
 * floor step); the remaining (num_points % 4) samples are handled by a scalar
 * tail loop that applies the same wrapping rule.
 *
 * \param result                 Array of \p num_out_vectors pointers to the output vectors.
 * \param local_code             Code replica that is being resampled.
 * \param rem_code_phase_chips   Remnant code phase, subtracted from every index [chips].
 * \param code_phase_step_chips  Phase increment per output sample [chips/sample].
 * \param shifts_chips           Per-output-vector offset added to the index [chips].
 * \param code_length_chips      Length of \p local_code, used as the wrap-around modulus.
 * \param num_out_vectors        Number of output vectors.
 * \param num_points             Number of samples written to each output vector.
 */
static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_sse4_1(lv_32fc_t** result, const lv_32fc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, float* shifts_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_points)
{
    lv_32fc_t** _result = result;
    const unsigned int quarterPoints = num_points / 4;

    const __m128 fours = _mm_set1_ps(4.0f);
    const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
    const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);

    // Scratch buffer for the four indices computed per SIMD iteration. It is a
    // 16-byte aligned local, so the aligned store below does not depend on the
    // alignment of the caller's buffers.
    __VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4];
    int local_code_chip_index_;

    const __m128i zeros = _mm_setzero_si128();
    const __m128 code_length_chips_reg_f = _mm_set_ps1((float)code_length_chips);
    const __m128i code_length_chips_reg_i = _mm_set1_epi32((int)code_length_chips);
    __m128i local_code_chip_index_reg, aux_i, negatives, i;
    __m128 aux, aux2, shifts_chips_reg, c, cTrunc, base;

    for (int current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
        {
            shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]);
            // Constant part of the index for this tap: shift - remnant phase
            aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
            __m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
            for (unsigned int n = 0; n < quarterPoints; n++)
                {
                    aux = _mm_mul_ps(code_phase_step_chips_reg, indexn);
                    aux = _mm_add_ps(aux, aux2);
                    // floor
                    aux = _mm_floor_ps(aux);

                    // fmod: aux - trunc(aux / length) * length (result keeps the sign of aux)
                    c = _mm_div_ps(aux, code_length_chips_reg_f);
                    i = _mm_cvttps_epi32(c);
                    cTrunc = _mm_cvtepi32_ps(i);
                    base = _mm_mul_ps(cTrunc, code_length_chips_reg_f);
                    local_code_chip_index_reg = _mm_cvtps_epi32(_mm_sub_ps(aux, base));

                    // Negative remainders are wrapped by adding the code length
                    negatives = _mm_cmplt_epi32(local_code_chip_index_reg, zeros);
                    aux_i = _mm_and_si128(code_length_chips_reg_i, negatives);
                    local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i);
                    _mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg);
                    for (unsigned int k = 0; k < 4; ++k)
                        {
                            _result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
                        }
                    indexn = _mm_add_ps(indexn, fours);
                }
            for (unsigned int n = quarterPoints * 4; n < num_points; n++)
                {
                    // resample code for current tap
                    local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
                    // The remainder must be taken in signed arithmetic: with an unsigned
                    // right-hand operand a negative index would be converted to unsigned
                    // first, yielding a wrong (non-negative) chip index and disabling
                    // the wrap-around correction below.
                    local_code_chip_index_ = local_code_chip_index_ % (int)code_length_chips;
                    //Take into account that in multitap correlators, the shifts can be negative!
                    if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips;
                    _result[current_correlator_tap][n] = local_code[local_code_chip_index_];
                }
        }
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef LV_HAVE_AVX
|
||||
#include <immintrin.h>
|
||||
static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx(lv_32fc_t** result, const lv_32fc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, float* shifts_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_output_samples)
|
||||
static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx(lv_32fc_t** result, const lv_32fc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, float* shifts_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_points)
|
||||
{
|
||||
lv_32fc_t** _result = result;
|
||||
const unsigned int avx_iters = num_output_samples / 8;
|
||||
const unsigned int avx_iters = num_points / 8;
|
||||
|
||||
const __m256 eights = _mm256_set1_ps(8.0f);
|
||||
const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips);
|
||||
const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips);
|
||||
|
||||
__attribute__((aligned(32))) int local_code_chip_index[8];
|
||||
__VOLK_ATTR_ALIGNED(32) int local_code_chip_index[8];
|
||||
int local_code_chip_index_;
|
||||
|
||||
const __m256 zeros = _mm256_setzero_ps();
|
||||
@ -271,8 +403,11 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx(lv_32fc_t** resu
|
||||
}
|
||||
indexn = _mm256_add_ps(indexn, eights);
|
||||
}
|
||||
_mm256_zeroupper();
|
||||
for(unsigned int n = avx_iters * 8; n < num_output_samples; n++)
|
||||
}
|
||||
_mm256_zeroupper();
|
||||
for (int current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
|
||||
{
|
||||
for(unsigned int n = avx_iters * 8; n < num_points; n++)
|
||||
{
|
||||
// resample code for current tap
|
||||
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
|
||||
@ -281,10 +416,157 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx(lv_32fc_t** resu
|
||||
if (local_code_chip_index_ < 0) local_code_chip_index_ += code_length_chips;
|
||||
_result[current_correlator_tap][n] = local_code[local_code_chip_index_];
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#endif /*INCLUDED_volk_gnsssdr_16ic_xn_resampler_16ic_xn_H*/
|
||||
|
||||
#ifdef LV_HAVE_AVX
#include <immintrin.h>

/*!
 * \brief Resamples a local code replica for several correlator taps (AVX version).
 *
 * For every tap t in [0, num_out_vectors) and every output sample n in
 * [0, num_points) this kernel writes
 *
 *     result[t][n] = local_code[idx]
 *     idx = floor(code_phase_step_chips * n + shifts_chips[t] - rem_code_phase_chips)
 *           wrapped into [0, code_length_chips)
 *
 * Samples are produced eight at a time with AVX; the remaining
 * num_points % 8 samples of each tap are produced by a scalar loop.
 *
 * \param[out] result                One output buffer per tap, each holding num_points samples
 * \param[in]  local_code            Code replica, code_length_chips samples long
 * \param[in]  rem_code_phase_chips  Code phase subtracted from every sample position
 * \param[in]  code_phase_step_chips Code phase advance per output sample
 * \param[in]  shifts_chips          Per-tap code shift, num_out_vectors entries (may be negative)
 * \param[in]  code_length_chips     Length of local_code; must be non-zero
 * \param[in]  num_out_vectors       Number of taps (output buffers)
 * \param[in]  num_points            Number of samples to generate per tap
 */
static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx(lv_32fc_t** result, const lv_32fc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, float* shifts_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_points)
{
    lv_32fc_t** _result = result;
    const unsigned int avx_iters = num_points / 8;

    const __m256 eights = _mm256_set1_ps(8.0f);
    const __m256 zeros = _mm256_setzero_ps();
    const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips);
    const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips);
    const __m256 code_length_chips_reg_f = _mm256_set1_ps((float)code_length_chips);

    /* Local scratch buffer: aligned, so the aligned store below is valid
       even though this is the unaligned-buffer variant of the kernel. */
    __VOLK_ATTR_ALIGNED(32) int local_code_chip_index[8];
    int local_code_chip_index_;

    __m256i local_code_chip_index_reg, quotient_i;
    __m256 aux, phase_offset, wrap_offset, shifts_chips_reg, quotient, quotient_trunc, base, negatives, indexn;

    for (int current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
        {
            shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]);
            /* Constant offset of this tap. It must stay untouched for the
               whole inner loop, so the wrap-around correction below lives in
               its own register (wrap_offset). */
            phase_offset = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
            indexn = _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f);

            for (unsigned int n = 0; n < avx_iters; n++)
                {
                    aux = _mm256_mul_ps(code_phase_step_chips_reg, indexn);
                    aux = _mm256_add_ps(aux, phase_offset);
                    // floor
                    aux = _mm256_floor_ps(aux);

                    // fmod: aux - trunc(aux / length) * length
                    quotient = _mm256_div_ps(aux, code_length_chips_reg_f);
                    quotient_i = _mm256_cvttps_epi32(quotient);
                    quotient_trunc = _mm256_cvtepi32_ps(quotient_i);
                    base = _mm256_mul_ps(quotient_trunc, code_length_chips_reg_f);
                    aux = _mm256_sub_ps(aux, base);

                    // negative remainders are moved up by one code length
                    negatives = _mm256_cmp_ps(aux, zeros, 0x01); /* 0x01: ordered less-than */
                    wrap_offset = _mm256_and_ps(code_length_chips_reg_f, negatives);
                    local_code_chip_index_reg = _mm256_cvtps_epi32(_mm256_add_ps(aux, wrap_offset));
                    _mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg);

                    for (unsigned int k = 0; k < 8; ++k)
                        {
                            _result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]];
                        }
                    indexn = _mm256_add_ps(indexn, eights);
                }
        }
    _mm256_zeroupper();

    for (int current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
        {
            for (unsigned int n = avx_iters * 8; n < num_points; n++)
                {
                    // resample code for current tap
                    local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
                    /* The modulo must be evaluated in signed arithmetic: with an
                       unsigned right operand a negative index would first be
                       converted to unsigned and wrap to an unrelated chip. */
                    local_code_chip_index_ = local_code_chip_index_ % (int)code_length_chips;
                    //Take into account that in multitap correlators, the shifts can be negative!
                    if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips;
                    _result[current_correlator_tap][n] = local_code[local_code_chip_index_];
                }
        }
}

#endif
|
||||
|
||||
|
||||
#ifdef LV_HAVE_NEON
#include <arm_neon.h>

/*!
 * \brief Resamples a local code replica for several correlator taps (NEON version).
 *
 * For every tap t in [0, num_out_vectors) and every output sample n in
 * [0, num_points) this kernel writes
 *
 *     result[t][n] = local_code[idx]
 *     idx = floor(code_phase_step_chips * n + shifts_chips[t] - rem_code_phase_chips)
 *           wrapped into [0, code_length_chips)
 *
 * Samples are produced four at a time with NEON; the remaining
 * num_points % 4 samples of each tap are produced by a scalar loop.
 *
 * \param[out] result                One output buffer per tap, each holding num_points samples
 * \param[in]  local_code            Code replica, code_length_chips samples long
 * \param[in]  rem_code_phase_chips  Code phase subtracted from every sample position
 * \param[in]  code_phase_step_chips Code phase advance per output sample
 * \param[in]  shifts_chips          Per-tap code shift, num_out_vectors entries (may be negative)
 * \param[in]  code_length_chips     Length of local_code; must be non-zero
 * \param[in]  num_out_vectors       Number of taps (output buffers)
 * \param[in]  num_points            Number of samples to generate per tap
 */
static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_neon(lv_32fc_t** result, const lv_32fc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, float* shifts_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_points)
{
    lv_32fc_t** _result = result;
    const unsigned int neon_iters = num_points / 4;

    const int32x4_t ones = vdupq_n_s32(1);
    const int32x4_t zeros = vdupq_n_s32(0);
    const float32x4_t fours = vdupq_n_f32(4.0f);
    const float32x4_t rem_code_phase_chips_reg = vdupq_n_f32(rem_code_phase_chips);
    const float32x4_t code_phase_step_chips_reg = vdupq_n_f32(code_phase_step_chips);
    const float32x4_t code_length_chips_reg_f = vdupq_n_f32((float)code_length_chips);
    const int32x4_t code_length_chips_reg_i = vdupq_n_s32((int32_t)code_length_chips);

    __VOLK_ATTR_ALIGNED(16) int32_t local_code_chip_index[4];
    __VOLK_ATTR_ALIGNED(16) const float vec[4] = { 0.0f, 1.0f, 2.0f, 3.0f };
    int32_t local_code_chip_index_;

    int32x4_t local_code_chip_index_reg, aux_i, negatives, i;
    float32x4_t aux, aux2, shifts_chips_reg, fi, c, j, cTrunc, base, indexn, reciprocal;
    uint32x4_t igx;

    /* 1 / code_length_chips: an initial estimate refined by two
       Newton-Raphson steps. It depends only on the code length, so it is
       computed once instead of once per vector iteration. */
    reciprocal = vrecpeq_f32(code_length_chips_reg_f);
    reciprocal = vmulq_f32(vrecpsq_f32(code_length_chips_reg_f, reciprocal), reciprocal);
    reciprocal = vmulq_f32(vrecpsq_f32(code_length_chips_reg_f, reciprocal), reciprocal);  // this refinement is required!

    for (int current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
        {
            shifts_chips_reg = vdupq_n_f32((float)shifts_chips[current_correlator_tap]);
            aux2 = vsubq_f32(shifts_chips_reg, rem_code_phase_chips_reg);
            indexn = vld1q_f32(vec);

            for (unsigned int n = 0; n < neon_iters; n++)
                {
                    __builtin_prefetch(&_result[current_correlator_tap][4 * n + 3], 1, 0);
                    __builtin_prefetch(&local_code_chip_index[4]);
                    aux = vmulq_f32(code_phase_step_chips_reg, indexn);
                    aux = vaddq_f32(aux, aux2);

                    // floor: truncate, then subtract 1 where truncation rounded up
                    i = vcvtq_s32_f32(aux);
                    fi = vcvtq_f32_s32(i);
                    igx = vcgtq_f32(fi, aux);
                    j = vcvtq_f32_s32(vandq_s32(vreinterpretq_s32_u32(igx), ones));
                    aux = vsubq_f32(fi, j);

                    // fmod: aux - trunc(aux / length) * length
                    c = vmulq_f32(aux, reciprocal);
                    i = vcvtq_s32_f32(c);
                    cTrunc = vcvtq_f32_s32(i);
                    base = vmulq_f32(cTrunc, code_length_chips_reg_f);
                    aux = vsubq_f32(aux, base);
                    local_code_chip_index_reg = vcvtq_s32_f32(aux);

                    // negative remainders are moved up by one code length
                    negatives = vreinterpretq_s32_u32(vcltq_s32(local_code_chip_index_reg, zeros));
                    aux_i = vandq_s32(code_length_chips_reg_i, negatives);
                    local_code_chip_index_reg = vaddq_s32(local_code_chip_index_reg, aux_i);

                    vst1q_s32(local_code_chip_index, local_code_chip_index_reg);

                    for (unsigned int k = 0; k < 4; ++k)
                        {
                            _result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
                        }
                    indexn = vaddq_f32(indexn, fours);
                }

            for (unsigned int n = neon_iters * 4; n < num_points; n++)
                {
                    __builtin_prefetch(&_result[current_correlator_tap][n], 1, 0);
                    // resample code for current tap
                    local_code_chip_index_ = (int32_t)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
                    /* The modulo must be evaluated in signed arithmetic: with an
                       unsigned right operand a negative index would first be
                       converted to unsigned and wrap to an unrelated chip. */
                    local_code_chip_index_ = local_code_chip_index_ % (int32_t)code_length_chips;
                    //Take into account that in multitap correlators, the shifts can be negative!
                    if (local_code_chip_index_ < 0) local_code_chip_index_ += (int32_t)code_length_chips;
                    _result[current_correlator_tap][n] = local_code[local_code_chip_index_];
                }
        }
}

#endif
|
||||
|
||||
|
||||
#endif /*INCLUDED_volk_gnsssdr_32fc_xn_resampler_32fc_xn_H*/
|
||||
|
||||
|
@ -82,29 +82,29 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_a_sse2(lv_32fc_t* out, const fl
|
||||
__m128i emm0, emm2, emm4;
|
||||
|
||||
/* declare some SSE constants */
|
||||
static const int _ps_inv_sign_mask[4] __attribute__((aligned(16))) = { ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000 };
|
||||
static const int _ps_sign_mask[4] __attribute__((aligned(16))) = { (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000 };
|
||||
static const int _ps_inv_sign_mask[4] = { ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000 };
|
||||
static const int _ps_sign_mask[4] = { (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000 };
|
||||
|
||||
static const float _ps_cephes_FOPI[4] __attribute__((aligned(16))) = { 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516 };
|
||||
static const int _pi32_1[4] __attribute__((aligned(16))) = { 1, 1, 1, 1 };
|
||||
static const int _pi32_inv1[4] __attribute__((aligned(16))) = { ~1, ~1, ~1, ~1 };
|
||||
static const int _pi32_2[4] __attribute__((aligned(16))) = { 2, 2, 2, 2};
|
||||
static const int _pi32_4[4] __attribute__((aligned(16))) = { 4, 4, 4, 4};
|
||||
static const float _ps_cephes_FOPI[4] = { 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516 };
|
||||
static const int _pi32_1[4] = { 1, 1, 1, 1 };
|
||||
static const int _pi32_inv1[4] = { ~1, ~1, ~1, ~1 };
|
||||
static const int _pi32_2[4] = { 2, 2, 2, 2};
|
||||
static const int _pi32_4[4] = { 4, 4, 4, 4};
|
||||
|
||||
static const float _ps_minus_cephes_DP1[4] __attribute__((aligned(16))) = { -0.78515625, -0.78515625, -0.78515625, -0.78515625 };
|
||||
static const float _ps_minus_cephes_DP2[4] __attribute__((aligned(16))) = { -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4 };
|
||||
static const float _ps_minus_cephes_DP3[4] __attribute__((aligned(16))) = { -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8 };
|
||||
static const float _ps_coscof_p0[4] __attribute__((aligned(16))) = { 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005 };
|
||||
static const float _ps_coscof_p1[4] __attribute__((aligned(16))) = { -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003 };
|
||||
static const float _ps_coscof_p2[4] __attribute__((aligned(16))) = { 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002 };
|
||||
static const float _ps_sincof_p0[4] __attribute__((aligned(16))) = { -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4 };
|
||||
static const float _ps_sincof_p1[4] __attribute__((aligned(16))) = { 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3 };
|
||||
static const float _ps_sincof_p2[4] __attribute__((aligned(16))) = { -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1 };
|
||||
static const float _ps_0p5[4] __attribute__((aligned(16))) = { 0.5f, 0.5f, 0.5f, 0.5f };
|
||||
static const float _ps_1[4] __attribute__((aligned(16))) = { 1.0f, 1.0f, 1.0f, 1.0f };
|
||||
static const float _ps_minus_cephes_DP1[4] = { -0.78515625, -0.78515625, -0.78515625, -0.78515625 };
|
||||
static const float _ps_minus_cephes_DP2[4] = { -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4 };
|
||||
static const float _ps_minus_cephes_DP3[4] = { -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8 };
|
||||
static const float _ps_coscof_p0[4] = { 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005 };
|
||||
static const float _ps_coscof_p1[4] = { -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003 };
|
||||
static const float _ps_coscof_p2[4] = { 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002 };
|
||||
static const float _ps_sincof_p0[4] = { -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4 };
|
||||
static const float _ps_sincof_p1[4] = { 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3 };
|
||||
static const float _ps_sincof_p2[4] = { -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1 };
|
||||
static const float _ps_0p5[4] = { 0.5f, 0.5f, 0.5f, 0.5f };
|
||||
static const float _ps_1[4] = { 1.0f, 1.0f, 1.0f, 1.0f };
|
||||
|
||||
float four_phases[4] __attribute__((aligned(16))) = { _phase, _phase + phase_inc, _phase + 2 * phase_inc, _phase + 3 * phase_inc };
|
||||
float four_phases_inc[4] __attribute__((aligned(16))) = { 4 * phase_inc, 4 * phase_inc, 4 * phase_inc, 4 * phase_inc };
|
||||
float four_phases[4] = { _phase, _phase + phase_inc, _phase + 2 * phase_inc, _phase + 3 * phase_inc };
|
||||
float four_phases_inc[4] = { 4 * phase_inc, 4 * phase_inc, 4 * phase_inc, 4 * phase_inc };
|
||||
four_phases_reg = _mm_load_ps(four_phases);
|
||||
const __m128 four_phases_inc_reg = _mm_load_ps(four_phases_inc);
|
||||
|
||||
@ -239,29 +239,29 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_u_sse2(lv_32fc_t* out, const fl
|
||||
__m128i emm0, emm2, emm4;
|
||||
|
||||
/* declare some SSE constants */
|
||||
static const int _ps_inv_sign_mask[4] __attribute__((aligned(16))) = { ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000 };
|
||||
static const int _ps_sign_mask[4] __attribute__((aligned(16))) = { (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000 };
|
||||
__VOLK_ATTR_ALIGNED(16) static const int _ps_inv_sign_mask[4] = { ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000 };
|
||||
__VOLK_ATTR_ALIGNED(16) static const int _ps_sign_mask[4] = { (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000 };
|
||||
|
||||
static const float _ps_cephes_FOPI[4] __attribute__((aligned(16))) = { 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516 };
|
||||
static const int _pi32_1[4] __attribute__((aligned(16))) = { 1, 1, 1, 1 };
|
||||
static const int _pi32_inv1[4] __attribute__((aligned(16))) = { ~1, ~1, ~1, ~1 };
|
||||
static const int _pi32_2[4] __attribute__((aligned(16))) = { 2, 2, 2, 2};
|
||||
static const int _pi32_4[4] __attribute__((aligned(16))) = { 4, 4, 4, 4};
|
||||
__VOLK_ATTR_ALIGNED(16) static const float _ps_cephes_FOPI[4] = { 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516 };
|
||||
__VOLK_ATTR_ALIGNED(16) static const int _pi32_1[4] = { 1, 1, 1, 1 };
|
||||
__VOLK_ATTR_ALIGNED(16) static const int _pi32_inv1[4] = { ~1, ~1, ~1, ~1 };
|
||||
__VOLK_ATTR_ALIGNED(16) static const int _pi32_2[4] = { 2, 2, 2, 2};
|
||||
__VOLK_ATTR_ALIGNED(16) static const int _pi32_4[4] = { 4, 4, 4, 4};
|
||||
|
||||
static const float _ps_minus_cephes_DP1[4] __attribute__((aligned(16))) = { -0.78515625, -0.78515625, -0.78515625, -0.78515625 };
|
||||
static const float _ps_minus_cephes_DP2[4] __attribute__((aligned(16))) = { -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4 };
|
||||
static const float _ps_minus_cephes_DP3[4] __attribute__((aligned(16))) = { -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8 };
|
||||
static const float _ps_coscof_p0[4] __attribute__((aligned(16))) = { 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005 };
|
||||
static const float _ps_coscof_p1[4] __attribute__((aligned(16))) = { -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003 };
|
||||
static const float _ps_coscof_p2[4] __attribute__((aligned(16))) = { 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002 };
|
||||
static const float _ps_sincof_p0[4] __attribute__((aligned(16))) = { -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4 };
|
||||
static const float _ps_sincof_p1[4] __attribute__((aligned(16))) = { 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3 };
|
||||
static const float _ps_sincof_p2[4] __attribute__((aligned(16))) = { -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1 };
|
||||
static const float _ps_0p5[4] __attribute__((aligned(16))) = { 0.5f, 0.5f, 0.5f, 0.5f };
|
||||
static const float _ps_1[4] __attribute__((aligned(16))) = { 1.0f, 1.0f, 1.0f, 1.0f };
|
||||
__VOLK_ATTR_ALIGNED(16) static const float _ps_minus_cephes_DP1[4] = { -0.78515625, -0.78515625, -0.78515625, -0.78515625 };
|
||||
__VOLK_ATTR_ALIGNED(16) static const float _ps_minus_cephes_DP2[4] = { -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4 };
|
||||
__VOLK_ATTR_ALIGNED(16) static const float _ps_minus_cephes_DP3[4] = { -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8 };
|
||||
__VOLK_ATTR_ALIGNED(16) static const float _ps_coscof_p0[4] = { 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005 };
|
||||
__VOLK_ATTR_ALIGNED(16) static const float _ps_coscof_p1[4] = { -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003 };
|
||||
__VOLK_ATTR_ALIGNED(16) static const float _ps_coscof_p2[4] = { 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002 };
|
||||
__VOLK_ATTR_ALIGNED(16) static const float _ps_sincof_p0[4] = { -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4 };
|
||||
__VOLK_ATTR_ALIGNED(16) static const float _ps_sincof_p1[4] = { 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3 };
|
||||
__VOLK_ATTR_ALIGNED(16) static const float _ps_sincof_p2[4] = { -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1 };
|
||||
__VOLK_ATTR_ALIGNED(16) static const float _ps_0p5[4] = { 0.5f, 0.5f, 0.5f, 0.5f };
|
||||
__VOLK_ATTR_ALIGNED(16) static const float _ps_1[4] = { 1.0f, 1.0f, 1.0f, 1.0f };
|
||||
|
||||
float four_phases[4] __attribute__((aligned(16))) = { _phase, _phase + phase_inc, _phase + 2 * phase_inc, _phase + 3 * phase_inc };
|
||||
float four_phases_inc[4] __attribute__((aligned(16))) = { 4 * phase_inc, 4 * phase_inc, 4 * phase_inc, 4 * phase_inc };
|
||||
__VOLK_ATTR_ALIGNED(16) float four_phases[4] = { _phase, _phase + phase_inc, _phase + 2 * phase_inc, _phase + 3 * phase_inc };
|
||||
__VOLK_ATTR_ALIGNED(16) float four_phases_inc[4] = { 4 * phase_inc, 4 * phase_inc, 4 * phase_inc, 4 * phase_inc };
|
||||
four_phases_reg = _mm_load_ps(four_phases);
|
||||
const __m128 four_phases_inc_reg = _mm_load_ps(four_phases_inc);
|
||||
|
||||
@ -452,29 +452,29 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_a_avx2(lv_32fc_t* out, const fl
|
||||
__m128 aux, c1, s1;
|
||||
|
||||
/* declare some AVX2 constants */
|
||||
static const int _ps_inv_sign_mask[8] __attribute__((aligned(32))) = { ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000 };
|
||||
static const int _ps_sign_mask[8] __attribute__((aligned(32))) = { (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000 };
|
||||
__VOLK_ATTR_ALIGNED(32) static const int _ps_inv_sign_mask[8] = { ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000 };
|
||||
__VOLK_ATTR_ALIGNED(32) static const int _ps_sign_mask[8] = { (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000 };
|
||||
|
||||
static const float _ps_cephes_FOPI[8] __attribute__((aligned(32))) = { 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516 };
|
||||
static const int _pi32_1[8] __attribute__((aligned(32))) = { 1, 1, 1, 1, 1, 1, 1, 1 };
|
||||
static const int _pi32_inv1[8] __attribute__((aligned(32))) = { ~1, ~1, ~1, ~1, ~1, ~1, ~1, ~1 };
|
||||
static const int _pi32_2[8] __attribute__((aligned(32))) = { 2, 2, 2, 2, 2, 2, 2, 2 };
|
||||
static const int _pi32_4[8] __attribute__((aligned(32))) = { 4, 4, 4, 4, 4, 4, 4, 4 };
|
||||
__VOLK_ATTR_ALIGNED(32) static const float _ps_cephes_FOPI[8] = { 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516 };
|
||||
__VOLK_ATTR_ALIGNED(32) static const int _pi32_1[8] = { 1, 1, 1, 1, 1, 1, 1, 1 };
|
||||
__VOLK_ATTR_ALIGNED(32) static const int _pi32_inv1[8] = { ~1, ~1, ~1, ~1, ~1, ~1, ~1, ~1 };
|
||||
__VOLK_ATTR_ALIGNED(32) static const int _pi32_2[8] = { 2, 2, 2, 2, 2, 2, 2, 2 };
|
||||
__VOLK_ATTR_ALIGNED(32) static const int _pi32_4[8] = { 4, 4, 4, 4, 4, 4, 4, 4 };
|
||||
|
||||
static const float _ps_minus_cephes_DP1[8] __attribute__((aligned(32))) = { -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625 };
|
||||
static const float _ps_minus_cephes_DP2[8] __attribute__((aligned(32))) = { -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4 };
|
||||
static const float _ps_minus_cephes_DP3[8] __attribute__((aligned(32))) = { -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8 };
|
||||
static const float _ps_coscof_p0[8] __attribute__((aligned(32))) = { 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005 };
|
||||
static const float _ps_coscof_p1[8] __attribute__((aligned(32))) = { -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003 };
|
||||
static const float _ps_coscof_p2[8] __attribute__((aligned(32))) = { 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002 };
|
||||
static const float _ps_sincof_p0[8] __attribute__((aligned(32))) = { -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4 };
|
||||
static const float _ps_sincof_p1[8] __attribute__((aligned(32))) = { 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3 };
|
||||
static const float _ps_sincof_p2[8] __attribute__((aligned(32))) = { -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1 };
|
||||
static const float _ps_0p5[8] __attribute__((aligned(32))) = { 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f };
|
||||
static const float _ps_1[8] __attribute__((aligned(32))) = { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f };
|
||||
__VOLK_ATTR_ALIGNED(32) static const float _ps_minus_cephes_DP1[8] = { -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625 };
|
||||
__VOLK_ATTR_ALIGNED(32) static const float _ps_minus_cephes_DP2[8] = { -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4 };
|
||||
__VOLK_ATTR_ALIGNED(32) static const float _ps_minus_cephes_DP3[8] = { -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8 };
|
||||
__VOLK_ATTR_ALIGNED(32) static const float _ps_coscof_p0[8] = { 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005 };
|
||||
__VOLK_ATTR_ALIGNED(32) static const float _ps_coscof_p1[8] = { -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003 };
|
||||
__VOLK_ATTR_ALIGNED(32) static const float _ps_coscof_p2[8] = { 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002 };
|
||||
__VOLK_ATTR_ALIGNED(32) static const float _ps_sincof_p0[8] = { -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4 };
|
||||
__VOLK_ATTR_ALIGNED(32) static const float _ps_sincof_p1[8] = { 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3 };
|
||||
__VOLK_ATTR_ALIGNED(32) static const float _ps_sincof_p2[8] = { -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1 };
|
||||
__VOLK_ATTR_ALIGNED(32) static const float _ps_0p5[8] = { 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f };
|
||||
__VOLK_ATTR_ALIGNED(32) static const float _ps_1[8] = { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f };
|
||||
|
||||
float eight_phases[8] __attribute__((aligned(32))) = { _phase, _phase + phase_inc, _phase + 2 * phase_inc, _phase + 3 * phase_inc, _phase + 4 * phase_inc, _phase + 5 * phase_inc, _phase + 6 * phase_inc, _phase + 7 * phase_inc };
|
||||
float eight_phases_inc[8] __attribute__((aligned(32))) = { 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc };
|
||||
__VOLK_ATTR_ALIGNED(32) float eight_phases[8] = { _phase, _phase + phase_inc, _phase + 2 * phase_inc, _phase + 3 * phase_inc, _phase + 4 * phase_inc, _phase + 5 * phase_inc, _phase + 6 * phase_inc, _phase + 7 * phase_inc };
|
||||
__VOLK_ATTR_ALIGNED(32) float eight_phases_inc[8] = { 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc };
|
||||
eight_phases_reg = _mm256_load_ps(eight_phases);
|
||||
const __m256 eight_phases_inc_reg = _mm256_load_ps(eight_phases_inc);
|
||||
|
||||
@ -620,29 +620,29 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_u_avx2(lv_32fc_t* out, const fl
|
||||
__m128 aux, c1, s1;
|
||||
|
||||
/* declare some AVX2 constants */
|
||||
static const int _ps_inv_sign_mask[8] __attribute__((aligned(32))) = { ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000 };
|
||||
static const int _ps_sign_mask[8] __attribute__((aligned(32))) = { (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000 };
|
||||
__VOLK_ATTR_ALIGNED(32) static const int _ps_inv_sign_mask[8] = { ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000 };
|
||||
__VOLK_ATTR_ALIGNED(32) static const int _ps_sign_mask[8] = { (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000 };
|
||||
|
||||
static const float _ps_cephes_FOPI[8] __attribute__((aligned(32))) = { 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516 };
|
||||
static const int _pi32_1[8] __attribute__((aligned(32))) = { 1, 1, 1, 1, 1, 1, 1, 1 };
|
||||
static const int _pi32_inv1[8] __attribute__((aligned(32))) = { ~1, ~1, ~1, ~1, ~1, ~1, ~1, ~1 };
|
||||
static const int _pi32_2[8] __attribute__((aligned(32))) = { 2, 2, 2, 2, 2, 2, 2, 2 };
|
||||
static const int _pi32_4[8] __attribute__((aligned(32))) = { 4, 4, 4, 4, 4, 4, 4, 4 };
|
||||
__VOLK_ATTR_ALIGNED(32) static const float _ps_cephes_FOPI[8] = { 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516 };
|
||||
__VOLK_ATTR_ALIGNED(32) static const int _pi32_1[8] = { 1, 1, 1, 1, 1, 1, 1, 1 };
|
||||
__VOLK_ATTR_ALIGNED(32) static const int _pi32_inv1[8] = { ~1, ~1, ~1, ~1, ~1, ~1, ~1, ~1 };
|
||||
__VOLK_ATTR_ALIGNED(32) static const int _pi32_2[8] = { 2, 2, 2, 2, 2, 2, 2, 2 };
|
||||
__VOLK_ATTR_ALIGNED(32) static const int _pi32_4[8] = { 4, 4, 4, 4, 4, 4, 4, 4 };
|
||||
|
||||
static const float _ps_minus_cephes_DP1[8] __attribute__((aligned(32))) = { -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625 };
|
||||
static const float _ps_minus_cephes_DP2[8] __attribute__((aligned(32))) = { -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4 };
|
||||
static const float _ps_minus_cephes_DP3[8] __attribute__((aligned(32))) = { -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8 };
|
||||
static const float _ps_coscof_p0[8] __attribute__((aligned(32))) = { 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005 };
|
||||
static const float _ps_coscof_p1[8] __attribute__((aligned(32))) = { -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003 };
|
||||
static const float _ps_coscof_p2[8] __attribute__((aligned(32))) = { 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002 };
|
||||
static const float _ps_sincof_p0[8] __attribute__((aligned(32))) = { -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4 };
|
||||
static const float _ps_sincof_p1[8] __attribute__((aligned(32))) = { 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3 };
|
||||
static const float _ps_sincof_p2[8] __attribute__((aligned(32))) = { -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1 };
|
||||
static const float _ps_0p5[8] __attribute__((aligned(32))) = { 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f };
|
||||
static const float _ps_1[8] __attribute__((aligned(32))) = { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f };
|
||||
__VOLK_ATTR_ALIGNED(32) static const float _ps_minus_cephes_DP1[8] = { -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625 };
|
||||
__VOLK_ATTR_ALIGNED(32) static const float _ps_minus_cephes_DP2[8] = { -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4 };
|
||||
__VOLK_ATTR_ALIGNED(32) static const float _ps_minus_cephes_DP3[8] = { -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8 };
|
||||
__VOLK_ATTR_ALIGNED(32) static const float _ps_coscof_p0[8] = { 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005 };
|
||||
__VOLK_ATTR_ALIGNED(32) static const float _ps_coscof_p1[8] = { -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003 };
|
||||
__VOLK_ATTR_ALIGNED(32) static const float _ps_coscof_p2[8] = { 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002 };
|
||||
__VOLK_ATTR_ALIGNED(32) static const float _ps_sincof_p0[8] = { -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4 };
|
||||
__VOLK_ATTR_ALIGNED(32) static const float _ps_sincof_p1[8] = { 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3 };
|
||||
__VOLK_ATTR_ALIGNED(32) static const float _ps_sincof_p2[8] = { -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1 };
|
||||
__VOLK_ATTR_ALIGNED(32) static const float _ps_0p5[8] = { 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f };
|
||||
__VOLK_ATTR_ALIGNED(32) static const float _ps_1[8] = { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f };
|
||||
|
||||
float eight_phases[8] __attribute__((aligned(32))) = { _phase, _phase + phase_inc, _phase + 2 * phase_inc, _phase + 3 * phase_inc, _phase + 4 * phase_inc, _phase + 5 * phase_inc, _phase + 6 * phase_inc, _phase + 7 * phase_inc };
|
||||
float eight_phases_inc[8] __attribute__((aligned(32))) = { 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc };
|
||||
__VOLK_ATTR_ALIGNED(32) float eight_phases[8] = { _phase, _phase + phase_inc, _phase + 2 * phase_inc, _phase + 3 * phase_inc, _phase + 4 * phase_inc, _phase + 5 * phase_inc, _phase + 6 * phase_inc, _phase + 7 * phase_inc };
|
||||
__VOLK_ATTR_ALIGNED(32) float eight_phases_inc[8] = { 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc };
|
||||
eight_phases_reg = _mm256_load_ps(eight_phases);
|
||||
const __m256 eight_phases_inc_reg = _mm256_load_ps(eight_phases_inc);
|
||||
|
||||
|
@ -150,7 +150,12 @@ set(HAVE_XGETBV 0)
|
||||
set(HAVE_AVX_CVTPI32_PS 0)
|
||||
if(CPU_IS_x86)
|
||||
# check to see if the compiler/linker works with the xgetbv instruction
|
||||
file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_xgetbv.c "unsigned long long _xgetbv(unsigned int index) { unsigned int eax, edx; __asm__ __volatile__(\"xgetbv\" : \"=a\"(eax), \"=d\"(edx) : \"c\"(index)); return ((unsigned long long)edx << 32) | eax; } int main (void) { (void) _xgetbv(0); return (0); }")
|
||||
if (NOT MSVC)
|
||||
file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_xgetbv.c "unsigned long long _xgetbv(unsigned int index) { unsigned int eax, edx; __asm__ __volatile__(\"xgetbv\" : \"=a\"(eax), \"=d\"(edx) : \"c\"(index)); return ((unsigned long long)edx << 32) | eax; } int main (void) { (void) _xgetbv(0); return (0); }")
|
||||
else (NOT MSVC)
|
||||
#MSVC defines an intrinsic
|
||||
file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_xgetbv.c "#include <stdio.h> \n #include <intrin.h> \n int main() { int avxSupported = 0; \n#if (_MSC_FULL_VER >= 160040219) \nint cpuInfo[4]; __cpuid(cpuInfo, 1);\nif ((cpuInfo[2] & (1 << 27) || 0) && (cpuInfo[2] & (1 << 28) || 0)) \n{\nunsigned long long xcrFeatureMask = _xgetbv(_XCR_XFEATURE_ENABLED_MASK);\n avxSupported = (xcrFeatureMask & 0x6) == 6;}\n#endif \n return 1- avxSupported; }")
|
||||
endif(NOT MSVC)
|
||||
execute_process(COMMAND ${CMAKE_C_COMPILER} -o
|
||||
${CMAKE_CURRENT_BINARY_DIR}/test_xgetbv
|
||||
${CMAKE_CURRENT_BINARY_DIR}/test_xgetbv.c
|
||||
@ -179,28 +184,33 @@ if(CPU_IS_x86)
|
||||
#########################################################################
|
||||
|
||||
# check to see if the compiler/linker works with cvtpi32_ps intrinsic when using AVX
|
||||
file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_cvtpi32_ps.c "#include <immintrin.h>\nint main (void) {__m128 __a; __m64 __b; __m128 foo = _mm_cvtpi32_ps(__a, __b); return (0); }")
|
||||
execute_process(COMMAND ${CMAKE_C_COMPILER} -mavx -o
|
||||
${CMAKE_CURRENT_BINARY_DIR}/test_cvtpi32_ps
|
||||
${CMAKE_CURRENT_BINARY_DIR}/test_cvtpi32_ps.c
|
||||
OUTPUT_QUIET ERROR_QUIET
|
||||
RESULT_VARIABLE avx_compile_result)
|
||||
if(NOT ${avx_compile_result} EQUAL 0)
|
||||
OVERRULE_ARCH(avx "Compiler missing cvtpi32_ps instrinsic")
|
||||
elseif(NOT CROSSCOMPILE_MULTILIB)
|
||||
execute_process(COMMAND ${CMAKE_CURRENT_BINARY_DIR}/test_cvtpi32_ps
|
||||
OUTPUT_QUIET ERROR_QUIET
|
||||
RESULT_VARIABLE avx_exe_result)
|
||||
if(NOT ${avx_exe_result} EQUAL 0)
|
||||
OVERRULE_ARCH(avx "CPU missing cvtpi32_ps")
|
||||
else()
|
||||
set(HAVE_AVX_CVTPI32_PS 1)
|
||||
endif()
|
||||
else()
|
||||
set(HAVE_AVX_CVTPI32_PS 1)
|
||||
endif()
|
||||
file(REMOVE ${CMAKE_CURRENT_BINARY_DIR}/test_cvtpi32_ps
|
||||
${CMAKE_CURRENT_BINARY_DIR}/test_cvtpi32_ps.c)
|
||||
if (CMAKE_SIZEOF_VOID_P EQUAL 4)
|
||||
file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_cvtpi32_ps.c "#include <immintrin.h>\nint main (void) {__m128 __a; __m64 __b; __m128 foo = _mm_cvtpi32_ps(__a, __b); return (0); }")
|
||||
execute_process(COMMAND ${CMAKE_C_COMPILER} -mavx -o
|
||||
${CMAKE_CURRENT_BINARY_DIR}/test_cvtpi32_ps
|
||||
${CMAKE_CURRENT_BINARY_DIR}/test_cvtpi32_ps.c
|
||||
OUTPUT_QUIET ERROR_QUIET
|
||||
RESULT_VARIABLE avx_compile_result)
|
||||
if(NOT ${avx_compile_result} EQUAL 0)
|
||||
OVERRULE_ARCH(avx "Compiler missing cvtpi32_ps instrinsic")
|
||||
elseif(NOT CROSSCOMPILE_MULTILIB)
|
||||
execute_process(COMMAND ${CMAKE_CURRENT_BINARY_DIR}/test_cvtpi32_ps
|
||||
OUTPUT_QUIET ERROR_QUIET
|
||||
RESULT_VARIABLE avx_exe_result)
|
||||
if(NOT ${avx_exe_result} EQUAL 0)
|
||||
OVERRULE_ARCH(avx "CPU missing cvtpi32_ps")
|
||||
else()
|
||||
set(HAVE_AVX_CVTPI32_PS 1)
|
||||
endif()
|
||||
else()
|
||||
set(HAVE_AVX_CVTPI32_PS 1)
|
||||
endif()
|
||||
file(REMOVE ${CMAKE_CURRENT_BINARY_DIR}/test_cvtpi32_ps
|
||||
${CMAKE_CURRENT_BINARY_DIR}/test_cvtpi32_ps.c)
|
||||
else(CMAKE_SIZEOF_VOID_P EQUAL 4)
|
||||
# 64-bit compilations won't need this command so don't overrule AVX
|
||||
set(HAVE_AVX_CVTPI32_PS 0)
|
||||
endif(CMAKE_SIZEOF_VOID_P EQUAL 4)
|
||||
|
||||
# Disable SSE4a if Clang is less than version 3.2
|
||||
if("${CMAKE_C_COMPILER_ID}" STREQUAL "Clang")
|
||||
|
@ -87,6 +87,7 @@ std::vector<volk_gnsssdr_test_case_t> init_test_list(volk_gnsssdr_test_params_t
|
||||
(VOLK_INIT_PUPP(volk_gnsssdr_16ic_rotatorpuppet_16ic, volk_gnsssdr_16ic_s32fc_x2_rotator_16ic, test_params_int1))
|
||||
(VOLK_INIT_PUPP(volk_gnsssdr_16ic_resamplerpuppet_16ic, volk_gnsssdr_16ic_resampler_16ic, test_params))
|
||||
(VOLK_INIT_PUPP(volk_gnsssdr_16ic_resamplerxnpuppet_16ic, volk_gnsssdr_16ic_xn_resampler_16ic_xn, test_params))
|
||||
(VOLK_INIT_PUPP(volk_gnsssdr_16ic_resamplerxnpuppet2_16ic, volk_gnsssdr_16ic_xn_resampler2_16ic_xn, test_params))
|
||||
(VOLK_INIT_PUPP(volk_gnsssdr_32fc_resamplerxnpuppet_32fc, volk_gnsssdr_32fc_xn_resampler_32fc_xn, test_params))
|
||||
(VOLK_INIT_PUPP(volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic, volk_gnsssdr_16ic_x2_dot_prod_16ic_xn, test_params))
|
||||
(VOLK_INIT_PUPP(volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic, volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn, test_params_int16))
|
||||
|
@ -211,6 +211,11 @@ gps_l1_ca_dll_pll_c_aid_tracking_cc::gps_l1_ca_dll_pll_c_aid_tracking_cc(
|
||||
d_enable_extended_integration = false;
|
||||
d_preamble_synchronized = false;
|
||||
d_correlation_symbol_counter = 0;
|
||||
d_rem_code_phase_integer_samples = 0.0;
|
||||
d_code_error_chips_Ti = 0.0;
|
||||
d_code_error_filt_chips_s = 0.0;
|
||||
d_carr_phase_error_secs_Ti = 0.0;
|
||||
d_preamble_timestamp_s = 0.0;
|
||||
//set_min_output_buffer((long int)300);
|
||||
}
|
||||
|
||||
@ -377,8 +382,8 @@ int gps_l1_ca_dll_pll_c_aid_tracking_cc::general_work (int noutput_items, gr_vec
|
||||
bool enable_dll_pll;
|
||||
if (d_enable_extended_integration == true)
|
||||
{
|
||||
long int symbol_diff = round(1000.0*((static_cast<double>(d_sample_counter) + d_rem_code_phase_samples) / static_cast<double>(d_fs_in)-d_preamble_timestamp_s));
|
||||
if (symbol_diff>0 and symbol_diff % d_extend_correlation_ms == 0)
|
||||
long int symbol_diff = round(1000.0 * ((static_cast<double>(d_sample_counter) + d_rem_code_phase_samples) / static_cast<double>(d_fs_in) - d_preamble_timestamp_s));
|
||||
if (symbol_diff > 0 and symbol_diff % d_extend_correlation_ms == 0)
|
||||
{
|
||||
// compute coherent integration and enable tracking loop
|
||||
// perform coherent integration using correlator output history
|
||||
@ -399,12 +404,13 @@ int gps_l1_ca_dll_pll_c_aid_tracking_cc::general_work (int noutput_items, gr_vec
|
||||
d_carrier_loop_filter.set_params(10.0, d_pll_bw_narrow_hz,2);
|
||||
d_preamble_synchronized = true;
|
||||
std::cout << "Enabled extended correlator for CH "<< d_channel <<" : Satellite " << Gnss_Satellite(systemName[sys], d_acquisition_gnss_synchro->PRN)
|
||||
<<" dll_narrow_bw=" << d_dll_bw_narrow_hz << " pll_narrow_bw=" << d_pll_bw_narrow_hz << std::endl;
|
||||
<<" pll_bw = " << d_pll_bw_hz << " [Hz], pll_narrow_bw = " << d_pll_bw_narrow_hz << " [Hz]" << std::endl
|
||||
<< "Enabled extended correlator for CH "<< d_channel <<" : Satellite " << Gnss_Satellite(systemName[sys], d_acquisition_gnss_synchro->PRN)
|
||||
<<" dll_bw = " << d_dll_bw_hz << " [Hz], dll_narrow_bw = " << d_dll_bw_narrow_hz << " [Hz]" << std::endl;
|
||||
}
|
||||
// UPDATE INTEGRATION TIME
|
||||
CURRENT_INTEGRATION_TIME_S = static_cast<double>(d_extend_correlation_ms) * GPS_L1_CA_CODE_PERIOD;
|
||||
enable_dll_pll = true;
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
|
Loading…
x
Reference in New Issue
Block a user