From 676c1506da92740a47a8dfc215e7cee892ede367 Mon Sep 17 00:00:00 2001 From: Cillian O'Driscoll Date: Mon, 11 Sep 2017 15:15:27 +0100 Subject: [PATCH 1/4] Updated volk_gnsssdr_module for real codes Added 16i and 32f resamplers and 32fc_32f and 16ic_16i rotator dot product to enable use of real (rather than complex) local code replicas --- .../volk_gnsssdr_avx_intrinsics.h | 8 + .../volk_gnsssdr_16i_resamplerxnpuppet_16i.h | 282 +++ .../volk_gnsssdr_16i_xn_resampler_16i_xn.h | 608 +++++++ ...nsssdr_16ic_16i_rotator_dot_prod_16ic_xn.h | 1600 +++++++++++++++++ ...dr_16ic_16i_rotator_dotprodxnpuppet_16ic.h | 384 ++++ ...volk_gnsssdr_16ic_resamplerxnpuppet_16ic.h | 16 +- .../volk_gnsssdr_32f_resamplerxnpuppet_32f.h | 279 +++ .../volk_gnsssdr_32f_xn_resampler_32f_xn.h | 610 +++++++ ...nsssdr_32fc_32f_rotator_dot_prod_32fc_xn.h | 320 ++++ ...dr_32fc_32f_rotator_dotprodxnpuppet_32fc.h | 132 ++ ...volk_gnsssdr_32fc_resamplerxnpuppet_32fc.h | 41 +- .../volk_gnsssdr/lib/kernel_tests.h | 4 + .../volk_gnsssdr/lib/qa_utils.cc | 2 +- 13 files changed, 4257 insertions(+), 29 deletions(-) create mode 100644 src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16i_resamplerxnpuppet_16i.h create mode 100644 src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16i_xn_resampler_16i_xn.h create mode 100644 src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn.h create mode 100644 src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic.h create mode 100644 src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_resamplerxnpuppet_32f.h create mode 100644 src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_xn_resampler_32f_xn.h create mode 100644 src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn.h create mode 100644 src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc.h diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_avx_intrinsics.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_avx_intrinsics.h index b737ea164..809aa98f9 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_avx_intrinsics.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_avx_intrinsics.h @@ -61,6 +61,14 @@ _mm256_magnitudesquared_ps(__m256 cplxValue1, __m256 cplxValue2){ return _mm256_hadd_ps(complex1, complex2); // Add the I2 and Q2 values } +static inline __m256 _mm256_complexnormalise_ps( __m256 z ){ + __m256 tmp1 = _mm256_mul_ps(z, z); + __m256 tmp2 = _mm256_hadd_ps(tmp1, tmp1); + tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8); + tmp2 = _mm256_sqrt_ps(tmp1); + return _mm256_div_ps(z, tmp2); +} + static inline __m256 _mm256_magnitude_ps(__m256 cplxValue1, __m256 cplxValue2){ return _mm256_sqrt_ps(_mm256_magnitudesquared_ps(cplxValue1, cplxValue2)); diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16i_resamplerxnpuppet_16i.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16i_resamplerxnpuppet_16i.h new file mode 100644 index 000000000..3c1c0f817 --- /dev/null 
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16i_resamplerxnpuppet_16i.h @@ -0,0 +1,282 @@ +/*! + * \file volk_gnsssdr_16i_resamplerxnpuppet_16i.h + * \brief VOLK_GNSSSDR puppet for the multiple 16-bit vector resampler kernel. + * \authors + * + * VOLK_GNSSSDR puppet for integrating the multiple resampler into the test system + * + * ------------------------------------------------------------------------- + * + * Copyright (C) 2010-2017 (see AUTHORS file for a list of contributors) + * + * GNSS-SDR is a software defined Global Navigation + * Satellite Systems receiver + * + * This file is part of GNSS-SDR. + * + * GNSS-SDR is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * GNSS-SDR is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNSS-SDR. If not, see . + * + * ------------------------------------------------------------------------- + */ + +#ifndef INCLUDED_volk_gnsssdr_16i_resamplerxnpuppet_16i_H +#define INCLUDED_volk_gnsssdr_16i_resamplerxnpuppet_16i_H + +#include "volk_gnsssdr/volk_gnsssdr_16i_xn_resampler_16i_xn.h" +#include +#include +#include +#include + +#ifdef LV_HAVE_GENERIC +static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_generic(int16_t* result, const int16_t* local_code, unsigned int num_points) +{ + int code_length_chips = 2046; + float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + int num_out_vectors = 3; + unsigned int n; + float rem_code_phase_chips = -0.234; + float shifts_chips[3] = { -0.1, 0.0, 0.1 }; + int16_t** result_aux = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); + + for(n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); + } + + volk_gnsssdr_16i_xn_resampler_16i_xn_generic(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); + + memcpy((int16_t*)result, (int16_t*)result_aux[0], sizeof(int16_t) * num_points); + + for(n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } + volk_gnsssdr_free(result_aux); +} + +#endif /* LV_HAVE_GENERIC */ + +#ifdef LV_HAVE_SSE3 +static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_a_sse3(int16_t* result, const int16_t* local_code, unsigned int num_points) +{ + int code_length_chips = 2046; + float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + int num_out_vectors = 3; + float rem_code_phase_chips = -0.234; + unsigned int n; + float shifts_chips[3] = { -0.1, 0.0, 0.1 }; + int16_t** result_aux = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); + + for(n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); + } + + volk_gnsssdr_16i_xn_resampler_16i_xn_a_sse3(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, 
code_length_chips, num_out_vectors, num_points); + + memcpy((int16_t*)result, (int16_t*)result_aux[0], sizeof(int16_t) * num_points); + + for(n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } + volk_gnsssdr_free(result_aux); +} + +#endif + +#ifdef LV_HAVE_SSE3 +static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_u_sse3(int16_t* result, const int16_t* local_code, unsigned int num_points) +{ + int code_length_chips = 2046; + float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + int num_out_vectors = 3; + float rem_code_phase_chips = -0.234; + unsigned int n; + float shifts_chips[3] = { -0.1, 0.0, 0.1 }; + int16_t** result_aux = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); + + for(n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); + } + + volk_gnsssdr_16i_xn_resampler_16i_xn_u_sse3(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); + + memcpy((int16_t*)result, (int16_t*)result_aux[0], sizeof(int16_t) * num_points); + + for(n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } + volk_gnsssdr_free(result_aux); +} + +#endif + + +#ifdef LV_HAVE_SSE4_1 +static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_u_sse4_1(int16_t* result, const int16_t* local_code, unsigned int num_points) +{ + int code_length_chips = 2046; + float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + int num_out_vectors = 3; + float rem_code_phase_chips = -0.234; + unsigned int n; + float shifts_chips[3] = { -0.1, 0.0, 0.1 }; + int16_t** result_aux = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); + + for(n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); + } + + volk_gnsssdr_16i_xn_resampler_16i_xn_u_sse4_1(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); + + memcpy((int16_t*)result, (int16_t*)result_aux[0], sizeof(int16_t) * num_points); + + for(n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } + volk_gnsssdr_free(result_aux); +} + +#endif + + +#ifdef LV_HAVE_SSE4_1 +static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_a_sse4_1(int16_t* result, const int16_t* local_code, unsigned int num_points) +{ + int code_length_chips = 2046; + float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + int num_out_vectors = 3; + float rem_code_phase_chips = -0.234; + unsigned int n; + float shifts_chips[3] = { -0.1, 0.0, 0.1 }; + int16_t** result_aux = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); + + for(n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); + } + + volk_gnsssdr_16i_xn_resampler_16i_xn_a_sse4_1(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); + + memcpy((int16_t*)result, (int16_t*)result_aux[0], sizeof(int16_t) * num_points); + + for(n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } + volk_gnsssdr_free(result_aux); +} + 
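For reference, the multi-output kernel wrapped by these puppets can be driven directly as in the following minimal sketch (illustrative only: names such as ca_code and num_samples are placeholders, and the 0.25-chip Early/Prompt/Late spacing is just an example value):

    /* three zero-hold resampled replicas of one real-valued local code */
    float shifts_chips[3] = { -0.25f, 0.0f, 0.25f };   /* Early, Prompt, Late offsets [chips] */
    int16_t* replicas[3];
    unsigned int tap;
    for (tap = 0; tap < 3; tap++)
        {
            replicas[tap] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_samples, volk_gnsssdr_get_alignment());
        }
    volk_gnsssdr_16i_xn_resampler_16i_xn_generic(replicas, ca_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, 3, num_samples);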
+#endif + + +#ifdef LV_HAVE_AVX +static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_u_avx(int16_t* result, const int16_t* local_code, unsigned int num_points) +{ + int code_length_chips = 2046; + float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + int num_out_vectors = 3; + float rem_code_phase_chips = -0.234; + unsigned int n; + float shifts_chips[3] = { -0.1, 0.0, 0.1 }; + int16_t** result_aux = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); + + for(n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); + } + + volk_gnsssdr_16i_xn_resampler_16i_xn_u_avx(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); + + memcpy((int16_t*)result, (int16_t*)result_aux[0], sizeof(int16_t) * num_points); + + for(n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } + volk_gnsssdr_free(result_aux); +} + +#endif + + +#ifdef LV_HAVE_AVX +static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_a_avx(int16_t* result, const int16_t* local_code, unsigned int num_points) +{ + int code_length_chips = 2046; + float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + int num_out_vectors = 3; + float rem_code_phase_chips = -0.234; + unsigned int n; + float shifts_chips[3] = { -0.1, 0.0, 0.1 }; + int16_t** result_aux = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); + + for(n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); + } + + volk_gnsssdr_16i_xn_resampler_16i_xn_a_avx(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); + + memcpy((int16_t*)result, (int16_t*)result_aux[0], sizeof(int16_t) * num_points); + + for(n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } + volk_gnsssdr_free(result_aux); +} + +#endif + + +#ifdef LV_HAVE_NEON +static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_neon(int16_t* result, const int16_t* local_code, unsigned int num_points) +{ + int code_length_chips = 2046; + float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + int num_out_vectors = 3; + float rem_code_phase_chips = -0.234; + unsigned int n; + float shifts_chips[3] = { -0.1, 0.0, 0.1 }; + int16_t** result_aux = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); + + for(n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); + } + + volk_gnsssdr_16i_xn_resampler_16i_xn_neon(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); + + memcpy((int16_t*)result, (int16_t*)result_aux[0], sizeof(int16_t) * num_points); + + for(n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } + volk_gnsssdr_free(result_aux); +} + +#endif + +#endif // INCLUDED_volk_gnsssdr_16i_resamplerpuppet_16i_H + diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16i_xn_resampler_16i_xn.h 
b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16i_xn_resampler_16i_xn.h new file mode 100644 index 000000000..0d09df273 --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16i_xn_resampler_16i_xn.h @@ -0,0 +1,608 @@ +/*! + * \file volk_gnsssdr_16i_xn_resampler_16i_xn.h + * \brief VOLK_GNSSSDR kernel: Resamples N 16 bits integer short vectors using zero hold resample algorithm. + * \authors
<ul>
+ *          <li> Cillian O'Driscoll, 2017. cillian.odriscoll(at)gmail.com
+ *          </ul>
+ *
+ * + * VOLK_GNSSSDR kernel that resamples N 16 bits integer short complex vectors using zero hold resample algorithm. + * It resamples a single GNSS local code signal replica into N vectors fractional-resampled and fractional-delayed + * (i.e. it creates the Early, Prompt, and Late code replicas) + * + * ------------------------------------------------------------------------- + * + * Copyright (C) 2010-2017 (see AUTHORS file for a list of contributors) + * + * GNSS-SDR is a software defined Global Navigation + * Satellite Systems receiver + * + * This file is part of GNSS-SDR. + * + * GNSS-SDR is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * GNSS-SDR is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNSS-SDR. If not, see . + * + * ------------------------------------------------------------------------- + */ + +/*! + * \page volk_gnsssdr_16i_xn_resampler_16i_xn + * + * \b Overview + * + * Resamples a complex vector (16-bit integer each component), providing \p num_out_vectors outputs. + * + * Dispatcher Prototype + * \code + * void volk_gnsssdr_16i_xn_resampler_16i_xn(int16_t** result, const int16_t* local_code, float* rem_code_phase_chips, float code_phase_step_chips, float* shifts_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_points) + * \endcode + * + * \b Inputs + * \li local_code: Vector to be resampled. + * \li rem_code_phase_chips: Remnant code phase [chips]. + * \li code_phase_step_chips: Phase increment per sample [chips/sample]. + * \li shifts_chips: Vector of floats that defines the spacing (in chips) between the replicas of \p local_code + * \li code_length_chips: Code length in chips. + * \li num_out_vectors: Number of output vectors. + * \li num_points: The number of data values to be in the resampled vector. + * + * \b Outputs + * \li result: Pointer to a vector of pointers where the results will be stored. + * + */ + +#ifndef INCLUDED_volk_gnsssdr_16i_xn_resampler_16i_xn_H +#define INCLUDED_volk_gnsssdr_16i_xn_resampler_16i_xn_H + +#include +#include +#include +#include +#include +#include + + +#ifdef LV_HAVE_GENERIC + +static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_generic(int16_t** result, const int16_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, float* shifts_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_points) +{ + int local_code_chip_index; + int current_correlator_tap; + int n; + for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++) + { + for (n = 0; n < num_points; n++) + { + // resample code for current tap + local_code_chip_index = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); + //Take into account that in multitap correlators, the shifts can be negative! 
+ if (local_code_chip_index < 0) local_code_chip_index += (int)code_length_chips * (abs(local_code_chip_index) / code_length_chips + 1); + local_code_chip_index = local_code_chip_index % code_length_chips; + result[current_correlator_tap][n] = local_code[local_code_chip_index]; + } + } +} + +#endif /*LV_HAVE_GENERIC*/ + +#ifdef LV_HAVE_SSE4_1 +#include +static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_a_sse4_1(int16_t** result, const int16_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, float* shifts_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_points) +{ + int16_t** _result = result; + const unsigned int quarterPoints = num_points / 4; + int current_correlator_tap; + unsigned int n; + unsigned int k; + const __m128 fours = _mm_set1_ps(4.0f); + const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips); + const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips); + + __VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4]; + int local_code_chip_index_; + + const __m128i zeros = _mm_setzero_si128(); + const __m128 code_length_chips_reg_f = _mm_set_ps1((float)code_length_chips); + const __m128i code_length_chips_reg_i = _mm_set1_epi32((int)code_length_chips); + __m128i local_code_chip_index_reg, aux_i, negatives, i; + __m128 aux, aux2, shifts_chips_reg, c, cTrunc, base; + + for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++) + { + shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]); + aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg); + __m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f); + for(n = 0; n < quarterPoints; n++) + { + aux = _mm_mul_ps(code_phase_step_chips_reg, indexn); + aux = _mm_add_ps(aux, aux2); + // floor + aux = _mm_floor_ps(aux); + + // fmod + c = _mm_div_ps(aux, code_length_chips_reg_f); + i = _mm_cvttps_epi32(c); + cTrunc = _mm_cvtepi32_ps(i); + base = _mm_mul_ps(cTrunc, code_length_chips_reg_f); + local_code_chip_index_reg = _mm_cvtps_epi32(_mm_sub_ps(aux, base)); + + negatives = _mm_cmplt_epi32(local_code_chip_index_reg, zeros); + aux_i = _mm_and_si128(code_length_chips_reg_i, negatives); + local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i); + _mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg); + for(k = 0; k < 4; ++k) + { + _result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]]; + } + indexn = _mm_add_ps(indexn, fours); + } + for(n = quarterPoints * 4; n < num_points; n++) + { + // resample code for current tap + local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); + //Take into account that in multitap correlators, the shifts can be negative! 
+ if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); + local_code_chip_index_ = local_code_chip_index_ % code_length_chips; + _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; + } + } +} + +#endif + + +#ifdef LV_HAVE_SSE4_1 +#include +static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_u_sse4_1(int16_t** result, const int16_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, float* shifts_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_points) +{ + int16_t** _result = result; + const unsigned int quarterPoints = num_points / 4; + int current_correlator_tap; + unsigned int n; + unsigned int k; + const __m128 fours = _mm_set1_ps(4.0f); + const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips); + const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips); + + __VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4]; + int local_code_chip_index_; + + const __m128i zeros = _mm_setzero_si128(); + const __m128 code_length_chips_reg_f = _mm_set_ps1((float)code_length_chips); + const __m128i code_length_chips_reg_i = _mm_set1_epi32((int)code_length_chips); + __m128i local_code_chip_index_reg, aux_i, negatives, i; + __m128 aux, aux2, shifts_chips_reg, c, cTrunc, base; + + for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++) + { + shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]); + aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg); + __m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f); + for(n = 0; n < quarterPoints; n++) + { + aux = _mm_mul_ps(code_phase_step_chips_reg, indexn); + aux = _mm_add_ps(aux, aux2); + // floor + aux = _mm_floor_ps(aux); + + // fmod + c = _mm_div_ps(aux, code_length_chips_reg_f); + i = _mm_cvttps_epi32(c); + cTrunc = _mm_cvtepi32_ps(i); + base = _mm_mul_ps(cTrunc, code_length_chips_reg_f); + local_code_chip_index_reg = _mm_cvtps_epi32(_mm_sub_ps(aux, base)); + + negatives = _mm_cmplt_epi32(local_code_chip_index_reg, zeros); + aux_i = _mm_and_si128(code_length_chips_reg_i, negatives); + local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i); + _mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg); + for(k = 0; k < 4; ++k) + { + _result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]]; + } + indexn = _mm_add_ps(indexn, fours); + } + for(n = quarterPoints * 4; n < num_points; n++) + { + // resample code for current tap + local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); + //Take into account that in multitap correlators, the shifts can be negative! 
+ if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); + local_code_chip_index_ = local_code_chip_index_ % code_length_chips; + _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; + } + } +} + +#endif + + +#ifdef LV_HAVE_SSE3 +#include +static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_a_sse3(int16_t** result, const int16_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, float* shifts_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_points) +{ + int16_t** _result = result; + const unsigned int quarterPoints = num_points / 4; + int current_correlator_tap; + unsigned int n; + unsigned int k; + const __m128 ones = _mm_set1_ps(1.0f); + const __m128 fours = _mm_set1_ps(4.0f); + const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips); + const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips); + + __VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4]; + int local_code_chip_index_; + + const __m128i zeros = _mm_setzero_si128(); + const __m128 code_length_chips_reg_f = _mm_set_ps1((float)code_length_chips); + const __m128i code_length_chips_reg_i = _mm_set1_epi32((int)code_length_chips); + __m128i local_code_chip_index_reg, aux_i, negatives, i; + __m128 aux, aux2, shifts_chips_reg, fi, igx, j, c, cTrunc, base; + + for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++) + { + shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]); + aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg); + __m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f); + for(n = 0; n < quarterPoints; n++) + { + aux = _mm_mul_ps(code_phase_step_chips_reg, indexn); + aux = _mm_add_ps(aux, aux2); + // floor + i = _mm_cvttps_epi32(aux); + fi = _mm_cvtepi32_ps(i); + igx = _mm_cmpgt_ps(fi, aux); + j = _mm_and_ps(igx, ones); + aux = _mm_sub_ps(fi, j); + // fmod + c = _mm_div_ps(aux, code_length_chips_reg_f); + i = _mm_cvttps_epi32(c); + cTrunc = _mm_cvtepi32_ps(i); + base = _mm_mul_ps(cTrunc, code_length_chips_reg_f); + local_code_chip_index_reg = _mm_cvtps_epi32(_mm_sub_ps(aux, base)); + + negatives = _mm_cmplt_epi32(local_code_chip_index_reg, zeros); + aux_i = _mm_and_si128(code_length_chips_reg_i, negatives); + local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i); + _mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg); + for(k = 0; k < 4; ++k) + { + _result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]]; + } + indexn = _mm_add_ps(indexn, fours); + } + for(n = quarterPoints * 4; n < num_points; n++) + { + // resample code for current tap + local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); + //Take into account that in multitap correlators, the shifts can be negative! 
+ if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); + local_code_chip_index_ = local_code_chip_index_ % code_length_chips; + _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; + } + } +} + +#endif + + +#ifdef LV_HAVE_SSE3 +#include +static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_u_sse3(int16_t** result, const int16_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, float* shifts_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_points) +{ + int16_t** _result = result; + const unsigned int quarterPoints = num_points / 4; + int current_correlator_tap; + unsigned int n; + unsigned int k; + const __m128 ones = _mm_set1_ps(1.0f); + const __m128 fours = _mm_set1_ps(4.0f); + const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips); + const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips); + + __VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4]; + int local_code_chip_index_; + + const __m128i zeros = _mm_setzero_si128(); + const __m128 code_length_chips_reg_f = _mm_set_ps1((float)code_length_chips); + const __m128i code_length_chips_reg_i = _mm_set1_epi32((int)code_length_chips); + __m128i local_code_chip_index_reg, aux_i, negatives, i; + __m128 aux, aux2, shifts_chips_reg, fi, igx, j, c, cTrunc, base; + + for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++) + { + shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]); + aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg); + __m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f); + for(n = 0; n < quarterPoints; n++) + { + aux = _mm_mul_ps(code_phase_step_chips_reg, indexn); + aux = _mm_add_ps(aux, aux2); + // floor + i = _mm_cvttps_epi32(aux); + fi = _mm_cvtepi32_ps(i); + igx = _mm_cmpgt_ps(fi, aux); + j = _mm_and_ps(igx, ones); + aux = _mm_sub_ps(fi, j); + // fmod + c = _mm_div_ps(aux, code_length_chips_reg_f); + i = _mm_cvttps_epi32(c); + cTrunc = _mm_cvtepi32_ps(i); + base = _mm_mul_ps(cTrunc, code_length_chips_reg_f); + local_code_chip_index_reg = _mm_cvtps_epi32(_mm_sub_ps(aux, base)); + + negatives = _mm_cmplt_epi32(local_code_chip_index_reg, zeros); + aux_i = _mm_and_si128(code_length_chips_reg_i, negatives); + local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i); + _mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg); + for(k = 0; k < 4; ++k) + { + _result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]]; + } + indexn = _mm_add_ps(indexn, fours); + } + for(n = quarterPoints * 4; n < num_points; n++) + { + // resample code for current tap + local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); + //Take into account that in multitap correlators, the shifts can be negative! 
+ if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); + local_code_chip_index_ = local_code_chip_index_ % code_length_chips; + _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; + } + } +} + +#endif + + +#ifdef LV_HAVE_AVX +#include +static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_a_avx(int16_t** result, const int16_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, float* shifts_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_points) +{ + int16_t** _result = result; + const unsigned int avx_iters = num_points / 8; + int current_correlator_tap; + unsigned int n; + unsigned int k; + const __m256 eights = _mm256_set1_ps(8.0f); + const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips); + const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips); + + __VOLK_ATTR_ALIGNED(32) int local_code_chip_index[8]; + int local_code_chip_index_; + + const __m256 zeros = _mm256_setzero_ps(); + const __m256 code_length_chips_reg_f = _mm256_set1_ps((float)code_length_chips); + const __m256 n0 = _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f); + + __m256i local_code_chip_index_reg, i; + __m256 aux, aux2, aux3, shifts_chips_reg, c, cTrunc, base, negatives, indexn; + + for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++) + { + shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]); + aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg); + indexn = n0; + for(n = 0; n < avx_iters; n++) + { + __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0); + __VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3); + aux = _mm256_mul_ps(code_phase_step_chips_reg, indexn); + aux = _mm256_add_ps(aux, aux2); + // floor + aux = _mm256_floor_ps(aux); + + // fmod + c = _mm256_div_ps(aux, code_length_chips_reg_f); + i = _mm256_cvttps_epi32(c); + cTrunc = _mm256_cvtepi32_ps(i); + base = _mm256_mul_ps(cTrunc, code_length_chips_reg_f); + local_code_chip_index_reg = _mm256_cvttps_epi32(_mm256_sub_ps(aux, base)); + + // no negatives + c = _mm256_cvtepi32_ps(local_code_chip_index_reg); + negatives = _mm256_cmp_ps(c, zeros, 0x01 ); + aux3 = _mm256_and_ps(code_length_chips_reg_f, negatives); + aux = _mm256_add_ps(c, aux3); + local_code_chip_index_reg = _mm256_cvttps_epi32(aux); + + _mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg); + for(k = 0; k < 8; ++k) + { + _result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]]; + } + indexn = _mm256_add_ps(indexn, eights); + } + } + _mm256_zeroupper(); + for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++) + { + for(n = avx_iters * 8; n < num_points; n++) + { + // resample code for current tap + local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); + //Take into account that in multitap correlators, the shifts can be negative! 
+ if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); + local_code_chip_index_ = local_code_chip_index_ % code_length_chips; + _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; + } + } +} + +#endif + + +#ifdef LV_HAVE_AVX +#include +static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_u_avx(int16_t** result, const int16_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, float* shifts_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_points) +{ + int16_t** _result = result; + const unsigned int avx_iters = num_points / 8; + int current_correlator_tap; + unsigned int n; + unsigned int k; + const __m256 eights = _mm256_set1_ps(8.0f); + const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips); + const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips); + + __VOLK_ATTR_ALIGNED(32) int local_code_chip_index[8]; + int local_code_chip_index_; + + const __m256 zeros = _mm256_setzero_ps(); + const __m256 code_length_chips_reg_f = _mm256_set1_ps((float)code_length_chips); + const __m256 n0 = _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f); + + __m256i local_code_chip_index_reg, i; + __m256 aux, aux2, aux3, shifts_chips_reg, c, cTrunc, base, negatives, indexn; + + for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++) + { + shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]); + aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg); + indexn = n0; + for(n = 0; n < avx_iters; n++) + { + __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0); + __VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3); + aux = _mm256_mul_ps(code_phase_step_chips_reg, indexn); + aux = _mm256_add_ps(aux, aux2); + // floor + aux = _mm256_floor_ps(aux); + + // fmod + c = _mm256_div_ps(aux, code_length_chips_reg_f); + i = _mm256_cvttps_epi32(c); + cTrunc = _mm256_cvtepi32_ps(i); + base = _mm256_mul_ps(cTrunc, code_length_chips_reg_f); + local_code_chip_index_reg = _mm256_cvttps_epi32(_mm256_sub_ps(aux, base)); + + // no negatives + c = _mm256_cvtepi32_ps(local_code_chip_index_reg); + negatives = _mm256_cmp_ps(c, zeros, 0x01 ); + aux3 = _mm256_and_ps(code_length_chips_reg_f, negatives); + aux = _mm256_add_ps(c, aux3); + local_code_chip_index_reg = _mm256_cvttps_epi32(aux); + + _mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg); + for(k = 0; k < 8; ++k) + { + _result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]]; + } + indexn = _mm256_add_ps(indexn, eights); + } + } + _mm256_zeroupper(); + for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++) + { + for(n = avx_iters * 8; n < num_points; n++) + { + // resample code for current tap + local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); + //Take into account that in multitap correlators, the shifts can be negative! 
+ if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); + local_code_chip_index_ = local_code_chip_index_ % code_length_chips; + _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; + } + } +} + +#endif + + +#ifdef LV_HAVE_NEON +#include +static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_neon(int16_t** result, const int16_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, float* shifts_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_points) +{ + int16_t** _result = result; + const unsigned int neon_iters = num_points / 4; + const int32x4_t ones = vdupq_n_s32(1); + const float32x4_t fours = vdupq_n_f32(4.0f); + const float32x4_t rem_code_phase_chips_reg = vdupq_n_f32(rem_code_phase_chips); + const float32x4_t code_phase_step_chips_reg = vdupq_n_f32(code_phase_step_chips); + + __VOLK_ATTR_ALIGNED(16) int32_t local_code_chip_index[4]; + int32_t local_code_chip_index_; + + const int32x4_t zeros = vdupq_n_s32(0); + const float32x4_t code_length_chips_reg_f = vdupq_n_f32((float)code_length_chips); + const int32x4_t code_length_chips_reg_i = vdupq_n_s32((int32_t)code_length_chips); + int32x4_t local_code_chip_index_reg, aux_i, negatives, i; + float32x4_t aux, aux2, shifts_chips_reg, fi, c, j, cTrunc, base, indexn, reciprocal; + __VOLK_ATTR_ALIGNED(16) const float vec[4] = { 0.0f, 1.0f, 2.0f, 3.0f }; + uint32x4_t igx; + reciprocal = vrecpeq_f32(code_length_chips_reg_f); + reciprocal = vmulq_f32(vrecpsq_f32(code_length_chips_reg_f, reciprocal), reciprocal); + reciprocal = vmulq_f32(vrecpsq_f32(code_length_chips_reg_f, reciprocal), reciprocal); // this refinement is required! + float32x4_t n0 = vld1q_f32((float*)vec); + int current_correlator_tap; + unsigned int n; + unsigned int k; + for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++) + { + shifts_chips_reg = vdupq_n_f32((float)shifts_chips[current_correlator_tap]); + aux2 = vsubq_f32(shifts_chips_reg, rem_code_phase_chips_reg); + indexn = n0; + for(n = 0; n < neon_iters; n++) + { + __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][4 * n + 3], 1, 0); + __VOLK_GNSSSDR_PREFETCH(&local_code_chip_index[4]); + aux = vmulq_f32(code_phase_step_chips_reg, indexn); + aux = vaddq_f32(aux, aux2); + + //floor + i = vcvtq_s32_f32(aux); + fi = vcvtq_f32_s32(i); + igx = vcgtq_f32(fi, aux); + j = vcvtq_f32_s32(vandq_s32(vreinterpretq_s32_u32(igx), ones)); + aux = vsubq_f32(fi, j); + + // fmod + c = vmulq_f32(aux, reciprocal); + i = vcvtq_s32_f32(c); + cTrunc = vcvtq_f32_s32(i); + base = vmulq_f32(cTrunc, code_length_chips_reg_f); + aux = vsubq_f32(aux, base); + local_code_chip_index_reg = vcvtq_s32_f32(aux); + + negatives = vreinterpretq_s32_u32(vcltq_s32(local_code_chip_index_reg, zeros)); + aux_i = vandq_s32(code_length_chips_reg_i, negatives); + local_code_chip_index_reg = vaddq_s32(local_code_chip_index_reg, aux_i); + + vst1q_s32((int32_t*)local_code_chip_index, local_code_chip_index_reg); + + for(k = 0; k < 4; ++k) + { + _result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]]; + } + indexn = vaddq_f32(indexn, fours); + } + for(n = neon_iters * 4; n < num_points; n++) + { + __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][n], 1, 0); + // resample code for current tap + local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - 
rem_code_phase_chips); + //Take into account that in multitap correlators, the shifts can be negative! + if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); + local_code_chip_index_ = local_code_chip_index_ % code_length_chips; + _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; + } + } +} + + +#endif + + +#endif /*INCLUDED_volk_gnsssdr_16i_xn_resampler_16i_xn_H*/ + diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn.h new file mode 100644 index 000000000..230401ccb --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn.h @@ -0,0 +1,1600 @@ +/*! + * \file volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn.h + * \brief VOLK_GNSSSDR kernel: multiplies N 16 bits vectors by a common vector + * phase rotated and accumulates the results in N 16 bits short complex outputs. + * \authors
<ul>
+ *          <li> Javier Arribas, 2015. jarribas(at)cttc.es
+ *          </ul>
+ *
+ * + * VOLK_GNSSSDR kernel that multiplies N 16 bits vectors by a common vector, which is + * phase-rotated by phase offset and phase increment, and accumulates the results + * in N 16 bits short complex outputs. + * It is optimized to perform the N tap correlation process in GNSS receivers. + * + * ------------------------------------------------------------------------- + * + * Copyright (C) 2010-2015 (see AUTHORS file for a list of contributors) + * + * GNSS-SDR is a software defined Global Navigation + * Satellite Systems receiver + * + * This file is part of GNSS-SDR. + * + * GNSS-SDR is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * GNSS-SDR is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNSS-SDR. If not, see . + * + * ------------------------------------------------------------------------- + */ + +/*! + * \page volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn + * + * \b Overview + * + * Rotates and multiplies the reference complex vector with an arbitrary number of other complex vectors, + * accumulates the results and stores them in the output vector. + * The rotation is done at a fixed rate per sample, from an initial \p phase offset. + * This function can be used for Doppler wipe-off and multiple correlator. + * + * Dispatcher Prototype + * \code + * void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn((lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const int16_t** in_a, int num_a_vectors, unsigned int num_points); + * \endcode + * + * \b Inputs + * \li in_common: Pointer to one of the vectors to be rotated, multiplied and accumulated (reference vector). + * \li phase_inc: Phase increment = lv_cmake(cos(phase_step_rad), sin(phase_step_rad)) + * \li phase: Initial phase = lv_cmake(cos(initial_phase_rad), sin(initial_phase_rad)) + * \li in_a: Pointer to an array of pointers to multiple vectors to be multiplied and accumulated. + * \li num_a_vectors: Number of vectors to be multiplied by the reference vector and accumulated. + * \li num_points: Number of complex values to be multiplied together, accumulated and stored into \p result. + * + * \b Outputs + * \li phase: Final phase. + * \li result: Vector of \p num_a_vectors components with the multiple vectors of \p in_a rotated, multiplied by \p in_common and accumulated. 
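+ *
+ * A minimal usage sketch for a three-tap (Early/Prompt/Late) correlation; names such as
+ * \p baseband_samples and \p early_code below are illustrative placeholders, not part of the API:
+ * \code
+ * lv_16sc_t corr_out[3];
+ * lv_32fc_t phase = lv_cmake(1.0f, 0.0f);
+ * const lv_32fc_t phase_inc = lv_cmake(cosf(phase_step_rad), sinf(phase_step_rad));
+ * const int16_t* replicas[3] = { early_code, prompt_code, late_code };
+ * volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn(corr_out, baseband_samples, phase_inc, &phase, replicas, 3, num_samples);
+ * \endcode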
+ * + */ + +#ifndef INCLUDED_volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_H +#define INCLUDED_volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_H + + +#include +#include +#include +#include +#include +//#include + +#ifdef LV_HAVE_GENERIC + +static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_generic(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const int16_t** in_a, int num_a_vectors, unsigned int num_points) +{ + lv_16sc_t tmp16; + lv_32fc_t tmp32; + int n_vec; + unsigned int n; + for (n_vec = 0; n_vec < num_a_vectors; n_vec++) + { + result[n_vec] = lv_cmake(0,0); + } + for (n = 0; n < num_points; n++) + { + tmp16 = *in_common++; //if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); + tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase); + tmp16 = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32))); + + // Regenerate phase + if (n % 256 == 0) + { + //printf("Phase before regeneration %i: %f,%f Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase)); +#ifdef __cplusplus + (*phase) /= std::abs((*phase)); +#else + (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); +#endif + //printf("Phase after regeneration %i: %f,%f Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase)); + } + + (*phase) *= phase_inc; + for (n_vec = 0; n_vec < num_a_vectors; n_vec++) + { + lv_16sc_t tmp = tmp16 * in_a[n_vec][n]; + //lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(tmp16), lv_cimag(in_a[n_vec][n]))) , sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(tmp16), lv_creal(in_a[n_vec][n])))); + result[n_vec] = lv_cmake(sat_adds16i(lv_creal(result[n_vec]), lv_creal(tmp)), sat_adds16i(lv_cimag(result[n_vec]), lv_cimag(tmp))); + } + } +} + +#endif /*LV_HAVE_GENERIC*/ + + +#ifdef LV_HAVE_GENERIC + +static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_generic_reload(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const int16_t** in_a, int num_a_vectors, unsigned int num_points) +{ + lv_16sc_t tmp16; + lv_32fc_t tmp32; + int n_vec; + unsigned int n; + unsigned int j; + const unsigned int ROTATOR_RELOAD = 256; + for (n_vec = 0; n_vec < num_a_vectors; n_vec++) + { + result[n_vec] = lv_cmake(0,0); + } + + for (n = 0; n < num_points / ROTATOR_RELOAD; n++) + { + for (j = 0; j < ROTATOR_RELOAD; j++) + { + tmp16 = *in_common++; //if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); + tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase); + tmp16 = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32))); + (*phase) *= phase_inc; + for (n_vec = 0; n_vec < num_a_vectors; n_vec++) + { + lv_16sc_t tmp = tmp16 * in_a[n_vec][n * ROTATOR_RELOAD + j]; + //lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(tmp16), lv_cimag(in_a[n_vec][n]))) , sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(tmp16), lv_creal(in_a[n_vec][n])))); + result[n_vec] = lv_cmake(sat_adds16i(lv_creal(result[n_vec]), lv_creal(tmp)), sat_adds16i(lv_cimag(result[n_vec]), lv_cimag(tmp))); + } + } + /* Regenerate phase */ +#ifdef __cplusplus + (*phase) /= std::abs((*phase)); +#else + //(*phase) /= cabsf((*phase)); + (*phase) /= 
hypotf(lv_creal(*phase), lv_cimag(*phase)); +#endif + } + + for (j = 0; j < num_points % ROTATOR_RELOAD; j++) + { + tmp16 = *in_common++; //if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); + tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase); + tmp16 = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32))); + (*phase) *= phase_inc; + for (n_vec = 0; n_vec < num_a_vectors; n_vec++) + { + lv_16sc_t tmp = tmp16 * in_a[n_vec][ (num_points / ROTATOR_RELOAD) * ROTATOR_RELOAD + j ]; + //lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(tmp16), lv_cimag(in_a[n_vec][n]))) , sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(tmp16), lv_creal(in_a[n_vec][n])))); + result[n_vec] = lv_cmake(sat_adds16i(lv_creal(result[n_vec]), lv_creal(tmp)), sat_adds16i(lv_cimag(result[n_vec]), lv_cimag(tmp))); + } + } +} + +#endif /*LV_HAVE_GENERIC*/ + + +#ifdef LV_HAVE_SSE3 +#include + +static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const int16_t** in_a, int num_a_vectors, unsigned int num_points) +{ + lv_16sc_t dotProduct = lv_cmake(0,0); + + const unsigned int sse_iters = num_points / 4; + int n_vec; + int i; + unsigned int number; + unsigned int n; + const int16_t** _in_a = in_a; + const lv_16sc_t* _in_common = in_common; + lv_16sc_t* _out = result; + + __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; + + __m128i* cacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment()); + + for (n_vec = 0; n_vec < num_a_vectors; n_vec++) + { + cacc[n_vec] = _mm_setzero_si128(); + } + + __m128i a, b, c; + + // phase rotation registers + __m128 pa, pb, two_phase_acc_reg, two_phase_inc_reg; + __m128i pc1, pc2; + __VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_inc[2]; + two_phase_inc[0] = phase_inc * phase_inc; + two_phase_inc[1] = phase_inc * phase_inc; + two_phase_inc_reg = _mm_load_ps((float*) two_phase_inc); + __VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_acc[2]; + two_phase_acc[0] = (*phase); + two_phase_acc[1] = (*phase) * phase_inc; + two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc); + __m128 yl, yh, tmp1, tmp2, tmp3; + lv_16sc_t tmp16; + lv_32fc_t tmp32; + + for(number = 0; number < sse_iters; number++) + { + // Phase rotation on operand in_common starts here: + //printf("generic phase %i: %f,%f\n", n*4,lv_creal(*phase),lv_cimag(*phase)); + pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + //complex 32fc multiplication b=a*two_phase_acc_reg + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + pc1 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic + + //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = 
_mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + //next two samples + _in_common += 2; + pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + __VOLK_GNSSSDR_PREFETCH(_in_common + 8); + //complex 32fc multiplication b=a*two_phase_acc_reg + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + pc2 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic + + //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + // store four rotated in_common samples in the register b + b = _mm_packs_epi32(pc1, pc2);// convert from 32ic to 16ic + + //next two samples + _in_common += 2; + + for (n_vec = 0; n_vec < num_a_vectors; n_vec++) + { + a = _mm_loadl_epi64((__m128i*)&(_in_a[n_vec][number*4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg + + a = _mm_unpacklo_epi16( a, a ); + + c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... 
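+ // Note: _mm_unpacklo_epi16(a, a) above duplicated each real-valued 16-bit code chip into
+ // adjacent lanes, so this single multiply scales both the real and the imaginary part of the
+ // four rotated samples held in b by the corresponding real code value (no complex product is needed).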
+ + cacc[n_vec] = _mm_adds_epi16(cacc[n_vec], c); + } + // Regenerate phase + if ((number % 128) == 0) + { + tmp1 = _mm_mul_ps(two_phase_acc_reg, two_phase_acc_reg); + tmp2 = _mm_hadd_ps(tmp1, tmp1); + tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8); + tmp2 = _mm_sqrt_ps(tmp1); + two_phase_acc_reg = _mm_div_ps(two_phase_acc_reg, tmp2); + } + } + + for (n_vec = 0; n_vec < num_a_vectors; n_vec++) + { + + a = cacc[n_vec]; + _mm_store_si128((__m128i*)dotProductVector, a); // Store the results back into the dot product vector + dotProduct = lv_cmake(0,0); + for (i = 0; i < 4; ++i) + { + dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[i])), + sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i]))); + } + _out[n_vec] = dotProduct; + } + volk_gnsssdr_free(cacc); + + tmp1 = _mm_mul_ps(two_phase_acc_reg, two_phase_acc_reg); + tmp2 = _mm_hadd_ps(tmp1, tmp1); + tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8); + tmp2 = _mm_sqrt_ps(tmp1); + two_phase_acc_reg = _mm_div_ps(two_phase_acc_reg, tmp2); + + _mm_store_ps((float*)two_phase_acc, two_phase_acc_reg); + //(*phase) = lv_cmake((float*)two_phase_acc[0], (float*)two_phase_acc[1]); + (*phase) = two_phase_acc[0]; + + for(n = sse_iters * 4; n < num_points; n++) + { + tmp16 = in_common[n]; //printf("a_sse phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); + tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase); + tmp16 = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32))); + (*phase) *= phase_inc; + + for (n_vec = 0; n_vec < num_a_vectors; n_vec++) + { + lv_16sc_t tmp = tmp16 * in_a[n_vec][n]; + //lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(tmp16), lv_cimag(in_a[n_vec][n]))) , sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(tmp16), lv_creal(in_a[n_vec][n])))); + _out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), + sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); + } + } +} +#endif /* LV_HAVE_SSE3 */ + + +//#ifdef LV_HAVE_SSE3 +//#include + +//static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_sse3_reload(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const int16_t** in_a, int num_a_vectors, unsigned int num_points) +//{ + //lv_16sc_t dotProduct = lv_cmake(0,0); + + //const unsigned int sse_iters = num_points / 4; + //const unsigned int ROTATOR_RELOAD = 128; + //int n_vec; + //int i; + //unsigned int number; + //unsigned int j; + //unsigned int n; + + //const int16_t** _in_a = in_a; + //const lv_16sc_t* _in_common = in_common; + //lv_16sc_t* _out = result; + + //__VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; + + //__m128i* realcacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment()); + //__m128i* imagcacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment()); + + //for (n_vec = 0; n_vec < num_a_vectors; n_vec++) + //{ + //realcacc[n_vec] = _mm_setzero_si128(); + //imagcacc[n_vec] = _mm_setzero_si128(); + //} + + //__m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl; + + //mask_imag = _mm_set_epi8(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0); + //mask_real = _mm_set_epi8(0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255); + + //// phase rotation registers + //__m128 pa, pb, two_phase_acc_reg, two_phase_inc_reg; + //__m128i pc1, pc2; 
+ //__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_inc[2]; + //two_phase_inc[0] = phase_inc * phase_inc; + //two_phase_inc[1] = phase_inc * phase_inc; + //two_phase_inc_reg = _mm_load_ps((float*) two_phase_inc); + //__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_acc[2]; + //two_phase_acc[0] = (*phase); + //two_phase_acc[1] = (*phase) * phase_inc; + //two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc); + //__m128 yl, yh, tmp1, tmp2, tmp3; + //lv_16sc_t tmp16; + //lv_32fc_t tmp32; + + //for (number = 0; number < sse_iters / ROTATOR_RELOAD; ++number) + //{ + //for (j = 0; j < ROTATOR_RELOAD; j++) + //{ + //// Phase rotation on operand in_common starts here: + ////printf("generic phase %i: %f,%f\n", n*4,lv_creal(*phase),lv_cimag(*phase)); + //pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + ////complex 32fc multiplication b=a*two_phase_acc_reg + //yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + //yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + //tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + //pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br + //tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + //pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + //pc1 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic + + ////complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg + //yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + //yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + //tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + //tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + //tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + //two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + ////next two samples + //_in_common += 2; + //pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + //__VOLK_GNSSSDR_PREFETCH(_in_common + 8); + ////complex 32fc multiplication b=a*two_phase_acc_reg + //yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + //yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + //tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + //pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br + //tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + //pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + //pc2 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic + + ////complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg + //yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + //yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + //tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + //tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + //tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + //two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, 
bi*dr+br*di + + //// store four rotated in_common samples in the register b + //b = _mm_packs_epi32(pc1, pc2);// convert from 32ic to 16ic + + ////next two samples + //_in_common += 2; + + //for (n_vec = 0; n_vec < num_a_vectors; n_vec++) + //{ + //a = _mm_load_si128((__m128i*)&(_in_a[n_vec][(number * ROTATOR_RELOAD + j) * 4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg + + //c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... + + //c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. + //real = _mm_subs_epi16(c, c_sr); + + //b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i .... + //a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i .... + + //imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... + //imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... + + //imag = _mm_adds_epi16(imag1, imag2); + + //realcacc[n_vec] = _mm_adds_epi16(realcacc[n_vec], real); + //imagcacc[n_vec] = _mm_adds_epi16(imagcacc[n_vec], imag); + //} + //} + //// regenerate phase + //tmp1 = _mm_mul_ps(two_phase_acc_reg, two_phase_acc_reg); + //tmp2 = _mm_hadd_ps(tmp1, tmp1); + //tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8); + //tmp2 = _mm_sqrt_ps(tmp1); + //two_phase_acc_reg = _mm_div_ps(two_phase_acc_reg, tmp2); + //} + + //for (j = 0; j < sse_iters % ROTATOR_RELOAD; j++) + //{ + //pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + ////complex 32fc multiplication b=a*two_phase_acc_reg + //yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + //yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + //tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + //pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br + //tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + //pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + //pc1 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic + + ////complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg + //yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + //yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + //tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + //tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + //tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + //two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + ////next two samples + //_in_common += 2; + //pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + //__VOLK_GNSSSDR_PREFETCH(_in_common + 8); + ////complex 32fc multiplication b=a*two_phase_acc_reg + //yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + //yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + //tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + //pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br + //tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + //pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + //pc2 = _mm_cvtps_epi32(pb); // convert from 32fc to 
32ic + + ////complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg + //yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + //yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + //tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + //tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + //tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + //two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + //// store four rotated in_common samples in the register b + //b = _mm_packs_epi32(pc1, pc2);// convert from 32ic to 16ic + + ////next two samples + //_in_common += 2; + + //for (n_vec = 0; n_vec < num_a_vectors; n_vec++) + //{ + //a = _mm_load_si128((__m128i*)&(_in_a[n_vec][((sse_iters / ROTATOR_RELOAD) * ROTATOR_RELOAD + j) * 4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg + + //c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... + + //c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. + //real = _mm_subs_epi16(c, c_sr); + + //b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i .... + //a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i .... + + //imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... + //imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... + + //imag = _mm_adds_epi16(imag1, imag2); + + //realcacc[n_vec] = _mm_adds_epi16(realcacc[n_vec], real); + //imagcacc[n_vec] = _mm_adds_epi16(imagcacc[n_vec], imag); + //} + //} + + //for (n_vec = 0; n_vec < num_a_vectors; n_vec++) + //{ + //realcacc[n_vec] = _mm_and_si128(realcacc[n_vec], mask_real); + //imagcacc[n_vec] = _mm_and_si128(imagcacc[n_vec], mask_imag); + + //a = _mm_or_si128(realcacc[n_vec], imagcacc[n_vec]); + + //_mm_store_si128((__m128i*)dotProductVector, a); // Store the results back into the dot product vector + //dotProduct = lv_cmake(0,0); + //for (i = 0; i < 4; ++i) + //{ + //dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[i])), + //sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i]))); + //} + //_out[n_vec] = dotProduct; + //} + + //volk_gnsssdr_free(realcacc); + //volk_gnsssdr_free(imagcacc); + + //tmp1 = _mm_mul_ps(two_phase_acc_reg, two_phase_acc_reg); + //tmp2 = _mm_hadd_ps(tmp1, tmp1); + //tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8); + //tmp2 = _mm_sqrt_ps(tmp1); + //two_phase_acc_reg = _mm_div_ps(two_phase_acc_reg, tmp2); + + //_mm_store_ps((float*)two_phase_acc, two_phase_acc_reg); + ////(*phase) = lv_cmake((float*)two_phase_acc[0], (float*)two_phase_acc[1]); + //(*phase) = two_phase_acc[0]; + + //for(n = sse_iters * 4; n < num_points; n++) + //{ + //tmp16 = in_common[n]; //printf("a_sse phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); + //tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase); + //tmp16 = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32))); + //(*phase) *= phase_inc; + + //for (n_vec = 0; n_vec < num_a_vectors; n_vec++) + //{ + //lv_16sc_t tmp = tmp16 * in_a[n_vec][n]; + ////lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(tmp16), lv_cimag(in_a[n_vec][n]))) , sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(tmp16), lv_creal(in_a[n_vec][n])))); + //_out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), + 
//sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); + //} + //} + +//} +//#endif [> LV_HAVE_SSE3 <] + + +#ifdef LV_HAVE_SSE3 +#include + +static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const int16_t** in_a, int num_a_vectors, unsigned int num_points) +{ + lv_16sc_t dotProduct = lv_cmake(0,0); + + const unsigned int sse_iters = num_points / 4; + int n_vec; + int i; + unsigned int number; + unsigned int n; + const int16_t** _in_a = in_a; + const lv_16sc_t* _in_common = in_common; + lv_16sc_t* _out = result; + + __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; + + __m128i* cacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment()); + + for (n_vec = 0; n_vec < num_a_vectors; n_vec++) + { + cacc[n_vec] = _mm_setzero_si128(); + } + + __m128i a, b, c; + + // phase rotation registers + __m128 pa, pb, two_phase_acc_reg, two_phase_inc_reg; + __m128i pc1, pc2; + __VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_inc[2]; + two_phase_inc[0] = phase_inc * phase_inc; + two_phase_inc[1] = phase_inc * phase_inc; + two_phase_inc_reg = _mm_load_ps((float*) two_phase_inc); + __VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_acc[2]; + two_phase_acc[0] = (*phase); + two_phase_acc[1] = (*phase) * phase_inc; + two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc); + __m128 yl, yh, tmp1, tmp2, tmp3; + lv_16sc_t tmp16; + lv_32fc_t tmp32; + + for(number = 0; number < sse_iters; number++) + { + // Phase rotation on operand in_common starts here: + //printf("generic phase %i: %f,%f\n", n*4,lv_creal(*phase),lv_cimag(*phase)); + pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + //complex 32fc multiplication b=a*two_phase_acc_reg + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + pc1 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic + + //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + //next two samples + _in_common += 2; + pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + __VOLK_GNSSSDR_PREFETCH(_in_common + 8); + //complex 32fc multiplication b=a*two_phase_acc_reg + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(pa, yl); // tmp1 = 
ar*cr,ai*cr,br*dr,bi*dr + pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + pc2 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic + + //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + // store four rotated in_common samples in the register b + b = _mm_packs_epi32(pc1, pc2);// convert from 32ic to 16ic + + //next two samples + _in_common += 2; + + for (n_vec = 0; n_vec < num_a_vectors; n_vec++) + { + a = _mm_loadl_epi64((__m128i*)&(_in_a[n_vec][number*4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg + + a = _mm_unpacklo_epi16( a, a ); + + c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... + + cacc[n_vec] = _mm_adds_epi16(cacc[n_vec], c); + } + // Regenerate phase + if ((number % 128) == 0) + { + tmp1 = _mm_mul_ps(two_phase_acc_reg, two_phase_acc_reg); + tmp2 = _mm_hadd_ps(tmp1, tmp1); + tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8); + tmp2 = _mm_sqrt_ps(tmp1); + two_phase_acc_reg = _mm_div_ps(two_phase_acc_reg, tmp2); + } + } + + for (n_vec = 0; n_vec < num_a_vectors; n_vec++) + { + + a = cacc[n_vec]; + _mm_store_si128((__m128i*)dotProductVector, a); // Store the results back into the dot product vector + dotProduct = lv_cmake(0,0); + for (i = 0; i < 4; ++i) + { + dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[i])), + sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i]))); + } + _out[n_vec] = dotProduct; + } + volk_gnsssdr_free(cacc); + + tmp1 = _mm_mul_ps(two_phase_acc_reg, two_phase_acc_reg); + tmp2 = _mm_hadd_ps(tmp1, tmp1); + tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8); + tmp2 = _mm_sqrt_ps(tmp1); + two_phase_acc_reg = _mm_div_ps(two_phase_acc_reg, tmp2); + + _mm_store_ps((float*)two_phase_acc, two_phase_acc_reg); + //(*phase) = lv_cmake((float*)two_phase_acc[0], (float*)two_phase_acc[1]); + (*phase) = two_phase_acc[0]; + + for(n = sse_iters * 4; n < num_points; n++) + { + tmp16 = in_common[n]; //printf("a_sse phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); + tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase); + tmp16 = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32))); + (*phase) *= phase_inc; + + for (n_vec = 0; n_vec < num_a_vectors; n_vec++) + { + lv_16sc_t tmp = tmp16 * in_a[n_vec][n]; + //lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(tmp16), lv_cimag(in_a[n_vec][n]))) , sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(tmp16), lv_creal(in_a[n_vec][n])))); + _out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), + sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); + } + } +} +#endif /* LV_HAVE_SSE3 */ + + +#ifdef LV_HAVE_AVX2 +#include +#include +#include + +static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc_t* result, 
const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const int16_t** in_a, int num_a_vectors, unsigned int num_points) +{ + const unsigned int avx2_iters = num_points / 8; + const int16_t** _in_a = in_a; + const lv_16sc_t* _in_common = in_common; + lv_16sc_t* _out = result; + int n_vec; + unsigned int number; + unsigned int n; + + lv_16sc_t tmp16; + lv_32fc_t tmp32; + + __VOLK_ATTR_ALIGNED(32) lv_16sc_t dotProductVector[8]; + lv_16sc_t dotProduct = lv_cmake(0,0); + + __m256i* cacc = (__m256i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256i), volk_gnsssdr_get_alignment()); + + for (n_vec = 0; n_vec < num_a_vectors; n_vec++) + { + cacc[n_vec] = _mm256_setzero_si256(); + } + + __m128i a128, ain_128, ain_128_lo, ain_128_hi; + __m256i ai; + __m256 a, b; + + __m256 four_phase_acc_reg, four_phase_inc_reg; + + lv_32fc_t _phase_inc = phase_inc*phase_inc*phase_inc*phase_inc; + + // Normalise the 4*phase increment +#ifdef __cplusplus + _phase_inc /= std::abs(_phase_inc); +#else + _phase_inc /= hypotf(lv_creal(_phase_inc), lv_cimag(_phase_inc)); +#endif + + __VOLK_ATTR_ALIGNED(32) lv_32fc_t four_phase_inc[4]; + __VOLK_ATTR_ALIGNED(32) lv_32fc_t four_phase_acc[4]; + for( n = 0; n < 4; ++n ) + { + four_phase_inc[n] = _phase_inc; + four_phase_acc[n] = *phase; + *phase *= phase_inc; + } + four_phase_acc_reg = _mm256_load_ps((float*) four_phase_acc); + four_phase_inc_reg = _mm256_load_ps((float*) four_phase_inc); + + __m256i a2, b2, c, c1, c2, perm_idx; + + perm_idx = _mm256_set_epi32( 7, 6, 3, 2, 5, 4, 1, 0); + //perm_idx = _mm256_set_epi32( 0, 1, 4, 5, 2, 3, 6, 7); + + for(number = 0; number < avx2_iters; number++) + { + a128 = _mm_load_si128( (__m128i *)_in_common ); + ai = _mm256_cvtepi16_epi32( a128 ); + a = _mm256_cvtepi32_ps( ai ); + + //complex 32fc multiplication b=a*two_phase_acc_reg + b = _mm256_complexmul_ps( a, four_phase_acc_reg ); + c1 = _mm256_cvtps_epi32(b); // convert from 32fc to 32ic + + //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg + four_phase_acc_reg = _mm256_complexmul_ps( four_phase_inc_reg, four_phase_acc_reg ); + + //next four samples + _in_common += 4; + a128 = _mm_load_si128( (__m128i *)_in_common ); + ai = _mm256_cvtepi16_epi32( a128 ); + a = _mm256_cvtepi32_ps( ai ); + + //complex 32fc multiplication b=a*two_phase_acc_reg + b = _mm256_complexmul_ps( a, four_phase_acc_reg ); + c2 = _mm256_cvtps_epi32(b); // convert from 32fc to 32ic + + //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg + four_phase_acc_reg = _mm256_complexmul_ps( four_phase_inc_reg, four_phase_acc_reg ); + + __VOLK_GNSSSDR_PREFETCH(_in_common + 16); + + + // Store and convert 32ic to 16ic: + b2 = _mm256_packs_epi32( c1, c2 ); + + b2 = _mm256_permutevar8x32_epi32( b2, perm_idx ); + + + _in_common += 4; + for (n_vec = 0; n_vec < num_a_vectors; n_vec++) + { + ain_128 = _mm_load_si128((__m128i*)&(_in_a[n_vec][number * 8])); + + ain_128_lo = _mm_unpacklo_epi16( ain_128, ain_128 ); + ain_128_hi = _mm_unpackhi_epi16( ain_128, ain_128 ); + + a2 = _mm256_insertf128_si256( _mm256_castsi128_si256(ain_128_lo), ain_128_hi, 1); + + c = _mm256_mullo_epi16(a2, b2); + + cacc[n_vec] = _mm256_adds_epi16(cacc[n_vec], c); + } + // Regenerate phase + if ((number % 128) == 0) + { + four_phase_acc_reg = _mm256_complexnormalise_ps(four_phase_acc_reg); + } + } + + for (n_vec = 0; n_vec < num_a_vectors; n_vec++) + { + a2 = cacc[n_vec]; + + _mm256_store_si256((__m256i*)dotProductVector, a2); // Store the results back into the dot product vector 
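+ // Reduce the eight 16-bit complex partial sums with saturating adds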
+ dotProduct = lv_cmake(0,0); + for (number = 0; number < 8; ++number) + { + dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])), + sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number]))); + } + _out[n_vec] = dotProduct; + } + + volk_gnsssdr_free(cacc); + _mm256_zeroupper(); + + _mm256_store_ps((float*)four_phase_acc, four_phase_acc_reg); + (*phase) = four_phase_acc[0]; + + for(n = avx2_iters * 8; n < num_points; n++) + { + tmp16 = in_common[n]; + tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase); + tmp16 = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32))); + (*phase) *= phase_inc; + for (n_vec = 0; n_vec < num_a_vectors; n_vec++) + { + lv_16sc_t tmp = tmp16 * in_a[n_vec][n]; + _out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), + sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); + } + } + +} +#endif /* LV_HAVE_AVX2 */ + +#ifdef LV_HAVE_AVX2 +#include +#include +#include + +static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_u_avx2(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const int16_t** in_a, int num_a_vectors, unsigned int num_points) +{ + const unsigned int avx2_iters = num_points / 8; + const int16_t** _in_a = in_a; + const lv_16sc_t* _in_common = in_common; + lv_16sc_t* _out = result; + int n_vec; + unsigned int number; + unsigned int n; + + lv_16sc_t tmp16; + lv_32fc_t tmp32; + + __VOLK_ATTR_ALIGNED(32) lv_16sc_t dotProductVector[8]; + lv_16sc_t dotProduct = lv_cmake(0,0); + + __m256i* cacc = (__m256i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256i), volk_gnsssdr_get_alignment()); + + for (n_vec = 0; n_vec < num_a_vectors; n_vec++) + { + cacc[n_vec] = _mm256_setzero_si256(); + } + + __m128i a128, ain_128, ain_128_lo, ain_128_hi; + __m256i ai; + __m256 a, b; + + __m256 four_phase_acc_reg, four_phase_inc_reg; + + lv_32fc_t _phase_inc = phase_inc*phase_inc*phase_inc*phase_inc; + + // Normalise the 4*phase increment +#ifdef __cplusplus + _phase_inc /= std::abs(_phase_inc); +#else + _phase_inc /= hypotf(lv_creal(_phase_inc), lv_cimag(_phase_inc)); +#endif + + __VOLK_ATTR_ALIGNED(32) lv_32fc_t four_phase_inc[4]; + __VOLK_ATTR_ALIGNED(32) lv_32fc_t four_phase_acc[4]; + for( n = 0; n < 4; ++n ) + { + four_phase_inc[n] = _phase_inc; + four_phase_acc[n] = *phase; + *phase *= phase_inc; + } + four_phase_acc_reg = _mm256_load_ps((float*) four_phase_acc); + four_phase_inc_reg = _mm256_load_ps((float*) four_phase_inc); + + __m256i a2, b2, c, c1, c2, perm_idx; + + perm_idx = _mm256_set_epi32( 7, 6, 3, 2, 5, 4, 1, 0); + //perm_idx = _mm256_set_epi32( 0, 1, 4, 5, 2, 3, 6, 7); + + for(number = 0; number < avx2_iters; number++) + { + a128 = _mm_loadu_si128( (__m128i *)_in_common ); + ai = _mm256_cvtepi16_epi32( a128 ); + a = _mm256_cvtepi32_ps( ai ); + + //complex 32fc multiplication b=a*two_phase_acc_reg + b = _mm256_complexmul_ps( a, four_phase_acc_reg ); + c1 = _mm256_cvtps_epi32(b); // convert from 32fc to 32ic + + //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg + four_phase_acc_reg = _mm256_complexmul_ps( four_phase_inc_reg, four_phase_acc_reg ); + + //next four samples + _in_common += 4; + a128 = _mm_loadu_si128( (__m128i *)_in_common ); + ai = _mm256_cvtepi16_epi32( a128 ); + a = _mm256_cvtepi32_ps( ai ); + + //complex 32fc multiplication b=a*two_phase_acc_reg + b = _mm256_complexmul_ps( a, four_phase_acc_reg ); + c2 = _mm256_cvtps_epi32(b); // convert from 32fc to 
32ic + + //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg + four_phase_acc_reg = _mm256_complexmul_ps( four_phase_inc_reg, four_phase_acc_reg ); + + __VOLK_GNSSSDR_PREFETCH(_in_common + 16); + + + // Store and convert 32ic to 16ic: + b2 = _mm256_packs_epi32( c1, c2 ); + + b2 = _mm256_permutevar8x32_epi32( b2, perm_idx ); + + + _in_common += 4; + for (n_vec = 0; n_vec < num_a_vectors; n_vec++) + { + ain_128 = _mm_loadu_si128((__m128i*)&(_in_a[n_vec][number * 8])); + + ain_128_lo = _mm_unpacklo_epi16( ain_128, ain_128 ); + ain_128_hi = _mm_unpackhi_epi16( ain_128, ain_128 ); + + a2 = _mm256_insertf128_si256( _mm256_castsi128_si256(ain_128_lo), ain_128_hi, 1); + + c = _mm256_mullo_epi16(a2, b2); + + cacc[n_vec] = _mm256_adds_epi16(cacc[n_vec], c); + } + // Regenerate phase + if ((number % 128) == 0) + { + four_phase_acc_reg = _mm256_complexnormalise_ps(four_phase_acc_reg); + } + } + + for (n_vec = 0; n_vec < num_a_vectors; n_vec++) + { + a2 = cacc[n_vec]; + + _mm256_store_si256((__m256i*)dotProductVector, a2); // Store the results back into the dot product vector + dotProduct = lv_cmake(0,0); + for (number = 0; number < 8; ++number) + { + dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])), + sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number]))); + } + _out[n_vec] = dotProduct; + } + + volk_gnsssdr_free(cacc); + _mm256_zeroupper(); + + _mm256_store_ps((float*)four_phase_acc, four_phase_acc_reg); + (*phase) = four_phase_acc[0]; + + for(n = avx2_iters * 8; n < num_points; n++) + { + tmp16 = in_common[n]; + tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase); + tmp16 = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32))); + (*phase) *= phase_inc; + for (n_vec = 0; n_vec < num_a_vectors; n_vec++) + { + lv_16sc_t tmp = tmp16 * in_a[n_vec][n]; + _out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), + sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); + } + } + +} +#endif /* LV_HAVE_AVX2 */ + +//#ifdef LV_HAVE_NEON +//#include + +//static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_neon(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const int16_t** in_a, int num_a_vectors, unsigned int num_points) +//{ + //const unsigned int neon_iters = num_points / 4; + + //const int16_t** _in_a = in_a; + //const lv_16sc_t* _in_common = in_common; + //lv_16sc_t* _out = result; + //int n_vec; + //int i; + //unsigned int number; + //unsigned int n; + //lv_16sc_t tmp16_, tmp; + //lv_32fc_t tmp32_; + + //if (neon_iters > 0) + //{ + //lv_16sc_t dotProduct = lv_cmake(0,0); + //float arg_phase0 = cargf(*phase); + //float arg_phase_inc = cargf(phase_inc); + //float phase_est; + + //lv_32fc_t ___phase4 = phase_inc * phase_inc * phase_inc * phase_inc; + //__VOLK_ATTR_ALIGNED(16) float32_t __phase4_real[4] = { lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4) }; + //__VOLK_ATTR_ALIGNED(16) float32_t __phase4_imag[4] = { lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4) }; + + //float32x4_t _phase4_real = vld1q_f32(__phase4_real); + //float32x4_t _phase4_imag = vld1q_f32(__phase4_imag); + + //lv_32fc_t phase2 = (lv_32fc_t)(*phase) * phase_inc; + //lv_32fc_t phase3 = phase2 * phase_inc; + //lv_32fc_t phase4 = phase3 * phase_inc; + + //__VOLK_ATTR_ALIGNED(16) float32_t __phase_real[4] = { lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), 
lv_creal(phase4) }; + //__VOLK_ATTR_ALIGNED(16) float32_t __phase_imag[4] = { lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) }; + + //float32x4_t _phase_real = vld1q_f32(__phase_real); + //float32x4_t _phase_imag = vld1q_f32(__phase_imag); + + //int16x4x2_t a_val, b_val, c_val; + //__VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; + //float32x4_t half = vdupq_n_f32(0.5f); + //int16x4x2_t tmp16; + //int32x4x2_t tmp32i; + + //float32x4x2_t tmp32f, tmp32_real, tmp32_imag; + //float32x4_t sign, PlusHalf, Round; + + //int16x4x2_t* accumulator = (int16x4x2_t*)volk_gnsssdr_malloc(num_a_vectors * sizeof(int16x4x2_t), volk_gnsssdr_get_alignment()); + + //for(n_vec = 0; n_vec < num_a_vectors; n_vec++) + //{ + //accumulator[n_vec].val[0] = vdup_n_s16(0); + //accumulator[n_vec].val[1] = vdup_n_s16(0); + //} + + //for(number = 0; number < neon_iters; number++) + //{ + //[> load 4 complex numbers (int 16 bits each component) <] + //tmp16 = vld2_s16((int16_t*)_in_common); + //__VOLK_GNSSSDR_PREFETCH(_in_common + 8); + //_in_common += 4; + + //[> promote them to int 32 bits <] + //tmp32i.val[0] = vmovl_s16(tmp16.val[0]); + //tmp32i.val[1] = vmovl_s16(tmp16.val[1]); + + //[> promote them to float 32 bits <] + //tmp32f.val[0] = vcvtq_f32_s32(tmp32i.val[0]); + //tmp32f.val[1] = vcvtq_f32_s32(tmp32i.val[1]); + + //[> complex multiplication of four complex samples (float 32 bits each component) <] + //tmp32_real.val[0] = vmulq_f32(tmp32f.val[0], _phase_real); + //tmp32_real.val[1] = vmulq_f32(tmp32f.val[1], _phase_imag); + //tmp32_imag.val[0] = vmulq_f32(tmp32f.val[0], _phase_imag); + //tmp32_imag.val[1] = vmulq_f32(tmp32f.val[1], _phase_real); + + //tmp32f.val[0] = vsubq_f32(tmp32_real.val[0], tmp32_real.val[1]); + //tmp32f.val[1] = vaddq_f32(tmp32_imag.val[0], tmp32_imag.val[1]); + + //[> downcast results to int32 <] + //[> in __aarch64__ we can do that with vcvtaq_s32_f32(ret1); vcvtaq_s32_f32(ret2); <] + //sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(tmp32f.val[0]), 31))); + //PlusHalf = vaddq_f32(tmp32f.val[0], half); + //Round = vsubq_f32(PlusHalf, sign); + //tmp32i.val[0] = vcvtq_s32_f32(Round); + + //sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(tmp32f.val[1]), 31))); + //PlusHalf = vaddq_f32(tmp32f.val[1], half); + //Round = vsubq_f32(PlusHalf, sign); + //tmp32i.val[1] = vcvtq_s32_f32(Round); + + //[> downcast results to int16 <] + //tmp16.val[0] = vqmovn_s32(tmp32i.val[0]); + //tmp16.val[1] = vqmovn_s32(tmp32i.val[1]); + + //[> compute next four phases <] + //tmp32_real.val[0] = vmulq_f32(_phase_real, _phase4_real); + //tmp32_real.val[1] = vmulq_f32(_phase_imag, _phase4_imag); + //tmp32_imag.val[0] = vmulq_f32(_phase_real, _phase4_imag); + //tmp32_imag.val[1] = vmulq_f32(_phase_imag, _phase4_real); + + //_phase_real = vsubq_f32(tmp32_real.val[0], tmp32_real.val[1]); + //_phase_imag = vaddq_f32(tmp32_imag.val[0], tmp32_imag.val[1]); + + //for (n_vec = 0; n_vec < num_a_vectors; n_vec++) + //{ + //a_val = vld2_s16((int16_t*)&(_in_a[n_vec][number*4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg + ////__VOLK_GNSSSDR_PREFETCH(&_in_a[n_vec][number*4] + 8); + + //// multiply the real*real and imag*imag to get real result + //// a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r + //b_val.val[0] = vmul_s16(a_val.val[0], tmp16.val[0]); + //// a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i + //b_val.val[1] = vmul_s16(a_val.val[1], tmp16.val[1]); + //c_val.val[0] = vqsub_s16(b_val.val[0], b_val.val[1]); + + //// Multiply cross terms to get the imaginary result + //// 
a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i + //b_val.val[0] = vmul_s16(a_val.val[0], tmp16.val[1]); + //// a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r + //b_val.val[1] = vmul_s16(a_val.val[1], tmp16.val[0]); + //c_val.val[1] = vqadd_s16(b_val.val[0], b_val.val[1]); + + //accumulator[n_vec].val[0] = vqadd_s16(accumulator[n_vec].val[0], c_val.val[0]); + //accumulator[n_vec].val[1] = vqadd_s16(accumulator[n_vec].val[1], c_val.val[1]); + //} + //// Regenerate phase + //if ((number % 256) == 0) + //{ + //phase_est = arg_phase0 + (number + 1) * 4 * arg_phase_inc; + + //*phase = lv_cmake(cos(phase_est), sin(phase_est)); + //phase2 = (lv_32fc_t)(*phase) * phase_inc; + //phase3 = phase2 * phase_inc; + //phase4 = phase3 * phase_inc; + + //__VOLK_ATTR_ALIGNED(16) float32_t ____phase_real[4] = { lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) }; + //__VOLK_ATTR_ALIGNED(16) float32_t ____phase_imag[4] = { lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) }; + + //_phase_real = vld1q_f32(____phase_real); + //_phase_imag = vld1q_f32(____phase_imag); + //} + //} + + //for (n_vec = 0; n_vec < num_a_vectors; n_vec++) + //{ + //vst2_s16((int16_t*)dotProductVector, accumulator[n_vec]); // Store the results back into the dot product vector + //dotProduct = lv_cmake(0,0); + //for (i = 0; i < 4; ++i) + //{ + //dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[i])), + //sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i]))); + //} + //_out[n_vec] = dotProduct; + //} + //volk_gnsssdr_free(accumulator); + //vst1q_f32((float32_t*)__phase_real, _phase_real); + //vst1q_f32((float32_t*)__phase_imag, _phase_imag); + + //(*phase) = lv_cmake((float32_t)__phase_real[0], (float32_t)__phase_imag[0]); + //} + + //for (n = neon_iters * 4; n < num_points; n++) + //{ + //tmp16_ = in_common[n]; //printf("neon phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); + //tmp32_ = lv_cmake((float32_t)lv_creal(tmp16_), (float32_t)lv_cimag(tmp16_)) * (*phase); + //tmp16_ = lv_cmake((int16_t)rintf(lv_creal(tmp32_)), (int16_t)rintf(lv_cimag(tmp32_))); + //(*phase) *= phase_inc; + //for (n_vec = 0; n_vec < num_a_vectors; n_vec++) + //{ + //tmp = tmp16_ * in_a[n_vec][n]; + //_out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); + //} + //} +//} + +//#endif [> LV_HAVE_NEON <] + + +//#ifdef LV_HAVE_NEON +//#include +//#include + +//static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_neon_vma(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const int16_t** in_a, int num_a_vectors, unsigned int num_points) +//{ + //const unsigned int neon_iters = num_points / 4; + + //const int16_t** _in_a = in_a; + //const lv_16sc_t* _in_common = in_common; + //lv_16sc_t* _out = result; + //int n_vec; + //int i; + //unsigned int number; + //unsigned int n; + //lv_16sc_t tmp16_, tmp; + //lv_32fc_t tmp32_; + + //if (neon_iters > 0) + //{ + //lv_16sc_t dotProduct = lv_cmake(0,0); + //float arg_phase0 = cargf(*phase); + //float arg_phase_inc = cargf(phase_inc); + //float phase_est; + ////printf("arg phase0: %f", arg_phase0); + //lv_32fc_t ___phase4 = phase_inc * phase_inc * phase_inc * phase_inc; + //__VOLK_ATTR_ALIGNED(16) float32_t __phase4_real[4] = { lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4) }; + //__VOLK_ATTR_ALIGNED(16) float32_t __phase4_imag[4] = { lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4), 
lv_cimag(___phase4) }; + + //float32x4_t _phase4_real = vld1q_f32(__phase4_real); + //float32x4_t _phase4_imag = vld1q_f32(__phase4_imag); + + //lv_32fc_t phase2 = (lv_32fc_t)(*phase) * phase_inc; + //lv_32fc_t phase3 = phase2 * phase_inc; + //lv_32fc_t phase4 = phase3 * phase_inc; + + //__VOLK_ATTR_ALIGNED(16) float32_t __phase_real[4] = { lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) }; + //__VOLK_ATTR_ALIGNED(16) float32_t __phase_imag[4] = { lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) }; + + //float32x4_t _phase_real = vld1q_f32(__phase_real); + //float32x4_t _phase_imag = vld1q_f32(__phase_imag); + + //int16x4x2_t a_val, b_val; + //__VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; + //float32x4_t half = vdupq_n_f32(0.5f); + //int16x4x2_t tmp16; + //int32x4x2_t tmp32i; + + //float32x4x2_t tmp32f, tmp32_real, tmp32_imag; + //float32x4_t sign, PlusHalf, Round; + + //int16x4x2_t* accumulator = (int16x4x2_t*)volk_gnsssdr_malloc(num_a_vectors * sizeof(int16x4x2_t), volk_gnsssdr_get_alignment()); + + //for(n_vec = 0; n_vec < num_a_vectors; n_vec++) + //{ + //accumulator[n_vec].val[0] = vdup_n_s16(0); + //accumulator[n_vec].val[1] = vdup_n_s16(0); + //} + + //for(number = 0; number < neon_iters; number++) + //{ + //[> load 4 complex numbers (int 16 bits each component) <] + //tmp16 = vld2_s16((int16_t*)_in_common); + //__VOLK_GNSSSDR_PREFETCH(_in_common + 8); + //_in_common += 4; + + //[> promote them to int 32 bits <] + //tmp32i.val[0] = vmovl_s16(tmp16.val[0]); + //tmp32i.val[1] = vmovl_s16(tmp16.val[1]); + + //[> promote them to float 32 bits <] + //tmp32f.val[0] = vcvtq_f32_s32(tmp32i.val[0]); + //tmp32f.val[1] = vcvtq_f32_s32(tmp32i.val[1]); + + //[> complex multiplication of four complex samples (float 32 bits each component) <] + //tmp32_real.val[0] = vmulq_f32(tmp32f.val[0], _phase_real); + //tmp32_real.val[1] = vmulq_f32(tmp32f.val[1], _phase_imag); + //tmp32_imag.val[0] = vmulq_f32(tmp32f.val[0], _phase_imag); + //tmp32_imag.val[1] = vmulq_f32(tmp32f.val[1], _phase_real); + + //tmp32f.val[0] = vsubq_f32(tmp32_real.val[0], tmp32_real.val[1]); + //tmp32f.val[1] = vaddq_f32(tmp32_imag.val[0], tmp32_imag.val[1]); + + //[> downcast results to int32 <] + //[> in __aarch64__ we can do that with vcvtaq_s32_f32(ret1); vcvtaq_s32_f32(ret2); <] + //sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(tmp32f.val[0]), 31))); + //PlusHalf = vaddq_f32(tmp32f.val[0], half); + //Round = vsubq_f32(PlusHalf, sign); + //tmp32i.val[0] = vcvtq_s32_f32(Round); + + //sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(tmp32f.val[1]), 31))); + //PlusHalf = vaddq_f32(tmp32f.val[1], half); + //Round = vsubq_f32(PlusHalf, sign); + //tmp32i.val[1] = vcvtq_s32_f32(Round); + + //[> downcast results to int16 <] + //tmp16.val[0] = vqmovn_s32(tmp32i.val[0]); + //tmp16.val[1] = vqmovn_s32(tmp32i.val[1]); + + //[> compute next four phases <] + //tmp32_real.val[0] = vmulq_f32(_phase_real, _phase4_real); + //tmp32_real.val[1] = vmulq_f32(_phase_imag, _phase4_imag); + //tmp32_imag.val[0] = vmulq_f32(_phase_real, _phase4_imag); + //tmp32_imag.val[1] = vmulq_f32(_phase_imag, _phase4_real); + + //_phase_real = vsubq_f32(tmp32_real.val[0], tmp32_real.val[1]); + //_phase_imag = vaddq_f32(tmp32_imag.val[0], tmp32_imag.val[1]); + + //// Regenerate phase + //if ((number % 256) == 0) + //{ + ////printf("computed phase: %f\n", cos(cargf(lv_cmake(_phase_real[0],_phase_imag[0])))); + //phase_est = arg_phase0 + (number + 1) * 4 * arg_phase_inc; + 
////printf("Estimated phase: %f\n\n", cos(phase_est)); + + //*phase = lv_cmake(cos(phase_est), sin(phase_est)); + //phase2 = (lv_32fc_t)(*phase) * phase_inc; + //phase3 = phase2 * phase_inc; + //phase4 = phase3 * phase_inc; + + //__VOLK_ATTR_ALIGNED(16) float32_t ____phase_real[4] = { lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) }; + //__VOLK_ATTR_ALIGNED(16) float32_t ____phase_imag[4] = { lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) }; + + //_phase_real = vld1q_f32(____phase_real); + //_phase_imag = vld1q_f32(____phase_imag); + + //// Round = vmulq_f32(_phase_real, _phase_real); + //// Round = vmlaq_f32(Round, _phase_imag, _phase_imag); + //// Round = vsqrtq_f32(Round);//printf("sqrt: %f \n", Round[0]); + ////Round = vrsqrteq_f32(Round);printf("1/sqtr: %f \n",Round[0]); + ////Round = vrecpeq_f32((Round); + //// _phase_real = vdivq_f32(_phase_real, Round); + //// _phase_imag = vdivq_f32(_phase_imag, Round); + ////_phase_real = vmulq_f32(_phase_real, Round); + ////_phase_imag = vmulq_f32(_phase_imag, Round); + ////printf("After %i: %f,%f, %f\n\n", number, _phase_real[0], _phase_imag[0], sqrt(_phase_real[0]*_phase_real[0]+_phase_imag[0]*_phase_imag[0])); + + //} + + //for (n_vec = 0; n_vec < num_a_vectors; n_vec++) + //{ + //a_val = vld2_s16((int16_t*)&(_in_a[n_vec][number*4])); + + //b_val.val[0] = vmul_s16(a_val.val[0], tmp16.val[0]); + //b_val.val[1] = vmul_s16(a_val.val[1], tmp16.val[0]); + + //// use multiply accumulate/subtract to get result + //b_val.val[0] = vmls_s16(b_val.val[0], a_val.val[1], tmp16.val[1]); + //b_val.val[1] = vmla_s16(b_val.val[1], a_val.val[0], tmp16.val[1]); + + //accumulator[n_vec].val[0] = vqadd_s16(accumulator[n_vec].val[0], b_val.val[0]); + //accumulator[n_vec].val[1] = vqadd_s16(accumulator[n_vec].val[1], b_val.val[1]); + //} + //} + + //for (n_vec = 0; n_vec < num_a_vectors; n_vec++) + //{ + //vst2_s16((int16_t*)dotProductVector, accumulator[n_vec]); // Store the results back into the dot product vector + //dotProduct = lv_cmake(0,0); + //for (i = 0; i < 4; ++i) + //{ + //dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[i])), + //sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i]))); + //} + //_out[n_vec] = dotProduct; + //} + //volk_gnsssdr_free(accumulator); + + //vst1q_f32((float32_t*)__phase_real, _phase_real); + //vst1q_f32((float32_t*)__phase_imag, _phase_imag); + + //(*phase) = lv_cmake((float32_t)__phase_real[0], (float32_t)__phase_imag[0]); + //} + + //for (n = neon_iters * 4; n < num_points; n++) + //{ + //tmp16_ = in_common[n]; //printf("neon phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); + //tmp32_ = lv_cmake((float32_t)lv_creal(tmp16_), (float32_t)lv_cimag(tmp16_)) * (*phase); + //tmp16_ = lv_cmake((int16_t)rintf(lv_creal(tmp32_)), (int16_t)rintf(lv_cimag(tmp32_))); + //(*phase) *= phase_inc; + //for (n_vec = 0; n_vec < num_a_vectors; n_vec++) + //{ + //tmp = tmp16_ * in_a[n_vec][n]; + //_out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); + //} + //} +//} + +//#endif [> LV_HAVE_NEON <] + + +//#ifdef LV_HAVE_NEON +//#include +//#include + +//static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_neon_optvma(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const int16_t** in_a, int num_a_vectors, unsigned int num_points) +//{ + //const unsigned int neon_iters = num_points / 4; + + //const int16_t** _in_a = 
in_a; + //const lv_16sc_t* _in_common = in_common; + //lv_16sc_t* _out = result; + //int n_vec; + //int i; + //unsigned int number; + //unsigned int n; + //lv_16sc_t tmp16_, tmp; + //lv_32fc_t tmp32_; + + //if (neon_iters > 0) + //{ + //lv_16sc_t dotProduct = lv_cmake(0,0); + //float arg_phase0 = cargf(*phase); + //float arg_phase_inc = cargf(phase_inc); + //float phase_est; + + //lv_32fc_t ___phase4 = phase_inc * phase_inc * phase_inc * phase_inc; + //__VOLK_ATTR_ALIGNED(16) float32_t __phase4_real[4] = { lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4) }; + //__VOLK_ATTR_ALIGNED(16) float32_t __phase4_imag[4] = { lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4) }; + + //float32x4_t _phase4_real = vld1q_f32(__phase4_real); + //float32x4_t _phase4_imag = vld1q_f32(__phase4_imag); + + //lv_32fc_t phase2 = (lv_32fc_t)(*phase) * phase_inc; + //lv_32fc_t phase3 = phase2 * phase_inc; + //lv_32fc_t phase4 = phase3 * phase_inc; + + //__VOLK_ATTR_ALIGNED(16) float32_t __phase_real[4] = { lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) }; + //__VOLK_ATTR_ALIGNED(16) float32_t __phase_imag[4] = { lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) }; + + //float32x4_t _phase_real = vld1q_f32(__phase_real); + //float32x4_t _phase_imag = vld1q_f32(__phase_imag); + + //int16x4x2_t a_val, b_val; + //__VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; + //float32x4_t half = vdupq_n_f32(0.5f); + //int32x4x2_t tmp32i; + + //float32x4x2_t tmp32f, tmp32_real, tmp32_imag; + //float32x4_t sign, PlusHalf, Round; + + //int16x4x2_t* accumulator1 = (int16x4x2_t*)volk_gnsssdr_malloc(num_a_vectors * sizeof(int16x4x2_t), volk_gnsssdr_get_alignment()); + //int16x4x2_t* accumulator2 = (int16x4x2_t*)volk_gnsssdr_malloc(num_a_vectors * sizeof(int16x4x2_t), volk_gnsssdr_get_alignment()); + + //for(n_vec = 0; n_vec < num_a_vectors; n_vec++) + //{ + //accumulator1[n_vec].val[0] = vdup_n_s16(0); + //accumulator1[n_vec].val[1] = vdup_n_s16(0); + //accumulator2[n_vec].val[0] = vdup_n_s16(0); + //accumulator2[n_vec].val[1] = vdup_n_s16(0); + //} + + //for(number = 0; number < neon_iters; number++) + //{ + //[> load 4 complex numbers (int 16 bits each component) <] + //b_val = vld2_s16((int16_t*)_in_common); + //__VOLK_GNSSSDR_PREFETCH(_in_common + 8); + //_in_common += 4; + + //[> promote them to int 32 bits <] + //tmp32i.val[0] = vmovl_s16(b_val.val[0]); + //tmp32i.val[1] = vmovl_s16(b_val.val[1]); + + //[> promote them to float 32 bits <] + //tmp32f.val[0] = vcvtq_f32_s32(tmp32i.val[0]); + //tmp32f.val[1] = vcvtq_f32_s32(tmp32i.val[1]); + + //[> complex multiplication of four complex samples (float 32 bits each component) <] + //tmp32_real.val[0] = vmulq_f32(tmp32f.val[0], _phase_real); + //tmp32_real.val[1] = vmulq_f32(tmp32f.val[1], _phase_imag); + //tmp32_imag.val[0] = vmulq_f32(tmp32f.val[0], _phase_imag); + //tmp32_imag.val[1] = vmulq_f32(tmp32f.val[1], _phase_real); + + //tmp32f.val[0] = vsubq_f32(tmp32_real.val[0], tmp32_real.val[1]); + //tmp32f.val[1] = vaddq_f32(tmp32_imag.val[0], tmp32_imag.val[1]); + + //[> downcast results to int32 <] + //[> in __aarch64__ we can do that with vcvtaq_s32_f32(ret1); vcvtaq_s32_f32(ret2); <] + //sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(tmp32f.val[0]), 31))); + //PlusHalf = vaddq_f32(tmp32f.val[0], half); + //Round = vsubq_f32(PlusHalf, sign); + //tmp32i.val[0] = vcvtq_s32_f32(Round); + + //sign = 
vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(tmp32f.val[1]), 31))); + //PlusHalf = vaddq_f32(tmp32f.val[1], half); + //Round = vsubq_f32(PlusHalf, sign); + //tmp32i.val[1] = vcvtq_s32_f32(Round); + + //[> downcast results to int16 <] + //b_val.val[0] = vqmovn_s32(tmp32i.val[0]); + //b_val.val[1] = vqmovn_s32(tmp32i.val[1]); + + //[> compute next four phases <] + //tmp32_real.val[0] = vmulq_f32(_phase_real, _phase4_real); + //tmp32_real.val[1] = vmulq_f32(_phase_imag, _phase4_imag); + //tmp32_imag.val[0] = vmulq_f32(_phase_real, _phase4_imag); + //tmp32_imag.val[1] = vmulq_f32(_phase_imag, _phase4_real); + + //_phase_real = vsubq_f32(tmp32_real.val[0], tmp32_real.val[1]); + //_phase_imag = vaddq_f32(tmp32_imag.val[0], tmp32_imag.val[1]); + + //// Regenerate phase + //if ((number % 256) == 0) + //{ + ////printf("computed phase: %f\n", cos(cargf(lv_cmake(_phase_real[0],_phase_imag[0])))); + //phase_est = arg_phase0 + (number + 1) * 4 * arg_phase_inc; + ////printf("Estimated phase: %f\n\n", cos(phase_est)); + + //*phase = lv_cmake(cos(phase_est), sin(phase_est)); + //phase2 = (lv_32fc_t)(*phase) * phase_inc; + //phase3 = phase2 * phase_inc; + //phase4 = phase3 * phase_inc; + + //__VOLK_ATTR_ALIGNED(16) float32_t ____phase_real[4] = { lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) }; + //__VOLK_ATTR_ALIGNED(16) float32_t ____phase_imag[4] = { lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) }; + + //_phase_real = vld1q_f32(____phase_real); + //_phase_imag = vld1q_f32(____phase_imag); + //} + + //for (n_vec = 0; n_vec < num_a_vectors; n_vec++) + //{ + //a_val = vld2_s16((int16_t*)&(_in_a[n_vec][number*4])); + + //// use 2 accumulators to remove inter-instruction data dependencies + //accumulator1[n_vec].val[0] = vmla_s16(accumulator1[n_vec].val[0], a_val.val[0], b_val.val[0]); + //accumulator1[n_vec].val[1] = vmla_s16(accumulator1[n_vec].val[1], a_val.val[0], b_val.val[1]); + //accumulator2[n_vec].val[0] = vmls_s16(accumulator2[n_vec].val[0], a_val.val[1], b_val.val[1]); + //accumulator2[n_vec].val[1] = vmla_s16(accumulator2[n_vec].val[1], a_val.val[1], b_val.val[0]); + //} + //} + //for (n_vec = 0; n_vec < num_a_vectors; n_vec++) + //{ + //accumulator1[n_vec].val[0] = vqadd_s16(accumulator1[n_vec].val[0], accumulator2[n_vec].val[0]); + //accumulator1[n_vec].val[1] = vqadd_s16(accumulator1[n_vec].val[1], accumulator2[n_vec].val[1]); + //} + //for (n_vec = 0; n_vec < num_a_vectors; n_vec++) + //{ + //vst2_s16((int16_t*)dotProductVector, accumulator1[n_vec]); // Store the results back into the dot product vector + //dotProduct = lv_cmake(0,0); + //for (i = 0; i < 4; ++i) + //{ + //dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[i])), + //sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i]))); + //} + //_out[n_vec] = dotProduct; + //} + //volk_gnsssdr_free(accumulator1); + //volk_gnsssdr_free(accumulator2); + + //vst1q_f32((float32_t*)__phase_real, _phase_real); + //vst1q_f32((float32_t*)__phase_imag, _phase_imag); + + //(*phase) = lv_cmake((float32_t)__phase_real[0], (float32_t)__phase_imag[0]); + //} + + //for (n = neon_iters * 4; n < num_points; n++) + //{ + //tmp16_ = in_common[n]; //printf("neon phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); + //tmp32_ = lv_cmake((float32_t)lv_creal(tmp16_), (float32_t)lv_cimag(tmp16_)) * (*phase); + //tmp16_ = lv_cmake((int16_t)rintf(lv_creal(tmp32_)), (int16_t)rintf(lv_cimag(tmp32_))); + //(*phase) *= phase_inc; + //for (n_vec = 0; n_vec < 
num_a_vectors; n_vec++) + //{ + //tmp = tmp16_ * in_a[n_vec][n]; + //_out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); + //} + //} +//} + +//#endif [> LV_HAVE_NEON <] + +#endif /*INCLUDED_volk_gnsssdr_16ic_16i_dot_prod_16ic_xn_H*/ + + diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic.h new file mode 100644 index 000000000..a666c0270 --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic.h @@ -0,0 +1,384 @@ +/*! + * \file volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic.h + * \brief Volk puppet for the multiple 16-bit complex dot product kernel. + * \authors
+ *          Carles Fernandez Prades 2016 cfernandez at cttc dot cat
+ *
+ * + * Volk puppet for integrating the resampler into volk's test system + * + * ------------------------------------------------------------------------- + * + * Copyright (C) 2010-2015 (see AUTHORS file for a list of contributors) + * + * GNSS-SDR is a software defined Global Navigation + * Satellite Systems receiver + * + * This file is part of GNSS-SDR. + * + * GNSS-SDR is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * GNSS-SDR is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNSS-SDR. If not, see . + * + * ------------------------------------------------------------------------- + */ + +#ifndef INCLUDED_volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_H +#define INCLUDED_volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_H + +#include "volk_gnsssdr/volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn.h" +#include +#include +#include + +#ifdef LV_HAVE_GENERIC +static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_generic(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points) +{ + // phases must be normalized. Phase rotator expects a complex exponential input! + float rem_carrier_phase_in_rad = 0.345; + float phase_step_rad = 0.1; + lv_32fc_t phase[1]; + phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad)); + lv_32fc_t phase_inc[1]; + phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad)); + unsigned int n; + int num_a_vectors = 3; + int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); + for(n = 0; n < num_a_vectors; n++) + { + in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); + memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points); + } + volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_generic(result, local_code, phase_inc[0], phase,(const int16_t**) in_a, num_a_vectors, num_points); + + for(n = 0; n < num_a_vectors; n++) + { + volk_gnsssdr_free(in_a[n]); + } + volk_gnsssdr_free(in_a); +} + +#endif // Generic + + +#ifdef LV_HAVE_GENERIC +static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_generic_reload(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points) +{ + // phases must be normalized. Phase rotator expects a complex exponential input! 
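+ // Fixed test values; the puppet replicates the test input across num_a_vectors int16_t code vectors and passes a unit-magnitude phase and increment to the kernel under test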
+ float rem_carrier_phase_in_rad = 0.345; + float phase_step_rad = 0.1; + lv_32fc_t phase[1]; + phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad)); + lv_32fc_t phase_inc[1]; + phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad)); + unsigned int n; + int num_a_vectors = 3; + int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); + for(n = 0; n < num_a_vectors; n++) + { + in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); + memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points); + } + volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_generic_reload(result, local_code, phase_inc[0], phase,(const int16_t**) in_a, num_a_vectors, num_points); + + for(n = 0; n < num_a_vectors; n++) + { + volk_gnsssdr_free(in_a[n]); + } + volk_gnsssdr_free(in_a); +} + +#endif // Generic + + +#ifdef LV_HAVE_SSE3 +static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_a_sse3(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points) +{ + // phases must be normalized. Phase rotator expects a complex exponential input! + float rem_carrier_phase_in_rad = 0.345; + float phase_step_rad = 0.1; + lv_32fc_t phase[1]; + phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad)); + lv_32fc_t phase_inc[1]; + phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad)); + unsigned int n; + int num_a_vectors = 3; + int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); + for(n = 0; n < num_a_vectors; n++) + { + in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); + memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points); + } + + volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_sse3(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, num_a_vectors, num_points); + + for(n = 0; n < num_a_vectors; n++) + { + volk_gnsssdr_free(in_a[n]); + } + volk_gnsssdr_free(in_a); +} + +#endif // SSE3 + + +//#ifdef LV_HAVE_SSE3 +//static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_a_sse3_reload(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points) +//{ + //// phases must be normalized. Phase rotator expects a complex exponential input! 
+ //float rem_carrier_phase_in_rad = 0.345; + //float phase_step_rad = 0.1; + //lv_32fc_t phase[1]; + //phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad)); + //lv_32fc_t phase_inc[1]; + //phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad)); + //unsigned int n; + //int num_a_vectors = 3; + //int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); + //for(n = 0; n < num_a_vectors; n++) + //{ + //in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); + //memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points); + //} + + //volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_sse3_reload(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, num_a_vectors, num_points); + + //for(n = 0; n < num_a_vectors; n++) + //{ + //volk_gnsssdr_free(in_a[n]); + //} + //volk_gnsssdr_free(in_a); +//} + +//#endif // SSE3 + + +#ifdef LV_HAVE_SSE3 +static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_u_sse3(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points) +{ + // phases must be normalized. Phase rotator expects a complex exponential input! + float rem_carrier_phase_in_rad = 0.345; + float phase_step_rad = 0.1; + lv_32fc_t phase[1]; + phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad)); + lv_32fc_t phase_inc[1]; + phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad)); + unsigned int n; + int num_a_vectors = 3; + int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); + for(n = 0; n < num_a_vectors; n++) + { + in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); + memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points); + } + + volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_u_sse3(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, num_a_vectors, num_points); + + for(n = 0; n < num_a_vectors; n++) + { + volk_gnsssdr_free(in_a[n]); + } + volk_gnsssdr_free(in_a); +} + +#endif // SSE3 + + +#ifdef LV_HAVE_AVX2 +static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_a_avx2(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points) +{ + // phases must be normalized. Phase rotator expects a complex exponential input! 
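+ // Buffers from volk_gnsssdr_malloc are aligned, as required by the _a (aligned) AVX2 kernel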
+ float rem_carrier_phase_in_rad = 0.345; + float phase_step_rad = 0.1; + lv_32fc_t phase[1]; + phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad)); + lv_32fc_t phase_inc[1]; + phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad)); + unsigned int n; + int num_a_vectors = 3; + int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); + for(n = 0; n < num_a_vectors; n++) + { + in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); + memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points); + } + + volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_avx2(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, num_a_vectors, num_points); + + for(n = 0; n < num_a_vectors; n++) + { + volk_gnsssdr_free(in_a[n]); + } + volk_gnsssdr_free(in_a); +} + +#endif // AVX2 + + +//#ifdef LV_HAVE_AVX2 +//static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_a_avx2_reload(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points) +//{ + //// phases must be normalized. Phase rotator expects a complex exponential input! + //float rem_carrier_phase_in_rad = 0.345; + //float phase_step_rad = 0.1; + //lv_32fc_t phase[1]; + //phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad)); + //lv_32fc_t phase_inc[1]; + //phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad)); + //unsigned int n; + //int num_a_vectors = 3; + //int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); + //for(n = 0; n < num_a_vectors; n++) + //{ + //in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); + //memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points); + //} + + //volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_avx2_reload(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, num_a_vectors, num_points); + + //for(n = 0; n < num_a_vectors; n++) + //{ + //volk_gnsssdr_free(in_a[n]); + //} + //volk_gnsssdr_free(in_a); +//} + +//#endif // AVX2 + + +#ifdef LV_HAVE_AVX2 +static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_u_avx2(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points) +{ + // phases must be normalized. Phase rotator expects a complex exponential input! 
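+ // Same setup as the aligned puppet; this one drives the unaligned (_u) AVX2 kernel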
+ float rem_carrier_phase_in_rad = 0.345; + float phase_step_rad = 0.1; + lv_32fc_t phase[1]; + phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad)); + lv_32fc_t phase_inc[1]; + phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad)); + unsigned int n; + int num_a_vectors = 3; + int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); + for(n = 0; n < num_a_vectors; n++) + { + in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); + memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points); + } + + volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_u_avx2(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, num_a_vectors, num_points); + + for(n = 0; n < num_a_vectors; n++) + { + volk_gnsssdr_free(in_a[n]); + } + volk_gnsssdr_free(in_a); +} + +#endif // AVX2 + + +//#ifdef LV_HAVE_AVX2 +//static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_u_avx2_reload(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points) +//{ + //// phases must be normalized. Phase rotator expects a complex exponential input! + //float rem_carrier_phase_in_rad = 0.345; + //float phase_step_rad = 0.1; + //lv_32fc_t phase[1]; + //phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad)); + //lv_32fc_t phase_inc[1]; + //phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad)); + //unsigned int n; + //int num_a_vectors = 3; + //int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); + //for(n = 0; n < num_a_vectors; n++) + //{ + //in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); + //memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points); + //} + + //volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_avx2_reload(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, num_a_vectors, num_points); + + //for(n = 0; n < num_a_vectors; n++) + //{ + //volk_gnsssdr_free(in_a[n]); + //} + //volk_gnsssdr_free(in_a); +//} + +//#endif // AVX2 + + +//#ifdef LV_HAVE_NEON +//static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_neon(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points) +//{ + //// phases must be normalized. Phase rotator expects a complex exponential input! 
+ //float rem_carrier_phase_in_rad = 0.345; + //float phase_step_rad = 0.1; + //lv_32fc_t phase[1]; + //phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad)); + //lv_32fc_t phase_inc[1]; + //phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad)); + //unsigned int n; + //int num_a_vectors = 3; + //int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); + //for(n = 0; n < num_a_vectors; n++) + //{ + //in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); + //memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points); + //} + + //volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_neon(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, num_a_vectors, num_points); + + //for(n = 0; n < num_a_vectors; n++) + //{ + //volk_gnsssdr_free(in_a[n]); + //} + //volk_gnsssdr_free(in_a); +//} + +//#endif // NEON + + +//#ifdef LV_HAVE_NEON +//static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_neon_vma(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points) +//{ + //// phases must be normalized. Phase rotator expects a complex exponential input! + //float rem_carrier_phase_in_rad = 0.345; + //float phase_step_rad = 0.1; + //lv_32fc_t phase[1]; + //phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad)); + //lv_32fc_t phase_inc[1]; + //phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad)); + //unsigned int n; + //int num_a_vectors = 3; + //int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); + //for(n = 0; n < num_a_vectors; n++) + //{ + //in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); + //memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points); + //} + + //volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_neon_vma(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, num_a_vectors, num_points); + + //for(n = 0; n < num_a_vectors; n++) + //{ + //volk_gnsssdr_free(in_a[n]); + //} + //volk_gnsssdr_free(in_a); +//} + +//#endif // NEON + +#endif // INCLUDED_volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_H + + + diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_resamplerxnpuppet_16ic.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_resamplerxnpuppet_16ic.h index 106ad85b7..85e6fcb08 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_resamplerxnpuppet_16ic.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_resamplerxnpuppet_16ic.h @@ -44,8 +44,8 @@ #ifdef LV_HAVE_GENERIC static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_generic(lv_16sc_t* result, const lv_16sc_t* local_code, unsigned int num_points) { - float code_phase_step_chips = -0.6; int code_length_chips = 2046; + float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); int num_out_vectors = 3; unsigned int n; float rem_code_phase_chips = -0.234; @@ -74,8 +74,8 @@ static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_generic(lv_16sc_t* r #ifdef LV_HAVE_SSE3 static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_a_sse3(lv_16sc_t* result, const lv_16sc_t* local_code, unsigned int num_points) { - float 
code_phase_step_chips = -0.6; int code_length_chips = 2046; + float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; @@ -103,8 +103,8 @@ static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_a_sse3(lv_16sc_t* re #ifdef LV_HAVE_SSE3 static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_u_sse3(lv_16sc_t* result, const lv_16sc_t* local_code, unsigned int num_points) { - float code_phase_step_chips = -0.6; int code_length_chips = 2046; + float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; @@ -133,8 +133,8 @@ static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_u_sse3(lv_16sc_t* re #ifdef LV_HAVE_SSE4_1 static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_u_sse4_1(lv_16sc_t* result, const lv_16sc_t* local_code, unsigned int num_points) { - float code_phase_step_chips = -0.6; int code_length_chips = 2046; + float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; @@ -163,8 +163,8 @@ static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_u_sse4_1(lv_16sc_t* #ifdef LV_HAVE_SSE4_1 static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_a_sse4_1(lv_16sc_t* result, const lv_16sc_t* local_code, unsigned int num_points) { - float code_phase_step_chips = -0.6; int code_length_chips = 2046; + float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; @@ -193,8 +193,8 @@ static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_a_sse4_1(lv_16sc_t* #ifdef LV_HAVE_AVX static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_u_avx(lv_16sc_t* result, const lv_16sc_t* local_code, unsigned int num_points) { - float code_phase_step_chips = -0.6; int code_length_chips = 2046; + float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; @@ -223,8 +223,8 @@ static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_u_avx(lv_16sc_t* res #ifdef LV_HAVE_AVX static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_a_avx(lv_16sc_t* result, const lv_16sc_t* local_code, unsigned int num_points) { - float code_phase_step_chips = -0.6; int code_length_chips = 2046; + float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; @@ -253,8 +253,8 @@ static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_a_avx(lv_16sc_t* res #ifdef LV_HAVE_NEON static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_neon(lv_16sc_t* result, const lv_16sc_t* local_code, unsigned int num_points) { - float code_phase_step_chips = -0.6; int code_length_chips = 2046; + float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_resamplerxnpuppet_32f.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_resamplerxnpuppet_32f.h new file mode 100644 index 000000000..cf2a80f52 --- /dev/null +++ 
b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_resamplerxnpuppet_32f.h @@ -0,0 +1,279 @@ +/*! + * \file volk_gnsssdr_32f_resamplerxnpuppet_32f.h + * \brief VOLK_GNSSSDR puppet for the multiple 32-bit float vector resampler kernel. + * \authors
<ul>
+ *          <li> Cillian O'Driscoll, 2017. cillian.odriscoll at gmail dot com
+ *          </ul>
+ * + * VOLK_GNSSSDR puppet for integrating the multiple resampler into the test system + * + * ------------------------------------------------------------------------- + * + * Copyright (C) 2010-2017 (see AUTHORS file for a list of contributors) + * + * GNSS-SDR is a software defined Global Navigation + * Satellite Systems receiver + * + * This file is part of GNSS-SDR. + * + * GNSS-SDR is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * GNSS-SDR is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNSS-SDR. If not, see . + * + * ------------------------------------------------------------------------- + */ + +#ifndef INCLUDED_volk_gnsssdr_32f_resamplerxnpuppet_32f_H +#define INCLUDED_volk_gnsssdr_32f_resamplerxnpuppet_32f_H + +#include "volk_gnsssdr/volk_gnsssdr_32f_xn_resampler_32f_xn.h" +#include +#include +#include +#include + + + +#ifdef LV_HAVE_GENERIC +static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_generic(float* result, const float* local_code, unsigned int num_points) +{ + int code_length_chips = 2046; + float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + int num_out_vectors = 3; + float rem_code_phase_chips = -0.234; + unsigned int n; + float shifts_chips[3] = { -0.1, 0.0, 0.1 }; + + float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment()); + for(n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment()); + } + + volk_gnsssdr_32f_xn_resampler_32f_xn_generic(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); + + memcpy((float*)result, (float*)result_aux[0], sizeof(float) * num_points); + + for(n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } + volk_gnsssdr_free(result_aux); +} + + +#endif /* LV_HAVE_GENERIC */ + +#ifdef LV_HAVE_SSE3 +static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_a_sse3(float* result, const float* local_code, unsigned int num_points) +{ + int code_length_chips = 2046; + float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + int num_out_vectors = 3; + float rem_code_phase_chips = -0.234; + unsigned int n; + float shifts_chips[3] = { -0.1, 0.0, 0.1 }; + + float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment()); + for(n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment()); + } + + volk_gnsssdr_32f_xn_resampler_32f_xn_a_sse3(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); + + memcpy((float*)result, (float*)result_aux[0], sizeof(float) * num_points); + + for(n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } + volk_gnsssdr_free(result_aux); +} + +#endif + +#ifdef LV_HAVE_SSE3 +static inline void 
volk_gnsssdr_32f_resamplerxnpuppet_32f_u_sse3(float* result, const float* local_code, unsigned int num_points) +{ + int code_length_chips = 2046; + float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + int num_out_vectors = 3; + float rem_code_phase_chips = -0.234; + unsigned int n; + float shifts_chips[3] = { -0.1, 0.0, 0.1 }; + + float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment()); + for(n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment()); + } + + volk_gnsssdr_32f_xn_resampler_32f_xn_u_sse3(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); + + memcpy((float*)result, (float*)result_aux[0], sizeof(float) * num_points); + + for(n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } + volk_gnsssdr_free(result_aux); +} + +#endif + + +#ifdef LV_HAVE_SSE4_1 +static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_u_sse4_1(float* result, const float* local_code, unsigned int num_points) +{ + int code_length_chips = 2046; + float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + int num_out_vectors = 3; + float rem_code_phase_chips = -0.234; + unsigned int n; + float shifts_chips[3] = { -0.1, 0.0, 0.1 }; + + float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment()); + for(n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment()); + } + + volk_gnsssdr_32f_xn_resampler_32f_xn_u_sse4_1(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); + + memcpy((float*)result, (float*)result_aux[0], sizeof(float) * num_points); + + for(n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } + volk_gnsssdr_free(result_aux); +} + +#endif + +#ifdef LV_HAVE_SSE4_1 +static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_a_sse4_1(float* result, const float* local_code, unsigned int num_points) +{ + int code_length_chips = 2046; + float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + int num_out_vectors = 3; + float rem_code_phase_chips = -0.234; + unsigned int n; + float shifts_chips[3] = { -0.1, 0.0, 0.1 }; + + float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment()); + for(n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment()); + } + + volk_gnsssdr_32f_xn_resampler_32f_xn_a_sse4_1(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); + + memcpy((float*)result, (float*)result_aux[0], sizeof(float) * num_points); + + for(n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } + volk_gnsssdr_free(result_aux); +} + +#endif + +#ifdef LV_HAVE_AVX +static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_a_avx(float* result, const float* local_code, unsigned int num_points) +{ + int code_length_chips = 2046; + float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + int num_out_vectors = 3; + float rem_code_phase_chips = -0.234; + unsigned int n; + 
float shifts_chips[3] = { -0.1, 0.0, 0.1 }; + + float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment()); + for(n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment()); + } + + volk_gnsssdr_32f_xn_resampler_32f_xn_a_avx(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); + + memcpy((float*)result, (float*)result_aux[0], sizeof(float) * num_points); + + for(n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } + volk_gnsssdr_free(result_aux); +} +#endif + + +#ifdef LV_HAVE_AVX +static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_u_avx(float* result, const float* local_code, unsigned int num_points) +{ + int code_length_chips = 2046; + float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + int num_out_vectors = 3; + float rem_code_phase_chips = -0.234; + unsigned int n; + float shifts_chips[3] = { -0.1, 0.0, 0.1 }; + + float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment()); + for(n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment()); + } + + volk_gnsssdr_32f_xn_resampler_32f_xn_u_avx(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); + + memcpy((float*)result, (float*)result_aux[0], sizeof(float) * num_points); + + for(n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } + volk_gnsssdr_free(result_aux); +} +#endif + +#ifdef LV_HAVE_NEON +static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_neon(float* result, const float* local_code, unsigned int num_points) +{ + int code_length_chips = 2046; + float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + int num_out_vectors = 3; + float rem_code_phase_chips = -0.234; + unsigned int n; + float shifts_chips[3] = { -0.1, 0.0, 0.1 }; + + float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment()); + for(n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment()); + } + + volk_gnsssdr_32f_xn_resampler_32f_xn_neon(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); + + memcpy((float*)result, (float*)result_aux[0], sizeof(float) * num_points); + + for(n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } + volk_gnsssdr_free(result_aux); +} +#endif + +#endif // INCLUDED_volk_gnsssdr_32f_resamplerpuppet_32f_H + diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_xn_resampler_32f_xn.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_xn_resampler_32f_xn.h new file mode 100644 index 000000000..1fa95e0e6 --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_xn_resampler_32f_xn.h @@ -0,0 +1,610 @@ +/*! + * \file volk_gnsssdr_32f_xn_resampler_32f_xn.h + * \brief VOLK_GNSSSDR kernel: Resamples N complex 32-bit float vectors using zero hold resample algorithm. + * \authors
<ul>
+ *          <li> Cillian O'Driscoll, 2017. cillian.odriscoll(at)gmail.com
+ *          </ul>
+ * + * VOLK_GNSSSDR kernel that resamples N 32-bit float vectors using zero hold resample algorithm. + * It is optimized to resample a single GNSS local code signal replica into N vectors fractional-resampled and fractional-delayed + * (i.e. it creates the Early, Prompt, and Late code replicas) + * + * ------------------------------------------------------------------------- + * + * Copyright (C) 2010-2017 (see AUTHORS file for a list of contributors) + * + * GNSS-SDR is a software defined Global Navigation + * Satellite Systems receiver + * + * This file is part of GNSS-SDR. + * + * GNSS-SDR is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * GNSS-SDR is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNSS-SDR. If not, see . + * + * ------------------------------------------------------------------------- + */ + +/*! + * \page volk_gnsssdr_32f_xn_resampler_32f_xn + * + * \b Overview + * + * Resamples a 32-bit floating point vector , providing \p num_out_vectors outputs. + * + * Dispatcher Prototype + * \code + * void volk_gnsssdr_32f_xn_resampler_32f_xn(float** result, const float* local_code, float rem_code_phase_chips, float code_phase_step_chips, float* shifts_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_points) + * \endcode + * + * \b Inputs + * \li local_code: Vector to be resampled. + * \li rem_code_phase_chips: Remnant code phase [chips]. + * \li code_phase_step_chips: Phase increment per sample [chips/sample]. + * \li shifts_chips: Vector of floats that defines the spacing (in chips) between the replicas of \p local_code + * \li code_length_chips: Code length in chips. + * \li num_out_vectors Number of output vectors. + * \li num_points: The number of data values to be in the resampled vector. + * + * \b Outputs + * \li result: Pointer to a vector of pointers where the results will be stored. + * + */ + +#ifndef INCLUDED_volk_gnsssdr_32f_xn_resampler_32f_xn_H +#define INCLUDED_volk_gnsssdr_32f_xn_resampler_32f_xn_H + +#include +#include +#include /* abs */ +#include /* int64_t */ +#include +#include +#include + + +#ifdef LV_HAVE_GENERIC + +static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_generic(float** result, const float* local_code, float rem_code_phase_chips, float code_phase_step_chips, float* shifts_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_points) +{ + int local_code_chip_index; + int current_correlator_tap; + int n; + for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++) + { + for (n = 0; n < num_points; n++) + { + // resample code for current tap + local_code_chip_index = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); + //Take into account that in multitap correlators, the shifts can be negative! 
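+                    // The next two lines form a non-negative modulo: add enough whole code periods
+                    // to make the index >= 0, then wrap it into [0, code_length_chips).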
+ if (local_code_chip_index < 0) local_code_chip_index += (int)code_length_chips * (abs(local_code_chip_index) / code_length_chips + 1); + local_code_chip_index = local_code_chip_index % code_length_chips; + result[current_correlator_tap][n] = local_code[local_code_chip_index]; + } + } +} + +#endif /*LV_HAVE_GENERIC*/ + + +#ifdef LV_HAVE_SSE3 +#include +static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_a_sse3(float** result, const float* local_code, float rem_code_phase_chips, float code_phase_step_chips, float* shifts_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_points) +{ + float** _result = result; + const unsigned int quarterPoints = num_points / 4; + int current_correlator_tap; + unsigned int n; + unsigned int k; + const __m128 ones = _mm_set1_ps(1.0f); + const __m128 fours = _mm_set1_ps(4.0f); + const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips); + const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips); + + __VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4]; + int local_code_chip_index_; + + const __m128i zeros = _mm_setzero_si128(); + const __m128 code_length_chips_reg_f = _mm_set_ps1((float)code_length_chips); + const __m128i code_length_chips_reg_i = _mm_set1_epi32((int)code_length_chips); + __m128i local_code_chip_index_reg, aux_i, negatives, i; + __m128 aux, aux2, shifts_chips_reg, fi, igx, j, c, cTrunc, base; + + for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++) + { + shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]); + aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg); + __m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f); + for(n = 0; n < quarterPoints; n++) + { + aux = _mm_mul_ps(code_phase_step_chips_reg, indexn); + aux = _mm_add_ps(aux, aux2); + // floor + i = _mm_cvttps_epi32(aux); + fi = _mm_cvtepi32_ps(i); + igx = _mm_cmpgt_ps(fi, aux); + j = _mm_and_ps(igx, ones); + aux = _mm_sub_ps(fi, j); + // fmod + c = _mm_div_ps(aux, code_length_chips_reg_f); + i = _mm_cvttps_epi32(c); + cTrunc = _mm_cvtepi32_ps(i); + base = _mm_mul_ps(cTrunc, code_length_chips_reg_f); + local_code_chip_index_reg = _mm_cvtps_epi32(_mm_sub_ps(aux, base)); + + negatives = _mm_cmplt_epi32(local_code_chip_index_reg, zeros); + aux_i = _mm_and_si128(code_length_chips_reg_i, negatives); + local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i); + _mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg); + for(k = 0; k < 4; ++k) + { + _result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]]; + } + indexn = _mm_add_ps(indexn, fours); + } + for(n = quarterPoints * 4; n < num_points; n++) + { + // resample code for current tap + local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); + //Take into account that in multitap correlators, the shifts can be negative! 
+ if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1) ; + local_code_chip_index_ = local_code_chip_index_ % code_length_chips; + _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; + } + } +} + +#endif + + +#ifdef LV_HAVE_SSE3 +#include +static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_u_sse3(float** result, const float* local_code, float rem_code_phase_chips, float code_phase_step_chips, float* shifts_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_points) +{ + float** _result = result; + const unsigned int quarterPoints = num_points / 4; + int current_correlator_tap; + unsigned int n; + unsigned int k; + const __m128 ones = _mm_set1_ps(1.0f); + const __m128 fours = _mm_set1_ps(4.0f); + const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips); + const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips); + + __VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4]; + int local_code_chip_index_; + + const __m128i zeros = _mm_setzero_si128(); + const __m128 code_length_chips_reg_f = _mm_set_ps1((float)code_length_chips); + const __m128i code_length_chips_reg_i = _mm_set1_epi32((int)code_length_chips); + __m128i local_code_chip_index_reg, aux_i, negatives, i; + __m128 aux, aux2, shifts_chips_reg, fi, igx, j, c, cTrunc, base; + + for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++) + { + shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]); + aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg); + __m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f); + for(n = 0; n < quarterPoints; n++) + { + aux = _mm_mul_ps(code_phase_step_chips_reg, indexn); + aux = _mm_add_ps(aux, aux2); + // floor + i = _mm_cvttps_epi32(aux); + fi = _mm_cvtepi32_ps(i); + igx = _mm_cmpgt_ps(fi, aux); + j = _mm_and_ps(igx, ones); + aux = _mm_sub_ps(fi, j); + // fmod + c = _mm_div_ps(aux, code_length_chips_reg_f); + i = _mm_cvttps_epi32(c); + cTrunc = _mm_cvtepi32_ps(i); + base = _mm_mul_ps(cTrunc, code_length_chips_reg_f); + local_code_chip_index_reg = _mm_cvtps_epi32(_mm_sub_ps(aux, base)); + + negatives = _mm_cmplt_epi32(local_code_chip_index_reg, zeros); + aux_i = _mm_and_si128(code_length_chips_reg_i, negatives); + local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i); + _mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg); + for(k = 0; k < 4; ++k) + { + _result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]]; + } + indexn = _mm_add_ps(indexn, fours); + } + for(n = quarterPoints * 4; n < num_points; n++) + { + // resample code for current tap + local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); + //Take into account that in multitap correlators, the shifts can be negative! 
+ if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1) ; + local_code_chip_index_ = local_code_chip_index_ % code_length_chips; + _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; + } + } +} +#endif + + +#ifdef LV_HAVE_SSE4_1 +#include +static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_a_sse4_1(float** result, const float* local_code, float rem_code_phase_chips, float code_phase_step_chips, float* shifts_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_points) +{ + float** _result = result; + const unsigned int quarterPoints = num_points / 4; + int current_correlator_tap; + unsigned int n; + unsigned int k; + const __m128 fours = _mm_set1_ps(4.0f); + const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips); + const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips); + + __VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4]; + int local_code_chip_index_; + + const __m128i zeros = _mm_setzero_si128(); + const __m128 code_length_chips_reg_f = _mm_set_ps1((float)code_length_chips); + const __m128i code_length_chips_reg_i = _mm_set1_epi32((int)code_length_chips); + __m128i local_code_chip_index_reg, aux_i, negatives, i; + __m128 aux, aux2, shifts_chips_reg, c, cTrunc, base; + + for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++) + { + shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]); + aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg); + __m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f); + for(n = 0; n < quarterPoints; n++) + { + aux = _mm_mul_ps(code_phase_step_chips_reg, indexn); + aux = _mm_add_ps(aux, aux2); + // floor + aux = _mm_floor_ps(aux); + + // fmod + c = _mm_div_ps(aux, code_length_chips_reg_f); + i = _mm_cvttps_epi32(c); + cTrunc = _mm_cvtepi32_ps(i); + base = _mm_mul_ps(cTrunc, code_length_chips_reg_f); + local_code_chip_index_reg = _mm_cvtps_epi32(_mm_sub_ps(aux, base)); + + negatives = _mm_cmplt_epi32(local_code_chip_index_reg, zeros); + aux_i = _mm_and_si128(code_length_chips_reg_i, negatives); + local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i); + _mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg); + for(k = 0; k < 4; ++k) + { + _result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]]; + } + indexn = _mm_add_ps(indexn, fours); + } + for(n = quarterPoints * 4; n < num_points; n++) + { + // resample code for current tap + local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); + //Take into account that in multitap correlators, the shifts can be negative! 
+ if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1) ; + local_code_chip_index_ = local_code_chip_index_ % code_length_chips; + _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; + } + } +} + +#endif + + +#ifdef LV_HAVE_SSE4_1 +#include +static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_u_sse4_1(float** result, const float* local_code, float rem_code_phase_chips, float code_phase_step_chips, float* shifts_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_points) +{ + float** _result = result; + const unsigned int quarterPoints = num_points / 4; + int current_correlator_tap; + unsigned int n; + unsigned int k; + const __m128 fours = _mm_set1_ps(4.0f); + const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips); + const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips); + + __VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4]; + int local_code_chip_index_; + + const __m128i zeros = _mm_setzero_si128(); + const __m128 code_length_chips_reg_f = _mm_set_ps1((float)code_length_chips); + const __m128i code_length_chips_reg_i = _mm_set1_epi32((int)code_length_chips); + __m128i local_code_chip_index_reg, aux_i, negatives, i; + __m128 aux, aux2, shifts_chips_reg, c, cTrunc, base; + + for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++) + { + shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]); + aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg); + __m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f); + for(n = 0; n < quarterPoints; n++) + { + aux = _mm_mul_ps(code_phase_step_chips_reg, indexn); + aux = _mm_add_ps(aux, aux2); + // floor + aux = _mm_floor_ps(aux); + + // fmod + c = _mm_div_ps(aux, code_length_chips_reg_f); + i = _mm_cvttps_epi32(c); + cTrunc = _mm_cvtepi32_ps(i); + base = _mm_mul_ps(cTrunc, code_length_chips_reg_f); + local_code_chip_index_reg = _mm_cvtps_epi32(_mm_sub_ps(aux, base)); + + negatives = _mm_cmplt_epi32(local_code_chip_index_reg, zeros); + aux_i = _mm_and_si128(code_length_chips_reg_i, negatives); + local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i); + _mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg); + for(k = 0; k < 4; ++k) + { + _result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]]; + } + indexn = _mm_add_ps(indexn, fours); + } + for(n = quarterPoints * 4; n < num_points; n++) + { + // resample code for current tap + local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); + //Take into account that in multitap correlators, the shifts can be negative! 
+ if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1) ; + local_code_chip_index_ = local_code_chip_index_ % code_length_chips; + _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; + } + } +} + +#endif + + +#ifdef LV_HAVE_AVX +#include +static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_a_avx(float** result, const float* local_code, float rem_code_phase_chips, float code_phase_step_chips, float* shifts_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_points) +{ + float** _result = result; + const unsigned int avx_iters = num_points / 8; + int current_correlator_tap; + unsigned int n; + unsigned int k; + const __m256 eights = _mm256_set1_ps(8.0f); + const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips); + const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips); + + __VOLK_ATTR_ALIGNED(32) int local_code_chip_index[8]; + int local_code_chip_index_; + + const __m256 zeros = _mm256_setzero_ps(); + const __m256 code_length_chips_reg_f = _mm256_set1_ps((float)code_length_chips); + const __m256 n0 = _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f); + + __m256i local_code_chip_index_reg, i; + __m256 aux, aux2, aux3, shifts_chips_reg, c, cTrunc, base, negatives, indexn; + + for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++) + { + shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]); + aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg); + indexn = n0; + for(n = 0; n < avx_iters; n++) + { + __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0); + __VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3); + aux = _mm256_mul_ps(code_phase_step_chips_reg, indexn); + aux = _mm256_add_ps(aux, aux2); + // floor + aux = _mm256_floor_ps(aux); + + // fmod + c = _mm256_div_ps(aux, code_length_chips_reg_f); + i = _mm256_cvttps_epi32(c); + cTrunc = _mm256_cvtepi32_ps(i); + base = _mm256_mul_ps(cTrunc, code_length_chips_reg_f); + local_code_chip_index_reg = _mm256_cvttps_epi32(_mm256_sub_ps(aux, base)); + + // no negatives + c = _mm256_cvtepi32_ps(local_code_chip_index_reg); + negatives = _mm256_cmp_ps(c, zeros, 0x01 ); + aux3 = _mm256_and_ps(code_length_chips_reg_f, negatives); + aux = _mm256_add_ps(c, aux3); + local_code_chip_index_reg = _mm256_cvttps_epi32(aux); + + _mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg); + for(k = 0; k < 8; ++k) + { + _result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]]; + } + indexn = _mm256_add_ps(indexn, eights); + } + } + _mm256_zeroupper(); + for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++) + { + for(n = avx_iters * 8; n < num_points; n++) + { + // resample code for current tap + local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); + //Take into account that in multitap correlators, the shifts can be negative! 
+ if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1) ; + local_code_chip_index_ = local_code_chip_index_ % code_length_chips; + _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; + } + } +} + +#endif + + +#ifdef LV_HAVE_AVX +#include +static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_u_avx(float** result, const float* local_code, float rem_code_phase_chips, float code_phase_step_chips, float* shifts_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_points) +{ + float** _result = result; + const unsigned int avx_iters = num_points / 8; + int current_correlator_tap; + unsigned int n; + unsigned int k; + const __m256 eights = _mm256_set1_ps(8.0f); + const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips); + const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips); + + __VOLK_ATTR_ALIGNED(32) int local_code_chip_index[8]; + int local_code_chip_index_; + + const __m256 zeros = _mm256_setzero_ps(); + const __m256 code_length_chips_reg_f = _mm256_set1_ps((float)code_length_chips); + const __m256 n0 = _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f); + + __m256i local_code_chip_index_reg, i; + __m256 aux, aux2, aux3, shifts_chips_reg, c, cTrunc, base, negatives, indexn; + + for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++) + { + shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]); + aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg); + indexn = n0; + for(n = 0; n < avx_iters; n++) + { + __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0); + __VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3); + aux = _mm256_mul_ps(code_phase_step_chips_reg, indexn); + aux = _mm256_add_ps(aux, aux2); + // floor + aux = _mm256_floor_ps(aux); + + // fmod + c = _mm256_div_ps(aux, code_length_chips_reg_f); + i = _mm256_cvttps_epi32(c); + cTrunc = _mm256_cvtepi32_ps(i); + base = _mm256_mul_ps(cTrunc, code_length_chips_reg_f); + local_code_chip_index_reg = _mm256_cvttps_epi32(_mm256_sub_ps(aux, base)); + + // no negatives + c = _mm256_cvtepi32_ps(local_code_chip_index_reg); + negatives = _mm256_cmp_ps(c, zeros, 0x01 ); + aux3 = _mm256_and_ps(code_length_chips_reg_f, negatives); + aux = _mm256_add_ps(c, aux3); + local_code_chip_index_reg = _mm256_cvttps_epi32(aux); + + _mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg); + for(k = 0; k < 8; ++k) + { + _result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]]; + } + indexn = _mm256_add_ps(indexn, eights); + } + } + _mm256_zeroupper(); + for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++) + { + for(n = avx_iters * 8; n < num_points; n++) + { + // resample code for current tap + local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); + //Take into account that in multitap correlators, the shifts can be negative! 
+ if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1) ; + local_code_chip_index_ = local_code_chip_index_ % code_length_chips; + _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; + } + } +} + +#endif + + +#ifdef LV_HAVE_NEON +#include + +static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_neon(float** result, const float* local_code, float rem_code_phase_chips, float code_phase_step_chips, float* shifts_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_points) +{ + float** _result = result; + const unsigned int neon_iters = num_points / 4; + int current_correlator_tap; + unsigned int n; + unsigned int k; + const int32x4_t ones = vdupq_n_s32(1); + const float32x4_t fours = vdupq_n_f32(4.0f); + const float32x4_t rem_code_phase_chips_reg = vdupq_n_f32(rem_code_phase_chips); + const float32x4_t code_phase_step_chips_reg = vdupq_n_f32(code_phase_step_chips); + + __VOLK_ATTR_ALIGNED(16) int32_t local_code_chip_index[4]; + int32_t local_code_chip_index_; + + const int32x4_t zeros = vdupq_n_s32(0); + const float32x4_t code_length_chips_reg_f = vdupq_n_f32((float)code_length_chips); + const int32x4_t code_length_chips_reg_i = vdupq_n_s32((int32_t)code_length_chips); + int32x4_t local_code_chip_index_reg, aux_i, negatives, i; + float32x4_t aux, aux2, shifts_chips_reg, fi, c, j, cTrunc, base, indexn, reciprocal; + __VOLK_ATTR_ALIGNED(16) const float vec[4] = { 0.0f, 1.0f, 2.0f, 3.0f }; + uint32x4_t igx; + reciprocal = vrecpeq_f32(code_length_chips_reg_f); + reciprocal = vmulq_f32(vrecpsq_f32(code_length_chips_reg_f, reciprocal), reciprocal); + reciprocal = vmulq_f32(vrecpsq_f32(code_length_chips_reg_f, reciprocal), reciprocal); // this refinement is required! 
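+    // Each vrecpsq_f32/vmulq_f32 pair above performs one Newton-Raphson step on the initial
+    // vrecpeq_f32 estimate (only ~8 bits accurate); two steps are needed so that the
+    // multiply-based fmod emulation below selects the correct chip index.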
+ float32x4_t n0 = vld1q_f32((float*)vec); + + for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++) + { + shifts_chips_reg = vdupq_n_f32((float)shifts_chips[current_correlator_tap]); + aux2 = vsubq_f32(shifts_chips_reg, rem_code_phase_chips_reg); + indexn = n0; + for(n = 0; n < neon_iters; n++) + { + __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][4 * n + 3], 1, 0); + __VOLK_GNSSSDR_PREFETCH(&local_code_chip_index[4]); + aux = vmulq_f32(code_phase_step_chips_reg, indexn); + aux = vaddq_f32(aux, aux2); + + //floor + i = vcvtq_s32_f32(aux); + fi = vcvtq_f32_s32(i); + igx = vcgtq_f32(fi, aux); + j = vcvtq_f32_s32(vandq_s32(vreinterpretq_s32_u32(igx), ones)); + aux = vsubq_f32(fi, j); + + // fmod + c = vmulq_f32(aux, reciprocal); + i = vcvtq_s32_f32(c); + cTrunc = vcvtq_f32_s32(i); + base = vmulq_f32(cTrunc, code_length_chips_reg_f); + aux = vsubq_f32(aux, base); + local_code_chip_index_reg = vcvtq_s32_f32(aux); + + negatives = vreinterpretq_s32_u32(vcltq_s32(local_code_chip_index_reg, zeros)); + aux_i = vandq_s32(code_length_chips_reg_i, negatives); + local_code_chip_index_reg = vaddq_s32(local_code_chip_index_reg, aux_i); + + vst1q_s32((int32_t*)local_code_chip_index, local_code_chip_index_reg); + + for(k = 0; k < 4; ++k) + { + _result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]]; + } + indexn = vaddq_f32(indexn, fours); + } + for(n = neon_iters * 4; n < num_points; n++) + { + __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][n], 1, 0); + // resample code for current tap + local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); + //Take into account that in multitap correlators, the shifts can be negative! + if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); + local_code_chip_index_ = local_code_chip_index_ % code_length_chips; + _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; + } + } +} + +#endif + +#endif /*INCLUDED_volk_gnsssdr_32f_xn_resampler_32f_xn_H*/ + + diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn.h new file mode 100644 index 000000000..544a83990 --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn.h @@ -0,0 +1,320 @@ +/*! + * \file volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn.h + * \brief VOLK_GNSSSDR kernel: multiplies N complex (32-bit float per component) vectors + * by a common vector, phase rotated and accumulates the results in N float complex outputs. + * \authors
<ul>
+ *          <li> Cillian O'Driscoll, 2016. cillian.odriscoll(at)gmail.com
+ *          </ul>
+ * + * VOLK_GNSSSDR kernel that multiplies N 32 bits complex vectors by a common vector, which is + * phase-rotated by phase offset and phase increment, and accumulates the results + * in N 32 bits float complex outputs. + * It is optimized to perform the N tap correlation process in GNSS receivers. + * + * ------------------------------------------------------------------------- + * + * Copyright (C) 2010-2016 (see AUTHORS file for a list of contributors) + * + * GNSS-SDR is a software defined Global Navigation + * Satellite Systems receiver + * + * This file is part of GNSS-SDR. + * + * GNSS-SDR is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * GNSS-SDR is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNSS-SDR. If not, see . + * + * ------------------------------------------------------------------------- + */ + +/*! + * \page volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn + * + * \b Overview + * + * Rotates and multiplies the reference complex vector with an arbitrary number of other real vectors, + * accumulates the results and stores them in the output vector. + * The rotation is done at a fixed rate per sample, from an initial \p phase offset. + * This function can be used for Doppler wipe-off and multiple correlator. + * + * Dispatcher Prototype + * \code + * void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn(lv_32fc_t* result, const lv_32fc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const float** in_a, int num_a_vectors, unsigned int num_points); + * \endcode + * + * \b Inputs + * \li in_common: Pointer to one of the vectors to be rotated, multiplied and accumulated (reference vector). + * \li phase_inc: Phase increment = lv_cmake(cos(phase_step_rad), sin(phase_step_rad)) + * \li phase: Initial phase = lv_cmake(cos(initial_phase_rad), sin(initial_phase_rad)) + * \li in_a: Pointer to an array of pointers to multiple vectors to be multiplied and accumulated. + * \li num_a_vectors: Number of vectors to be multiplied by the reference vector and accumulated. + * \li num_points: Number of complex values to be multiplied together, accumulated and stored into \p result. + * + * \b Outputs + * \li phase: Final phase. + * \li result: Vector of \p num_a_vectors components with the multiple vectors of \p in_a rotated, multiplied by \p in_common and accumulated. 
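+ *
+ * \b Example
+ * A minimal usage sketch; the buffer names (baseband, early, prompt, late) and the phase
+ * values are placeholders, not part of the API:
+ * \code
+ * lv_32fc_t out[3];
+ * lv_32fc_t phase = lv_cmake(cosf(initial_phase_rad), sinf(initial_phase_rad));
+ * const lv_32fc_t phase_inc = lv_cmake(cosf(phase_step_rad), sinf(phase_step_rad));
+ * const float* codes[3] = { early, prompt, late };
+ * volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn(out, baseband, phase_inc, &phase, codes, 3, num_points);
+ * // out[k] accumulates baseband[n] * rotator(n) * codes[k][n] over n; phase returns the final rotator value.
+ * \endcode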
+ * + */ + +#ifndef INCLUDED_volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_H +#define INCLUDED_volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_H + + +#include +#include +#include +#include +#include +#include + +#ifdef LV_HAVE_GENERIC + +static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic(lv_32fc_t* result, const lv_32fc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const float** in_a, int num_a_vectors, unsigned int num_points) +{ + lv_32fc_t tmp32_1, tmp32_2; + int n_vec; + unsigned int n; + for (n_vec = 0; n_vec < num_a_vectors; n_vec++) + { + result[n_vec] = lv_cmake(0,0); + } + for (n = 0; n < num_points; n++) + { + tmp32_1 = *in_common++ * (*phase);//if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); + + // Regenerate phase + if (n % 256 == 0) + { + //printf("Phase before regeneration %i: %f,%f Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase)); +#ifdef __cplusplus + (*phase) /= std::abs((*phase)); +#else + (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); +#endif + //printf("Phase after regeneration %i: %f,%f Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase)); + } + + (*phase) *= phase_inc; + for (n_vec = 0; n_vec < num_a_vectors; n_vec++) + { + tmp32_2 = tmp32_1 * in_a[n_vec][n]; + result[n_vec] += tmp32_2; + } + } +} + +#endif /*LV_HAVE_GENERIC*/ + + +#ifdef LV_HAVE_GENERIC + +static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic_reload(lv_32fc_t* result, const lv_32fc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const float** in_a, int num_a_vectors, unsigned int num_points) +{ + lv_32fc_t tmp32_1, tmp32_2; + const unsigned int ROTATOR_RELOAD = 256; + int n_vec; + unsigned int n; + unsigned int j; + for (n_vec = 0; n_vec < num_a_vectors; n_vec++) + { + result[n_vec] = lv_cmake(0,0); + } + + for (n = 0; n < num_points / ROTATOR_RELOAD; n++) + { + for (j = 0; j < ROTATOR_RELOAD; j++) + { + tmp32_1 = *in_common++ * (*phase); + (*phase) *= phase_inc; + for (n_vec = 0; n_vec < num_a_vectors; n_vec++) + { + tmp32_2 = tmp32_1 * in_a[n_vec][n * ROTATOR_RELOAD + j]; + result[n_vec] += tmp32_2; + } + } + /* Regenerate phase */ +#ifdef __cplusplus + (*phase) /= std::abs((*phase)); +#else + //(*phase) /= cabsf((*phase)); + (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); +#endif + } + + for (j = 0; j < num_points % ROTATOR_RELOAD; j++) + { + tmp32_1 = *in_common++ * (*phase); + (*phase) *= phase_inc; + for (n_vec = 0; n_vec < num_a_vectors; n_vec++) + { + tmp32_2 = tmp32_1 * in_a[n_vec][(num_points / ROTATOR_RELOAD) * ROTATOR_RELOAD + j]; + result[n_vec] += tmp32_2; + } + } +} + +#endif /*LV_HAVE_GENERIC*/ + +#ifdef LV_HAVE_AVX +#include +#include +static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_t* result, const lv_32fc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const float** in_a, int num_a_vectors, unsigned int num_points) +{ + unsigned int number = 0; + unsigned int vec_ind = 0; + unsigned int i = 0; + const unsigned int sixteenthPoints = num_points / 16; + + const float* aPtr = (float*)in_common; + const float* bPtr[ num_a_vectors]; + for( vec_ind = 0; vec_ind < num_a_vectors; ++vec_ind ){ + bPtr[vec_ind] = in_a[vec_ind]; + } + + lv_32fc_t _phase = (*phase); + lv_32fc_t wo; + + __m256 a0Val, a1Val, a2Val, a3Val; + __m256 b0Val[num_a_vectors], b1Val[num_a_vectors], b2Val[num_a_vectors], b3Val[num_a_vectors]; + __m256 x0Val[num_a_vectors], x1Val[num_a_vectors], x0loVal[num_a_vectors], 
x0hiVal[num_a_vectors], x1loVal[num_a_vectors], x1hiVal[num_a_vectors]; + __m256 c0Val[num_a_vectors], c1Val[num_a_vectors], c2Val[num_a_vectors], c3Val[num_a_vectors]; + + __m256 dotProdVal0[num_a_vectors]; + __m256 dotProdVal1[num_a_vectors]; + __m256 dotProdVal2[num_a_vectors]; + __m256 dotProdVal3[num_a_vectors]; + + for( vec_ind = 0; vec_ind < num_a_vectors; vec_ind++ ){ + dotProdVal0[vec_ind] = _mm256_setzero_ps(); + dotProdVal1[vec_ind] = _mm256_setzero_ps(); + dotProdVal2[vec_ind] = _mm256_setzero_ps(); + dotProdVal3[vec_ind] = _mm256_setzero_ps(); + } + + // Set up the complex rotator + __m256 z0, z1, z2, z3; + __attribute__((aligned(32))) lv_32fc_t phase_vec[16]; + for( vec_ind = 0; vec_ind < 16; ++vec_ind ){ + phase_vec[vec_ind] = _phase; + _phase *= phase_inc; + } + + z0 = _mm256_load_ps( (float *)phase_vec ); + z1 = _mm256_load_ps( (float *)(phase_vec + 4) ); + z2 = _mm256_load_ps( (float *)(phase_vec + 8) ); + z3 = _mm256_load_ps( (float *)(phase_vec + 12) ); + + lv_32fc_t dz = phase_inc; dz *= dz; dz *= dz; dz *= dz; dz *= dz; // dz = phase_inc^16; + + for( vec_ind = 0; vec_ind < 4; ++vec_ind ){ + phase_vec[vec_ind] = dz; + } + + __m256 dz_reg = _mm256_load_ps( (float *)phase_vec ); + dz_reg = _mm256_complexnormalise_ps( dz_reg ); + + for(;number < sixteenthPoints; number++){ + + a0Val = _mm256_loadu_ps(aPtr); + a1Val = _mm256_loadu_ps(aPtr+8); + a2Val = _mm256_loadu_ps(aPtr+16); + a3Val = _mm256_loadu_ps(aPtr+24); + + a0Val = _mm256_complexmul_ps( a0Val, z0 ); + a1Val = _mm256_complexmul_ps( a1Val, z1 ); + a2Val = _mm256_complexmul_ps( a2Val, z2 ); + a3Val = _mm256_complexmul_ps( a3Val, z3 ); + + z0 = _mm256_complexmul_ps( z0, dz_reg ); + z1 = _mm256_complexmul_ps( z1, dz_reg ); + z2 = _mm256_complexmul_ps( z2, dz_reg ); + z3 = _mm256_complexmul_ps( z3, dz_reg ); + + + for( vec_ind = 0; vec_ind < num_a_vectors; ++vec_ind ){ + x0Val[vec_ind] = _mm256_loadu_ps(bPtr[vec_ind]); // t0|t1|t2|t3|t4|t5|t6|t7 + x1Val[vec_ind] = _mm256_loadu_ps(bPtr[vec_ind]+8); + x0loVal[vec_ind] = _mm256_unpacklo_ps(x0Val[vec_ind], x0Val[vec_ind]); // t0|t0|t1|t1|t4|t4|t5|t5 + x0hiVal[vec_ind] = _mm256_unpackhi_ps(x0Val[vec_ind], x0Val[vec_ind]); // t2|t2|t3|t3|t6|t6|t7|t7 + x1loVal[vec_ind] = _mm256_unpacklo_ps(x1Val[vec_ind], x1Val[vec_ind]); + x1hiVal[vec_ind] = _mm256_unpackhi_ps(x1Val[vec_ind], x1Val[vec_ind]); + + // TODO: it may be possible to rearrange swizzling to better pipeline data + b0Val[vec_ind] = _mm256_permute2f128_ps(x0loVal[vec_ind], x0hiVal[vec_ind], 0x20); // t0|t0|t1|t1|t2|t2|t3|t3 + b1Val[vec_ind] = _mm256_permute2f128_ps(x0loVal[vec_ind], x0hiVal[vec_ind], 0x31); // t4|t4|t5|t5|t6|t6|t7|t7 + b2Val[vec_ind] = _mm256_permute2f128_ps(x1loVal[vec_ind], x1hiVal[vec_ind], 0x20); + b3Val[vec_ind] = _mm256_permute2f128_ps(x1loVal[vec_ind], x1hiVal[vec_ind], 0x31); + + c0Val[vec_ind] = _mm256_mul_ps(a0Val, b0Val[vec_ind]); + c1Val[vec_ind] = _mm256_mul_ps(a1Val, b1Val[vec_ind]); + c2Val[vec_ind] = _mm256_mul_ps(a2Val, b2Val[vec_ind]); + c3Val[vec_ind] = _mm256_mul_ps(a3Val, b3Val[vec_ind]); + + dotProdVal0[vec_ind] = _mm256_add_ps(c0Val[vec_ind], dotProdVal0[vec_ind]); + dotProdVal1[vec_ind] = _mm256_add_ps(c1Val[vec_ind], dotProdVal1[vec_ind]); + dotProdVal2[vec_ind] = _mm256_add_ps(c2Val[vec_ind], dotProdVal2[vec_ind]); + dotProdVal3[vec_ind] = _mm256_add_ps(c3Val[vec_ind], dotProdVal3[vec_ind]); + + bPtr[vec_ind] += 16; + } + + // Force the rotators back onto the unit circle + if ((number % 64) == 0) + { + z0 = _mm256_complexnormalise_ps( z0 ); + z1 = _mm256_complexnormalise_ps( 
z1 ); + z2 = _mm256_complexnormalise_ps( z2 ); + z3 = _mm256_complexnormalise_ps( z3 ); + } + + aPtr += 32; + } + __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4]; + + for( vec_ind = 0; vec_ind < num_a_vectors; ++vec_ind ){ + dotProdVal0[vec_ind] = _mm256_add_ps(dotProdVal0[vec_ind], dotProdVal1[vec_ind]); + dotProdVal0[vec_ind] = _mm256_add_ps(dotProdVal0[vec_ind], dotProdVal2[vec_ind]); + dotProdVal0[vec_ind] = _mm256_add_ps(dotProdVal0[vec_ind], dotProdVal3[vec_ind]); + + _mm256_store_ps((float *)dotProductVector,dotProdVal0[vec_ind]); // Store the results back into the dot product vector + + result[ vec_ind ] = lv_cmake( 0, 0 ); + for( i = 0; i < 4; ++i ){ + result[vec_ind] += dotProductVector[i]; + } + } + + z0 = _mm256_complexnormalise_ps( z0 ); + _mm256_store_ps((float*)phase_vec, z0); + _phase = phase_vec[0]; + _mm256_zeroupper(); + + + number = sixteenthPoints*16; + for(;number < num_points; number++){ + wo = (*aPtr++)*_phase; + _phase *= phase_inc; + + for( vec_ind = 0; vec_ind < num_a_vectors; ++vec_ind ){ + result[vec_ind] += wo * in_a[vec_ind][number]; + } + } + + *phase = _phase; + +} + +#endif /* LV_HAVE_AVX */ + +#endif /* INCLUDED_volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_H */ + + diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc.h new file mode 100644 index 000000000..a0284f65d --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc.h @@ -0,0 +1,132 @@ +/*! + * \file volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc.h + * \brief Volk puppet for the multiple 16-bit complex dot product kernel. + * \authors
+ *          Carles Fernandez Prades, 2016. cfernandez(at)cttc.cat
+ *
+ * + * Volk puppet for integrating the resampler into volk's test system + * + * ------------------------------------------------------------------------- + * + * Copyright (C) 2010-2015 (see AUTHORS file for a list of contributors) + * + * GNSS-SDR is a software defined Global Navigation + * Satellite Systems receiver + * + * This file is part of GNSS-SDR. + * + * GNSS-SDR is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * GNSS-SDR is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNSS-SDR. If not, see . + * + * ------------------------------------------------------------------------- + */ + +#ifndef INCLUDED_volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_H +#define INCLUDED_volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_H + +#include "volk_gnsssdr/volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn.h" +#include +#include +#include + +#ifdef LV_HAVE_GENERIC + +static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_generic(lv_32fc_t* result, const lv_32fc_t* local_code, const float* in, unsigned int num_points) +{ + // phases must be normalized. Phase rotator expects a complex exponential input! + float rem_carrier_phase_in_rad = 0.25; + float phase_step_rad = 0.1; + lv_32fc_t phase[1]; + phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad)); + lv_32fc_t phase_inc[1]; + phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad)); + unsigned int n; + int num_a_vectors = 3; + float ** in_a = (float **)volk_gnsssdr_malloc(sizeof(float *) * num_a_vectors, volk_gnsssdr_get_alignment()); + for(n = 0; n < num_a_vectors; n++) + { + in_a[n] = (float *)volk_gnsssdr_malloc(sizeof(float ) * num_points, volk_gnsssdr_get_alignment()); + memcpy((float*)in_a[n], (float*)in, sizeof(float) * num_points); + } + volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic(result, local_code, phase_inc[0], phase, (const float**) in_a, num_a_vectors, num_points); + + for(n = 0; n < num_a_vectors; n++) + { + volk_gnsssdr_free(in_a[n]); + } + volk_gnsssdr_free(in_a); +} +#endif // Generic + + +#ifdef LV_HAVE_GENERIC +static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_generic_reload(lv_32fc_t* result, const lv_32fc_t* local_code, const float* in, unsigned int num_points) +{ + // phases must be normalized. Phase rotator expects a complex exponential input! 
+ float rem_carrier_phase_in_rad = 0.25; + float phase_step_rad = 0.1; + lv_32fc_t phase[1]; + phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad)); + lv_32fc_t phase_inc[1]; + phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad)); + unsigned int n; + int num_a_vectors = 3; + float ** in_a = (float **)volk_gnsssdr_malloc(sizeof(float *) * num_a_vectors, volk_gnsssdr_get_alignment()); + for(n = 0; n < num_a_vectors; n++) + { + in_a[n] = (float *)volk_gnsssdr_malloc(sizeof(float ) * num_points, volk_gnsssdr_get_alignment()); + memcpy((float*)in_a[n], (float*)in, sizeof(float) * num_points); + } + volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic_reload(result, local_code, phase_inc[0], phase, (const float**) in_a, num_a_vectors, num_points); + + for(n = 0; n < num_a_vectors; n++) + { + volk_gnsssdr_free(in_a[n]); + } + volk_gnsssdr_free(in_a); +} + +#endif // Generic + +#ifdef LV_HAVE_AVX +static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_u_avx(lv_32fc_t* result, const lv_32fc_t* local_code, const float* in, unsigned int num_points) +{ + // phases must be normalized. Phase rotator expects a complex exponential input! + float rem_carrier_phase_in_rad = 0.25; + float phase_step_rad = 0.1; + lv_32fc_t phase[1]; + phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad)); + lv_32fc_t phase_inc[1]; + phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad)); + unsigned int n; + int num_a_vectors = 3; + float ** in_a = (float **)volk_gnsssdr_malloc(sizeof(float *) * num_a_vectors, volk_gnsssdr_get_alignment()); + for(n = 0; n < num_a_vectors; n++) + { + in_a[n] = (float *)volk_gnsssdr_malloc(sizeof(float ) * num_points, volk_gnsssdr_get_alignment()); + memcpy((float*)in_a[n], (float*)in, sizeof(float) * num_points); + } + volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_u_avx(result, local_code, phase_inc[0], phase, (const float**) in_a, num_a_vectors, num_points); + + for(n = 0; n < num_a_vectors; n++) + { + volk_gnsssdr_free(in_a[n]); + } + volk_gnsssdr_free(in_a); +} + +#endif // AVX + +#endif // INCLUDED_volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_H + diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_resamplerxnpuppet_32fc.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_resamplerxnpuppet_32fc.h index f89a1461b..9348c09fc 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_resamplerxnpuppet_32fc.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_resamplerxnpuppet_32fc.h @@ -46,8 +46,8 @@ #ifdef LV_HAVE_GENERIC static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_generic(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points) { - float code_phase_step_chips = -0.6; - int code_length_chips = 1023; + int code_length_chips = 2046; + float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; @@ -70,14 +70,15 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_generic(lv_32fc_t* r volk_gnsssdr_free(result_aux); } + #endif /* LV_HAVE_GENERIC */ #ifdef LV_HAVE_SSE3 static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_a_sse3(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points) { - float code_phase_step_chips = -0.6; - int 
code_length_chips = 1023; + int code_length_chips = 2046; + float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; @@ -105,8 +106,8 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_a_sse3(lv_32fc_t* re #ifdef LV_HAVE_SSE3 static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_u_sse3(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points) { - float code_phase_step_chips = -0.6; - int code_length_chips = 1023; + int code_length_chips = 2046; + float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; @@ -135,8 +136,8 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_u_sse3(lv_32fc_t* re #ifdef LV_HAVE_SSE4_1 static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_u_sse4_1(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points) { - float code_phase_step_chips = -0.6; - int code_length_chips = 1023; + int code_length_chips = 2046; + float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; @@ -164,8 +165,8 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_u_sse4_1(lv_32fc_t* #ifdef LV_HAVE_SSE4_1 static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_a_sse4_1(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points) { - float code_phase_step_chips = -0.6; - int code_length_chips = 1023; + int code_length_chips = 2046; + float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; @@ -193,8 +194,8 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_a_sse4_1(lv_32fc_t* #ifdef LV_HAVE_AVX static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_a_avx(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points) { - float code_phase_step_chips = -0.6; - int code_length_chips = 1023; + int code_length_chips = 2046; + float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; @@ -222,8 +223,8 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_a_avx(lv_32fc_t* res #ifdef LV_HAVE_AVX static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_u_avx(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points) { - float code_phase_step_chips = -0.6; - int code_length_chips = 1023; + int code_length_chips = 2046; + float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; @@ -251,8 +252,8 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_u_avx(lv_32fc_t* res #ifdef LV_HAVE_AVX2 static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_a_avx2(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points) { - float code_phase_step_chips = -0.6; - int code_length_chips = 1023; + int code_length_chips = 2046; + float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; @@ -280,8 +281,8 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_a_avx2(lv_32fc_t* re #ifdef 
LV_HAVE_AVX2 static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_u_avx2(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points) { - float code_phase_step_chips = -0.6; - int code_length_chips = 1023; + int code_length_chips = 2046; + float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; @@ -309,8 +310,8 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_u_avx2(lv_32fc_t* re #ifdef LV_HAVE_NEON static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_neon(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points) { - float code_phase_step_chips = -0.6; - int code_length_chips = 1023; + int code_length_chips = 2046; + float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/kernel_tests.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/kernel_tests.h index de3284a06..da0b2f0f8 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/kernel_tests.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/kernel_tests.h @@ -89,10 +89,14 @@ std::vector init_test_list(volk_gnsssdr_test_params_t (VOLK_INIT_PUPP(volk_gnsssdr_16ic_resamplerfastpuppet_16ic, volk_gnsssdr_16ic_resampler_fast_16ic, test_params)) (VOLK_INIT_PUPP(volk_gnsssdr_16ic_resamplerfastxnpuppet_16ic, volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn, test_params)) (VOLK_INIT_PUPP(volk_gnsssdr_16ic_resamplerxnpuppet_16ic, volk_gnsssdr_16ic_xn_resampler_16ic_xn, test_params)) + (VOLK_INIT_PUPP(volk_gnsssdr_16i_resamplerxnpuppet_16i, volk_gnsssdr_16i_xn_resampler_16i_xn, test_params)) (VOLK_INIT_PUPP(volk_gnsssdr_32fc_resamplerxnpuppet_32fc, volk_gnsssdr_32fc_xn_resampler_32fc_xn, test_params)) + (VOLK_INIT_PUPP(volk_gnsssdr_32f_resamplerxnpuppet_32f, volk_gnsssdr_32f_xn_resampler_32f_xn, test_params)) (VOLK_INIT_PUPP(volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic, volk_gnsssdr_16ic_x2_dot_prod_16ic_xn, test_params)) (VOLK_INIT_PUPP(volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic, volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn, test_params_int16)) + (VOLK_INIT_PUPP(volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic, volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn, test_params_int16)) (VOLK_INIT_PUPP(volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc, volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn, test_params_int1)) + (VOLK_INIT_PUPP(volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc, volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn, test_params_int1)) ; return test_cases; diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/qa_utils.cc b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/qa_utils.cc index fc0518a65..8019b3a8a 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/qa_utils.cc +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/qa_utils.cc @@ -717,7 +717,7 @@ bool run_volk_gnsssdr_tests(volk_gnsssdr_func_desc_t desc, { if(both_sigs[j].is_signed) { - fail = icompare((int8_t *) test_data[generic_offset][j], (int8_t *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol_i); + fail = icompare((int16_t *) test_data[generic_offset][j], (int16_t *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 
2 : 1), tol_i); } else { From e87522880e290652c3a53e29a1add2670a9eab92 Mon Sep 17 00:00:00 2001 From: Cillian O'Driscoll Date: Mon, 11 Sep 2017 15:21:05 +0100 Subject: [PATCH 2/4] Added ability to generate real valued codes Only done for GPS L1 C/A and Galileo E1 OS for now. Also added a cpu_multicorrelator_real_codes class that performs code correlation using real-valued local codes --- .../libs/galileo_e1_signal_processing.cc | 83 ++++++--- .../libs/galileo_e1_signal_processing.h | 28 +-- src/algorithms/libs/gnss_signal_processing.cc | 22 +++ src/algorithms/libs/gnss_signal_processing.h | 7 + .../libs/gps_sdr_signal_processing.cc | 33 +++- .../libs/gps_sdr_signal_processing.h | 6 + src/algorithms/tracking/libs/CMakeLists.txt | 1 + .../libs/cpu_multicorrelator_real_codes.cc | 147 +++++++++++++++ .../libs/cpu_multicorrelator_real_codes.h | 70 +++++++ .../cpu_multicorrelator_real_codes_test.cc | 174 ++++++++++++++++++ .../tracking/cpu_multicorrelator_test.cc | 3 + 11 files changed, 530 insertions(+), 44 deletions(-) create mode 100644 src/algorithms/tracking/libs/cpu_multicorrelator_real_codes.cc create mode 100644 src/algorithms/tracking/libs/cpu_multicorrelator_real_codes.h create mode 100644 src/tests/unit-tests/signal-processing-blocks/tracking/cpu_multicorrelator_real_codes_test.cc diff --git a/src/algorithms/libs/galileo_e1_signal_processing.cc b/src/algorithms/libs/galileo_e1_signal_processing.cc index 93fef2f50..46738faad 100644 --- a/src/algorithms/libs/galileo_e1_signal_processing.cc +++ b/src/algorithms/libs/galileo_e1_signal_processing.cc @@ -73,7 +73,7 @@ void galileo_e1_code_gen_int(int* _dest, char _Signal[3], signed int _prn) -void galileo_e1_sinboc_11_gen(std::complex* _dest, int* _prn, unsigned int _length_out) +void galileo_e1_sinboc_11_gen_int(int* _dest, int* _prn, unsigned int _length_out) { const unsigned int _length_in = Galileo_E1_B_CODE_LENGTH_CHIPS; unsigned int _period = static_cast( _length_out / _length_in ); @@ -81,18 +81,18 @@ void galileo_e1_sinboc_11_gen(std::complex* _dest, int* _prn, unsigned in { for (unsigned int j = 0; j < (_period / 2); j++) { - _dest[i * _period + j] = std::complex(static_cast(_prn[i]), 0.0); + _dest[i * _period + j] = _prn[i]; } for (unsigned int j = (_period / 2); j < _period; j++) { - _dest[i * _period + j] = std::complex(static_cast(- _prn[i]), 0.0); + _dest[i * _period + j] = - _prn[i]; } } } -void galileo_e1_sinboc_61_gen(std::complex* _dest, int* _prn, unsigned int _length_out) +void galileo_e1_sinboc_61_gen_int(int* _dest, int* _prn, unsigned int _length_out) { const unsigned int _length_in = Galileo_E1_B_CODE_LENGTH_CHIPS; unsigned int _period = static_cast(_length_out / _length_in); @@ -101,42 +101,44 @@ void galileo_e1_sinboc_61_gen(std::complex* _dest, int* _prn, unsigned in { for (unsigned int j = 0; j < _period; j += 2) { - _dest[i * _period + j] = std::complex(static_cast(_prn[i]), 0.0); + _dest[i * _period + j] = _prn[i]; } for (unsigned int j = 1; j < _period; j += 2) { - _dest[i * _period + j] = std::complex(static_cast(- _prn[i]), 0.0); + _dest[i * _period + j] = - _prn[i]; } } } -void galileo_e1_gen(std::complex* _dest, int* _prn, char _Signal[3]) +void galileo_e1_gen_float(float* _dest, int* _prn, char _Signal[3]) { std::string _galileo_signal = _Signal; const unsigned int _codeLength = 12 * Galileo_E1_B_CODE_LENGTH_CHIPS; const float alpha = sqrt(10.0 / 11.0); const float beta = sqrt(1.0 / 11.0); - std::complex sinboc_11[12 * 4092]; // _codeLength not accepted by Clang - std::complex sinboc_61[12 * 4092]; + int 
sinboc_11[12 * 4092]; // _codeLength not accepted by Clang + int sinboc_61[12 * 4092]; - galileo_e1_sinboc_11_gen(sinboc_11, _prn, _codeLength); //generate sinboc(1,1) 12 samples per chip - galileo_e1_sinboc_61_gen(sinboc_61, _prn, _codeLength); //generate sinboc(6,1) 12 samples per chip + galileo_e1_sinboc_11_gen_int(sinboc_11, _prn, _codeLength); //generate sinboc(1,1) 12 samples per chip + galileo_e1_sinboc_61_gen_int(sinboc_61, _prn, _codeLength); //generate sinboc(6,1) 12 samples per chip if (_galileo_signal.rfind("1B") != std::string::npos && _galileo_signal.length() >= 2) { for (unsigned int i = 0; i < _codeLength; i++) { - _dest[i] = alpha * sinboc_11[i] + beta * sinboc_61[i]; + _dest[i] = alpha * static_cast(sinboc_11[i]) + + beta * static_cast(sinboc_61[i]); } } else if (_galileo_signal.rfind("1C") != std::string::npos && _galileo_signal.length() >= 2) { for (unsigned int i = 0; i < _codeLength; i++) { - _dest[i] = alpha * sinboc_11[i] - beta * sinboc_61[i]; + _dest[i] = alpha * static_cast(sinboc_11[i]) - + beta * static_cast(sinboc_61[i]); } } else @@ -144,8 +146,7 @@ void galileo_e1_gen(std::complex* _dest, int* _prn, char _Signal[3]) } - -void galileo_e1_code_gen_complex_sampled(std::complex* _dest, char _Signal[3], +void galileo_e1_code_gen_float_sampled(float* _dest, char _Signal[3], bool _cboc, unsigned int _prn, signed int _fs, unsigned int _chip_shift, bool _secondary_flag) { @@ -164,23 +165,29 @@ void galileo_e1_code_gen_complex_sampled(std::complex* _dest, char _Signa galileo_e1_code_gen_int(primary_code_E1_chips, _Signal, _prn); //generate Galileo E1 code, 1 sample per chip - std::complex* _signal_E1; + float* _signal_E1; _codeLength = _samplesPerChip * Galileo_E1_B_CODE_LENGTH_CHIPS; - _signal_E1 = new std::complex[_codeLength]; + _signal_E1 = new float[_codeLength]; if (_cboc == true) { - galileo_e1_gen(_signal_E1, primary_code_E1_chips, _Signal); //generate cboc 12 samples per chip + galileo_e1_gen_float(_signal_E1, primary_code_E1_chips, _Signal); //generate cboc 12 samples per chip } else { - galileo_e1_sinboc_11_gen(_signal_E1, primary_code_E1_chips, _codeLength); //generate sinboc(1,1) 2 samples per chip + int _signal_E1_int[_codeLength]; + galileo_e1_sinboc_11_gen_int(_signal_E1_int, primary_code_E1_chips, _codeLength); //generate sinboc(1,1) 2 samples per chip + + for( unsigned int ii = 0; ii < _codeLength; ++ii ) + { + _signal_E1[ii] = static_cast< float >( _signal_E1_int[ii] ); + } } if (_fs != _samplesPerChip * _codeFreqBasis) { - std::complex* _resampled_signal = new std::complex[_samplesPerCode]; + float* _resampled_signal = new float[_samplesPerCode]; resampler(_signal_E1, _resampled_signal, _samplesPerChip * _codeFreqBasis, _fs, _codeLength, _samplesPerCode); //resamples code to fs @@ -192,8 +199,8 @@ void galileo_e1_code_gen_complex_sampled(std::complex* _dest, char _Signa if (_galileo_signal.rfind("1C") != std::string::npos && _galileo_signal.length() >= 2 && _secondary_flag) { - std::complex* _signal_E1C_secondary = new std::complex - [static_cast(Galileo_E1_C_SECONDARY_CODE_LENGTH) + float* _signal_E1C_secondary = new float[ + static_cast(Galileo_E1_C_SECONDARY_CODE_LENGTH) * _samplesPerCode]; for (unsigned int i = 0; i < static_cast(Galileo_E1_C_SECONDARY_CODE_LENGTH); i++) @@ -202,7 +209,7 @@ void galileo_e1_code_gen_complex_sampled(std::complex* _dest, char _Signa { _signal_E1C_secondary[i*_samplesPerCode + k] = _signal_E1[k] * (Galileo_E1_C_SECONDARY_CODE.at(i) == '0' - ? std::complex(1,0) : std::complex(-1,0)); + ? 
1.0f : -1.0f); } } @@ -220,6 +227,36 @@ void galileo_e1_code_gen_complex_sampled(std::complex* _dest, char _Signa delete[] _signal_E1; } +void galileo_e1_code_gen_complex_sampled(std::complex* _dest, char _Signal[3], + bool _cboc, unsigned int _prn, signed int _fs, unsigned int _chip_shift, + bool _secondary_flag) +{ + std::string _galileo_signal = _Signal; + const int _codeFreqBasis = Galileo_E1_CODE_CHIP_RATE_HZ; //Hz + unsigned int _samplesPerCode = static_cast( + static_cast(_fs) / (static_cast(_codeFreqBasis ) + / static_cast(Galileo_E1_B_CODE_LENGTH_CHIPS))); + + if (_galileo_signal.rfind("1C") != std::string::npos && _galileo_signal.length() >= 2 && _secondary_flag) + { + _samplesPerCode *= static_cast(Galileo_E1_C_SECONDARY_CODE_LENGTH); + } + + float real_code[_samplesPerCode]; + + galileo_e1_code_gen_float_sampled( real_code, _Signal, _cboc, _prn, _fs, _chip_shift, _secondary_flag ); + + for( unsigned int ii = 0; ii < _samplesPerCode; ++ii ) + { + _dest[ii] = std::complex< float >( real_code[ii], 0.0f ); + } +} + +void galileo_e1_code_gen_float_sampled(float* _dest, char _Signal[3], + bool _cboc, unsigned int _prn, signed int _fs, unsigned int _chip_shift) +{ + galileo_e1_code_gen_float_sampled(_dest, _Signal, _cboc, _prn, _fs, _chip_shift, false); +} void galileo_e1_code_gen_complex_sampled(std::complex* _dest, char _Signal[3], bool _cboc, unsigned int _prn, signed int _fs, unsigned int _chip_shift) diff --git a/src/algorithms/libs/galileo_e1_signal_processing.h b/src/algorithms/libs/galileo_e1_signal_processing.h index d7518ab04..f7c670129 100644 --- a/src/algorithms/libs/galileo_e1_signal_processing.h +++ b/src/algorithms/libs/galileo_e1_signal_processing.h @@ -36,30 +36,22 @@ /*! - * \brief This function generates Galileo E1 code (one sample per chip). + * \brief This function generates Galileo E1 code (can select E1B or E1C, cboc or sinboc + * and the sample frequency _fs). * */ -void galileo_e1_code_gen_int(int* _dest, char _Signal[3], signed int _prn); +void galileo_e1_code_gen_float_sampled(float* _dest, char _Signal[3], + bool _cboc, unsigned int _prn, signed int _fs, unsigned int _chip_shift, + bool _secondary_flag); /*! - * \brief This function generates Galileo E1 sinboc(1,1) code (minimum 2 samples per chip), - * the _codeLength variable must be a multiple of 2*4092. + * \brief This function generates Galileo E1 code (can select E1B or E1C, cboc or sinboc + * and the sample frequency _fs). * */ -void galileo_e1_sinboc_11_gen(std::complex* _dest, int* _prn, - unsigned int _codeLength); -/*! - * \brief This function generates Galileo E1 sinboc(6,1) code (minimum 12 samples per chip), - * the _codeLength variable must be a multiple of 12*4092. - * - */ -void galileo_e1_sinboc_61_gen(std::complex* _dest, int* _prn, - unsigned int _codeLength); -/*! - * \brief This function generates Galileo E1 cboc code (12 samples per chip). - * - */ -void galileo_e1_cboc_gen(std::complex* _dest, int* _prn, char _Signal[3]); +void galileo_e1_code_gen_float_sampled(float* _dest, char _Signal[3], + bool _cboc, unsigned int _prn, signed int _fs, unsigned int _chip_shift); + /*! * \brief This function generates Galileo E1 code (can select E1B or E1C, cboc or sinboc * and the sample frequency _fs). 
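Editorial note (not part of the patch): the declarations above only document the new entry points, so a minimal usage sketch may help. It shows how a caller could obtain a real-valued E1B CBOC replica through the six-argument overload of galileo_e1_code_gen_float_sampled() declared above. The sampling frequency, PRN and helper name are illustrative assumptions; the 1.023 Mcps chip rate and 4092-chip code length match the Galileo_E1_CODE_CHIP_RATE_HZ and Galileo_E1_B_CODE_LENGTH_CHIPS constants used throughout this file.

#include <vector>
#include "galileo_e1_signal_processing.h"   // declares galileo_e1_code_gen_float_sampled()

// Hypothetical helper: real-valued Galileo E1B CBOC replica at 4.092 Msps, no chip shift.
void make_e1b_replica(std::vector<float>& replica)
{
    const signed int fs = 4092000;                      // sampling frequency [Hz] (example value)
    const double chip_rate = 1.023e6;                   // Galileo_E1_CODE_CHIP_RATE_HZ
    const double code_length = 4092.0;                  // Galileo_E1_B_CODE_LENGTH_CHIPS
    const unsigned int samples_per_code =
        static_cast<unsigned int>(static_cast<double>(fs) / (chip_rate / code_length));

    replica.resize(samples_per_code);
    char signal[3] = "1B";
    galileo_e1_code_gen_float_sampled(replica.data(), signal, true /* CBOC */,
                                      11 /* PRN, example */, fs, 0 /* chip shift */);
}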
diff --git a/src/algorithms/libs/gnss_signal_processing.cc b/src/algorithms/libs/gnss_signal_processing.cc index 86738f062..9666b1d0d 100644 --- a/src/algorithms/libs/gnss_signal_processing.cc +++ b/src/algorithms/libs/gnss_signal_processing.cc @@ -156,6 +156,28 @@ void hex_to_binary_converter(int * _dest, char _from) } } +void resampler(float* _from, float* _dest, float _fs_in, + float _fs_out, unsigned int _length_in, unsigned int _length_out) +{ + unsigned int _codeValueIndex; + float aux; + //--- Find time constants -------------------------------------------------- + const float _t_in = 1 / _fs_in; // Incoming sampling period in sec + const float _t_out = 1 / _fs_out; // Out sampling period in sec + for (unsigned int i = 0; i < _length_out - 1; i++) + { + //=== Digitizing ======================================================= + //--- compute index array to read sampled values ------------------------- + //_codeValueIndex = ceil((_t_out * ((float)i + 1)) / _t_in) - 1; + aux = (_t_out * (i + 1)) / _t_in; + _codeValueIndex = auxCeil2(aux) - 1; + + //if repeat the chip -> upsample by nearest neighborhood interpolation + _dest[i] = _from[_codeValueIndex]; + } + //--- Correct the last index (due to number rounding issues) ----------- + _dest[_length_out - 1] = _from[_length_in - 1]; +} void resampler(std::complex* _from, std::complex* _dest, float _fs_in, float _fs_out, unsigned int _length_in, unsigned int _length_out) diff --git a/src/algorithms/libs/gnss_signal_processing.h b/src/algorithms/libs/gnss_signal_processing.h index 2449dfaea..b75cb71c5 100644 --- a/src/algorithms/libs/gnss_signal_processing.h +++ b/src/algorithms/libs/gnss_signal_processing.h @@ -60,6 +60,13 @@ void complex_exp_gen_conj(std::complex* _dest, double _f, double _fs, */ void hex_to_binary_converter(int * _dest, char _from); +/*! + * \brief This function resamples a sequence of float values. + * + */ +void resampler(float* _from, float* _dest, + float _fs_in, float _fs_out, unsigned int _length_in, + unsigned int _length_out); /*! * \brief This function resamples a sequence of complex values. 
* diff --git a/src/algorithms/libs/gps_sdr_signal_processing.cc b/src/algorithms/libs/gps_sdr_signal_processing.cc index a88ea5119..168f8af97 100644 --- a/src/algorithms/libs/gps_sdr_signal_processing.cc +++ b/src/algorithms/libs/gps_sdr_signal_processing.cc @@ -34,7 +34,7 @@ auto auxCeil = [](float x){ return static_cast(static_cast((x)+1)); }; -void gps_l1_ca_code_gen_complex(std::complex* _dest, signed int _prn, unsigned int _chip_shift) +void gps_l1_ca_code_gen_int(int* _dest, signed int _prn, unsigned int _chip_shift) { const unsigned int _code_length = 1023; bool G1[_code_length]; @@ -102,17 +102,44 @@ void gps_l1_ca_code_gen_complex(std::complex* _dest, signed int _prn, uns aux = G1[(lcv + _chip_shift) % _code_length]^G2[delay]; if(aux == true) { - _dest[lcv] = std::complex(1, 0); + _dest[lcv] = 1; } else { - _dest[lcv] = std::complex(-1, 0); + _dest[lcv] = -1; } delay++; delay %= _code_length; } } +void gps_l1_ca_code_gen_float(float* _dest, signed int _prn, unsigned int _chip_shift) +{ + unsigned int _code_length = 1023; + int ca_code_int[ _code_length ]; + + gps_l1_ca_code_gen_int( ca_code_int, _prn, _chip_shift ); + + for( unsigned int ii = 0; ii < _code_length; ++ii ) + { + _dest[ii] = static_cast( ca_code_int[ii] ); + } + +} + +void gps_l1_ca_code_gen_complex(std::complex* _dest, signed int _prn, unsigned int _chip_shift) +{ + unsigned int _code_length = 1023; + int ca_code_int[ _code_length ]; + + gps_l1_ca_code_gen_int( ca_code_int, _prn, _chip_shift ); + + for( unsigned int ii = 0; ii < _code_length; ++ii ) + { + _dest[ii] = std::complex< float >( static_cast(ca_code_int[ii]), 0.0f ); + } + +} /* diff --git a/src/algorithms/libs/gps_sdr_signal_processing.h b/src/algorithms/libs/gps_sdr_signal_processing.h index 2f01b5647..bd41ca8a3 100644 --- a/src/algorithms/libs/gps_sdr_signal_processing.h +++ b/src/algorithms/libs/gps_sdr_signal_processing.h @@ -35,6 +35,12 @@ #include +//!Generates int GPS L1 C/A code for the desired SV ID and code shift +void gps_l1_ca_code_gen_int(int* _dest, signed int _prn, unsigned int _chip_shift); + +//!Generates float GPS L1 C/A code for the desired SV ID and code shift +void gps_l1_ca_code_gen_float(float* _dest, signed int _prn, unsigned int _chip_shift); + //!Generates complex GPS L1 C/A code for the desired SV ID and code shift, and sampled to specific sampling frequency void gps_l1_ca_code_gen_complex(std::complex* _dest, signed int _prn, unsigned int _chip_shift); diff --git a/src/algorithms/tracking/libs/CMakeLists.txt b/src/algorithms/tracking/libs/CMakeLists.txt index 88236920a..19387889e 100644 --- a/src/algorithms/tracking/libs/CMakeLists.txt +++ b/src/algorithms/tracking/libs/CMakeLists.txt @@ -33,6 +33,7 @@ endif(ENABLE_CUDA) set(TRACKING_LIB_SOURCES cpu_multicorrelator.cc + cpu_multicorrelator_real_codes.cc cpu_multicorrelator_16sc.cc lock_detectors.cc tcp_communication.cc diff --git a/src/algorithms/tracking/libs/cpu_multicorrelator_real_codes.cc b/src/algorithms/tracking/libs/cpu_multicorrelator_real_codes.cc new file mode 100644 index 000000000..cc0df391c --- /dev/null +++ b/src/algorithms/tracking/libs/cpu_multicorrelator_real_codes.cc @@ -0,0 +1,147 @@ +/*! + * \file cpu_multicorrelator_real_codes.cc + * \brief High optimized CPU vector multiTAP correlator class with real-valued local codes + * \authors
+ *          Javier Arribas, 2015. jarribas(at)cttc.es
+ *          Cillian O'Driscoll, 2017. cillian.odriscoll(at)gmail.com
+ *
+ * + * Class that implements a high optimized vector multiTAP correlator class for CPUs + * + * ------------------------------------------------------------------------- + * + * Copyright (C) 2010-2017 (see AUTHORS file for a list of contributors) + * + * GNSS-SDR is a software defined Global Navigation + * Satellite Systems receiver + * + * This file is part of GNSS-SDR. + * + * GNSS-SDR is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * GNSS-SDR is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNSS-SDR. If not, see . + * + * ------------------------------------------------------------------------- + */ + +#include "cpu_multicorrelator_real_codes.h" +#include +#include +#include + + +cpu_multicorrelator_real_codes::cpu_multicorrelator_real_codes() +{ + d_sig_in = nullptr; + d_local_code_in = nullptr; + d_shifts_chips = nullptr; + d_corr_out = nullptr; + d_local_codes_resampled = nullptr; + d_code_length_chips = 0; + d_n_correlators = 0; +} + + +cpu_multicorrelator_real_codes::~cpu_multicorrelator_real_codes() +{ + if(d_local_codes_resampled != nullptr) + { + cpu_multicorrelator_real_codes::free(); + } +} + + +bool cpu_multicorrelator_real_codes::init( + int max_signal_length_samples, + int n_correlators) +{ + // ALLOCATE MEMORY FOR INTERNAL vectors + size_t size = max_signal_length_samples * sizeof(float); + + d_local_codes_resampled = static_cast(volk_gnsssdr_malloc(n_correlators * sizeof(float*), volk_gnsssdr_get_alignment())); + for (int n = 0; n < n_correlators; n++) + { + d_local_codes_resampled[n] = static_cast(volk_gnsssdr_malloc(size, volk_gnsssdr_get_alignment())); + } + d_n_correlators = n_correlators; + return true; +} + + + +bool cpu_multicorrelator_real_codes::set_local_code_and_taps( + int code_length_chips, + const float* local_code_in, + float *shifts_chips) +{ + d_local_code_in = local_code_in; + d_shifts_chips = shifts_chips; + d_code_length_chips = code_length_chips; + return true; +} + + +bool cpu_multicorrelator_real_codes::set_input_output_vectors(std::complex* corr_out, const std::complex* sig_in) +{ + // Save CPU pointers + d_sig_in = sig_in; + d_corr_out = corr_out; + return true; +} + + +void cpu_multicorrelator_real_codes::update_local_code(int correlator_length_samples, float rem_code_phase_chips, float code_phase_step_chips) +{ + volk_gnsssdr_32f_xn_resampler_32f_xn(d_local_codes_resampled, + d_local_code_in, + rem_code_phase_chips, + code_phase_step_chips, + d_shifts_chips, + d_code_length_chips, + d_n_correlators, + correlator_length_samples); +} + + +bool cpu_multicorrelator_real_codes::Carrier_wipeoff_multicorrelator_resampler( + float rem_carrier_phase_in_rad, + float phase_step_rad, + float rem_code_phase_chips, + float code_phase_step_chips, + int signal_length_samples) +{ + update_local_code(signal_length_samples, rem_code_phase_chips, code_phase_step_chips); + // Regenerate phase at each call in order to avoid numerical issues + lv_32fc_t phase_offset_as_complex[1]; + phase_offset_as_complex[0] = lv_cmake(std::cos(rem_carrier_phase_in_rad), -std::sin(rem_carrier_phase_in_rad)); + // call 
VOLK_GNSSSDR kernel + volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn(d_corr_out, d_sig_in, std::exp(lv_32fc_t(0, - phase_step_rad)), phase_offset_as_complex, (const float**)d_local_codes_resampled, d_n_correlators, signal_length_samples); + return true; +} + + +bool cpu_multicorrelator_real_codes::free() +{ + // Free memory + if (d_local_codes_resampled != nullptr) + { + for (int n = 0; n < d_n_correlators; n++) + { + volk_gnsssdr_free(d_local_codes_resampled[n]); + } + volk_gnsssdr_free(d_local_codes_resampled); + d_local_codes_resampled = nullptr; + } + return true; +} + + diff --git a/src/algorithms/tracking/libs/cpu_multicorrelator_real_codes.h b/src/algorithms/tracking/libs/cpu_multicorrelator_real_codes.h new file mode 100644 index 000000000..922a12c7a --- /dev/null +++ b/src/algorithms/tracking/libs/cpu_multicorrelator_real_codes.h @@ -0,0 +1,70 @@ +/*! + * \file cpu_multicorrelator_real_codes.h + * \brief High optimized CPU vector multiTAP correlator class using real-valued local codes + * \authors
+ *          Javier Arribas, 2015. jarribas(at)cttc.es
+ *          Cillian O'Driscoll, 2017. cillian.odriscoll(at)gmail.com
+ *
+ * + * Class that implements a high optimized vector multiTAP correlator class for CPUs + * + * ------------------------------------------------------------------------- + * + * Copyright (C) 2010-2017 (see AUTHORS file for a list of contributors) + * + * GNSS-SDR is a software defined Global Navigation + * Satellite Systems receiver + * + * This file is part of GNSS-SDR. + * + * GNSS-SDR is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * GNSS-SDR is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNSS-SDR. If not, see . + * + * ------------------------------------------------------------------------- + */ + +#ifndef GNSS_SDR_CPU_MULTICORRELATOR_REAL_CODES_H_ +#define GNSS_SDR_CPU_MULTICORRELATOR_REAL_CODES_H_ + + +#include + +/*! + * \brief Class that implements carrier wipe-off and correlators. + */ +class cpu_multicorrelator_real_codes +{ +public: + cpu_multicorrelator_real_codes(); + ~cpu_multicorrelator_real_codes(); + bool init(int max_signal_length_samples, int n_correlators); + bool set_local_code_and_taps(int code_length_chips, const float* local_code_in, float *shifts_chips); + bool set_input_output_vectors(std::complex* corr_out, const std::complex* sig_in); + void update_local_code(int correlator_length_samples, float rem_code_phase_chips, float code_phase_step_chips); + bool Carrier_wipeoff_multicorrelator_resampler(float rem_carrier_phase_in_rad, float phase_step_rad, float rem_code_phase_chips, float code_phase_step_chips, int signal_length_samples); + bool free(); + +private: + // Allocate the device input vectors + const std::complex *d_sig_in; + float **d_local_codes_resampled; + const float *d_local_code_in; + std::complex *d_corr_out; + float *d_shifts_chips; + int d_code_length_chips; + int d_n_correlators; +}; + + +#endif /* CPU_MULTICORRELATOR_REAL_CODES_H_ */ + diff --git a/src/tests/unit-tests/signal-processing-blocks/tracking/cpu_multicorrelator_real_codes_test.cc b/src/tests/unit-tests/signal-processing-blocks/tracking/cpu_multicorrelator_real_codes_test.cc new file mode 100644 index 000000000..100cc6985 --- /dev/null +++ b/src/tests/unit-tests/signal-processing-blocks/tracking/cpu_multicorrelator_real_codes_test.cc @@ -0,0 +1,174 @@ +/*! + * \file cpu_multicorrelator_real_codes_test.cc + * \brief This file implements timing tests for the FFT. + * \author Carles Fernandez-Prades, 2016. cfernandez(at)cttc.es + * + * + * ------------------------------------------------------------------------- + * + * Copyright (C) 2010-2016 (see AUTHORS file for a list of contributors) + * + * GNSS-SDR is a software defined Global Navigation + * Satellite Systems receiver + * + * This file is part of GNSS-SDR. + * + * GNSS-SDR is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * GNSS-SDR is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNSS-SDR. If not, see . + * + * ------------------------------------------------------------------------- + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cpu_multicorrelator_real_codes.h" +#include "gps_sdr_signal_processing.h" +#include "GPS_L1_CA.h" + + +DEFINE_int32(cpu_multicorrelator_real_codes_iterations_test, 1000, "Number of averaged iterations in CPU multicorrelator test timing test"); +DEFINE_int32(cpu_multicorrelator_real_codes_max_threads_test, 12, "Number of maximum concurrent correlators in CPU multicorrelator test timing test"); + +void run_correlator_cpu(cpu_multicorrelator_real_codes* correlator, + float d_rem_carrier_phase_rad, + float d_carrier_phase_step_rad, + float d_code_phase_step_chips, + float d_rem_code_phase_chips, + int correlation_size) +{ + for(int k = 0; k < FLAGS_cpu_multicorrelator_real_codes_iterations_test; k++) + { + correlator->Carrier_wipeoff_multicorrelator_resampler(d_rem_carrier_phase_rad, + d_carrier_phase_step_rad, + d_code_phase_step_chips, + d_rem_code_phase_chips, + correlation_size); + } +} + + +TEST(CpuMulticorrelatorRealCodesTest, MeasureExecutionTime) +{ + std::chrono::time_point start, end; + std::chrono::duration elapsed_seconds(0); + int max_threads = FLAGS_cpu_multicorrelator_real_codes_max_threads_test; + std::vector thread_pool; + cpu_multicorrelator_real_codes* correlator_pool[max_threads]; + unsigned int correlation_sizes [3] = { 2048, 4096, 8192}; + double execution_times [3]; + + float* d_ca_code; + gr_complex* in_cpu; + gr_complex* d_correlator_outs; + + int d_n_correlator_taps = 3; + int d_vector_length = correlation_sizes[2]; //max correlation size to allocate all the necessary memory + float* d_local_code_shift_chips; + + //allocate host memory + // Get space for a vector with the C/A code replica sampled 1x/chip + d_ca_code = static_cast(volk_gnsssdr_malloc(static_cast(GPS_L1_CA_CODE_LENGTH_CHIPS) * sizeof(float), volk_gnsssdr_get_alignment())); + in_cpu = static_cast(volk_gnsssdr_malloc(2 * d_vector_length * sizeof(gr_complex), volk_gnsssdr_get_alignment())); + + // correlator outputs (scalar) + d_n_correlator_taps = 3; // Early, Prompt, and Late + d_correlator_outs = static_cast(volk_gnsssdr_malloc(d_n_correlator_taps*sizeof(gr_complex), volk_gnsssdr_get_alignment())); + for (int n = 0; n < d_n_correlator_taps; n++) + { + d_correlator_outs[n] = gr_complex(0,0); + } + d_local_code_shift_chips = static_cast(volk_gnsssdr_malloc(d_n_correlator_taps*sizeof(float), volk_gnsssdr_get_alignment())); + // Set TAPs delay values [chips] + float d_early_late_spc_chips = 0.5; + d_local_code_shift_chips[0] = - d_early_late_spc_chips; + d_local_code_shift_chips[1] = 0.0; + d_local_code_shift_chips[2] = d_early_late_spc_chips; + + //--- Perform initializations ------------------------------ + + //local code resampler on GPU + // generate local reference (1 sample per chip) + gps_l1_ca_code_gen_float(d_ca_code, 1, 0); + // generate inut signal + std::random_device r; + std::default_random_engine e1(r()); + std::uniform_real_distribution uniform_dist(0, 1); + for (int n = 0; n < 2 * d_vector_length; n++) + { + in_cpu[n] = std::complex(uniform_dist(e1), 
uniform_dist(e1)); + } + + for (int n = 0; n < max_threads; n++) + { + correlator_pool[n] = new cpu_multicorrelator_real_codes(); + correlator_pool[n]->init(d_vector_length, d_n_correlator_taps); + correlator_pool[n]->set_input_output_vectors(d_correlator_outs, in_cpu); + correlator_pool[n]->set_local_code_and_taps(static_cast(GPS_L1_CA_CODE_LENGTH_CHIPS), d_ca_code, d_local_code_shift_chips); + } + + float d_rem_carrier_phase_rad = 0.0; + float d_carrier_phase_step_rad = 0.1; + float d_code_phase_step_chips = 0.3; + float d_rem_code_phase_chips = 0.4; + + EXPECT_NO_THROW( + for(int correlation_sizes_idx = 0; correlation_sizes_idx < 3; correlation_sizes_idx++) + { + for(int current_max_threads = 1; current_max_threads < (max_threads+1); current_max_threads++) + { + std::cout << "Running " << current_max_threads << " concurrent correlators" << std::endl; + start = std::chrono::system_clock::now(); + //create the concurrent correlator threads + for (int current_thread = 0; current_thread < current_max_threads; current_thread++) + { + thread_pool.push_back(std::thread(run_correlator_cpu, + correlator_pool[current_thread], + d_rem_carrier_phase_rad, + d_carrier_phase_step_rad, + d_code_phase_step_chips, + d_rem_code_phase_chips, + correlation_sizes[correlation_sizes_idx])); + } + //wait the threads to finish they work and destroy the thread objects + for(auto &t : thread_pool) + { + t.join(); + } + thread_pool.clear(); + end = std::chrono::system_clock::now(); + elapsed_seconds = end - start; + execution_times[correlation_sizes_idx] = elapsed_seconds.count() / static_cast(FLAGS_cpu_multicorrelator_real_codes_iterations_test); + std::cout << "CPU Multicorrelator (real codes) execution time for length=" << correlation_sizes[correlation_sizes_idx] + << " : " << execution_times[correlation_sizes_idx] << " [s]" << std::endl; + + } + } + ); + + volk_gnsssdr_free(d_local_code_shift_chips); + volk_gnsssdr_free(d_correlator_outs); + volk_gnsssdr_free(d_ca_code); + volk_gnsssdr_free(in_cpu); + + for (int n = 0; n< max_threads; n++) + { + correlator_pool[n]->free(); + } +} + diff --git a/src/tests/unit-tests/signal-processing-blocks/tracking/cpu_multicorrelator_test.cc b/src/tests/unit-tests/signal-processing-blocks/tracking/cpu_multicorrelator_test.cc index 11ea1ca58..661b0a19e 100644 --- a/src/tests/unit-tests/signal-processing-blocks/tracking/cpu_multicorrelator_test.cc +++ b/src/tests/unit-tests/signal-processing-blocks/tracking/cpu_multicorrelator_test.cc @@ -33,6 +33,9 @@ #include #include #include +#include +#include +#include #include #include "cpu_multicorrelator.h" #include "gps_sdr_signal_processing.h" From 9ec555814318a41de03650fe72004078f9791d2c Mon Sep 17 00:00:00 2001 From: Cillian O'Driscoll Date: Mon, 11 Sep 2017 15:22:32 +0100 Subject: [PATCH 3/4] Use cpu_multicorrelator_real_codes For galileo_e1_dll_pll_veml_tracking_cc and gps_l1_ca_dll_pll_tracking_cc Note this gives some significant performance improvement for higher sampling rates --- .../galileo_e1_dll_pll_veml_tracking_cc.cc | 6 +++--- .../gnuradio_blocks/galileo_e1_dll_pll_veml_tracking_cc.h | 8 ++++---- .../gnuradio_blocks/gps_l1_ca_dll_pll_tracking_cc.cc | 4 ++-- .../gnuradio_blocks/gps_l1_ca_dll_pll_tracking_cc.h | 7 ++++--- 4 files changed, 13 insertions(+), 12 deletions(-) diff --git a/src/algorithms/tracking/gnuradio_blocks/galileo_e1_dll_pll_veml_tracking_cc.cc b/src/algorithms/tracking/gnuradio_blocks/galileo_e1_dll_pll_veml_tracking_cc.cc index d2319945c..9c23835dc 100755 --- 
a/src/algorithms/tracking/gnuradio_blocks/galileo_e1_dll_pll_veml_tracking_cc.cc +++ b/src/algorithms/tracking/gnuradio_blocks/galileo_e1_dll_pll_veml_tracking_cc.cc @@ -11,7 +11,7 @@ * * ------------------------------------------------------------------------- * - * Copyright (C) 2010-2015 (see AUTHORS file for a list of contributors) + * Copyright (C) 2010-2017 (see AUTHORS file for a list of contributors) * * GNSS-SDR is a software defined Global Navigation * Satellite Systems receiver @@ -129,7 +129,7 @@ galileo_e1_dll_pll_veml_tracking_cc::galileo_e1_dll_pll_veml_tracking_cc( // Initialization of local code replica // Get space for a vector with the sinboc(1,1) replica sampled 2x/chip - d_ca_code = static_cast(volk_gnsssdr_malloc((2 * Galileo_E1_B_CODE_LENGTH_CHIPS) * sizeof(gr_complex), volk_gnsssdr_get_alignment())); + d_ca_code = static_cast(volk_gnsssdr_malloc((2 * Galileo_E1_B_CODE_LENGTH_CHIPS) * sizeof(float), volk_gnsssdr_get_alignment())); // correlator outputs (scalar) d_n_correlator_taps = 5; // Very-Early, Early, Prompt, Late, Very-Late @@ -211,7 +211,7 @@ void galileo_e1_dll_pll_veml_tracking_cc::start_tracking() d_code_loop_filter.initialize(); // initialize the code filter // generate local reference ALWAYS starting at chip 1 (2 samples per chip) - galileo_e1_code_gen_complex_sampled(d_ca_code, + galileo_e1_code_gen_float_sampled(d_ca_code, d_acquisition_gnss_synchro->Signal, false, d_acquisition_gnss_synchro->PRN, diff --git a/src/algorithms/tracking/gnuradio_blocks/galileo_e1_dll_pll_veml_tracking_cc.h b/src/algorithms/tracking/gnuradio_blocks/galileo_e1_dll_pll_veml_tracking_cc.h index 3377234cf..09e6fffe6 100755 --- a/src/algorithms/tracking/gnuradio_blocks/galileo_e1_dll_pll_veml_tracking_cc.h +++ b/src/algorithms/tracking/gnuradio_blocks/galileo_e1_dll_pll_veml_tracking_cc.h @@ -6,7 +6,7 @@ * * ------------------------------------------------------------------------- * - * Copyright (C) 2010-2015 (see AUTHORS file for a list of contributors) + * Copyright (C) 2010-2017 (see AUTHORS file for a list of contributors) * * GNSS-SDR is a software defined Global Navigation * Satellite Systems receiver @@ -39,7 +39,7 @@ #include "gnss_synchro.h" #include "tracking_2nd_DLL_filter.h" #include "tracking_2nd_PLL_filter.h" -#include "cpu_multicorrelator.h" +#include "cpu_multicorrelator_real_codes.h" class galileo_e1_dll_pll_veml_tracking_cc; @@ -120,10 +120,10 @@ private: double d_early_late_spc_chips; double d_very_early_late_spc_chips; - gr_complex* d_ca_code; + float* d_ca_code; float* d_local_code_shift_chips; gr_complex* d_correlator_outs; - cpu_multicorrelator multicorrelator_cpu; + cpu_multicorrelator_real_codes multicorrelator_cpu; gr_complex *d_Very_Early; gr_complex *d_Early; diff --git a/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_cc.cc b/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_cc.cc index 46b191d8c..903aea0d9 100644 --- a/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_cc.cc +++ b/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_cc.cc @@ -123,7 +123,7 @@ Gps_L1_Ca_Dll_Pll_Tracking_cc::Gps_L1_Ca_Dll_Pll_Tracking_cc( // Initialization of local code replica // Get space for a vector with the C/A code replica sampled 1x/chip - d_ca_code = static_cast(volk_gnsssdr_malloc(static_cast(GPS_L1_CA_CODE_LENGTH_CHIPS) * sizeof(gr_complex), volk_gnsssdr_get_alignment())); + d_ca_code = static_cast(volk_gnsssdr_malloc(static_cast(GPS_L1_CA_CODE_LENGTH_CHIPS) * sizeof(float), 
volk_gnsssdr_get_alignment())); // correlator outputs (scalar) d_n_correlator_taps = 3; // Early, Prompt, and Late @@ -233,7 +233,7 @@ void Gps_L1_Ca_Dll_Pll_Tracking_cc::start_tracking() d_code_loop_filter.initialize(); // initialize the code filter // generate local reference ALWAYS starting at chip 1 (1 sample per chip) - gps_l1_ca_code_gen_complex(d_ca_code, d_acquisition_gnss_synchro->PRN, 0); + gps_l1_ca_code_gen_float(d_ca_code, d_acquisition_gnss_synchro->PRN, 0); multicorrelator_cpu.set_local_code_and_taps(static_cast(GPS_L1_CA_CODE_LENGTH_CHIPS), d_ca_code, d_local_code_shift_chips); for (int n = 0; n < d_n_correlator_taps; n++) diff --git a/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_cc.h b/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_cc.h index 92a5d4afb..da408d7d7 100644 --- a/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_cc.h +++ b/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_cc.h @@ -3,6 +3,7 @@ * \brief Interface of a code DLL + carrier PLL tracking block * \author Carlos Aviles, 2010. carlos.avilesr(at)googlemail.com * Javier Arribas, 2011. jarribas(at)cttc.es + * Cillian O'Driscoll, 2017. cillian.odriscoll(at)gmail.com * * Code DLL + carrier PLL according to the algorithms described in: * K.Borre, D.M.Akos, N.Bertelsen, P.Rinder, and S.H.Jensen, @@ -44,7 +45,7 @@ #include "gnss_synchro.h" #include "tracking_2nd_DLL_filter.h" #include "tracking_2nd_PLL_filter.h" -#include "cpu_multicorrelator.h" +#include "cpu_multicorrelator_real_codes.h" class Gps_L1_Ca_Dll_Pll_Tracking_cc; @@ -126,10 +127,10 @@ private: double d_acq_carrier_doppler_hz; // correlator int d_n_correlator_taps; - gr_complex* d_ca_code; + float* d_ca_code; float* d_local_code_shift_chips; gr_complex* d_correlator_outs; - cpu_multicorrelator multicorrelator_cpu; + cpu_multicorrelator_real_codes multicorrelator_cpu; // tracking vars From 94dfef74c1f25f1f64c7838fb17f1e4be526fff3 Mon Sep 17 00:00:00 2001 From: Carles Fernandez Date: Sat, 16 Sep 2017 01:14:15 +0200 Subject: [PATCH 4/4] Add cpu_multicorrelator_real_codes_test and minor cosmetics --- .../libs/galileo_e1_signal_processing.cc | 55 ++++++++----------- .../libs/gps_sdr_signal_processing.cc | 16 +++--- .../gps_l1_ca_dll_pll_tracking_cc.cc | 12 ++-- .../gps_l1_ca_dll_pll_tracking_cc.h | 1 - src/tests/test_main.cc | 1 + .../cpu_multicorrelator_real_codes_test.cc | 7 +-- 6 files changed, 42 insertions(+), 50 deletions(-) diff --git a/src/algorithms/libs/galileo_e1_signal_processing.cc b/src/algorithms/libs/galileo_e1_signal_processing.cc index 46738faad..2ea720ec8 100644 --- a/src/algorithms/libs/galileo_e1_signal_processing.cc +++ b/src/algorithms/libs/galileo_e1_signal_processing.cc @@ -55,7 +55,6 @@ void galileo_e1_code_gen_int(int* _dest, char _Signal[3], signed int _prn) hex_to_binary_converter(&_dest[index], Galileo_E1_B_PRIMARY_CODE[prn].at(i)); index = index + 4; } - } else if (_galileo_signal.rfind("1C") != std::string::npos && _galileo_signal.length() >= 2) { @@ -72,7 +71,6 @@ void galileo_e1_code_gen_int(int* _dest, char _Signal[3], signed int _prn) } - void galileo_e1_sinboc_11_gen_int(int* _dest, int* _prn, unsigned int _length_out) { const unsigned int _length_in = Galileo_E1_B_CODE_LENGTH_CHIPS; @@ -91,7 +89,6 @@ void galileo_e1_sinboc_11_gen_int(int* _dest, int* _prn, unsigned int _length_ou } - void galileo_e1_sinboc_61_gen_int(int* _dest, int* _prn, unsigned int _length_out) { const unsigned int _length_in = Galileo_E1_B_CODE_LENGTH_CHIPS; @@ -111,7 
+108,6 @@ void galileo_e1_sinboc_61_gen_int(int* _dest, int* _prn, unsigned int _length_ou } - void galileo_e1_gen_float(float* _dest, int* _prn, char _Signal[3]) { std::string _galileo_signal = _Signal; @@ -160,8 +156,8 @@ void galileo_e1_code_gen_float_sampled(float* _dest, char _Signal[3], const int _samplesPerChip = (_cboc == true) ? 12 : 2; const unsigned int delay = ((static_cast(Galileo_E1_B_CODE_LENGTH_CHIPS) - _chip_shift) - % static_cast(Galileo_E1_B_CODE_LENGTH_CHIPS)) - * _samplesPerCode / Galileo_E1_B_CODE_LENGTH_CHIPS; + % static_cast(Galileo_E1_B_CODE_LENGTH_CHIPS)) + * _samplesPerCode / Galileo_E1_B_CODE_LENGTH_CHIPS; galileo_e1_code_gen_int(primary_code_E1_chips, _Signal, _prn); //generate Galileo E1 code, 1 sample per chip @@ -195,29 +191,24 @@ void galileo_e1_code_gen_float_sampled(float* _dest, char _Signal[3], _signal_E1 = _resampled_signal; } - if (_galileo_signal.rfind("1C") != std::string::npos && _galileo_signal.length() >= 2 && _secondary_flag) - { + { + float* _signal_E1C_secondary = new float[static_cast(Galileo_E1_C_SECONDARY_CODE_LENGTH) * _samplesPerCode]; - float* _signal_E1C_secondary = new float[ - static_cast(Galileo_E1_C_SECONDARY_CODE_LENGTH) - * _samplesPerCode]; + for (unsigned int i = 0; i < static_cast(Galileo_E1_C_SECONDARY_CODE_LENGTH); i++) + { + for (unsigned k = 0; k < _samplesPerCode; k++) + { + _signal_E1C_secondary[i*_samplesPerCode + k] = _signal_E1[k] + * (Galileo_E1_C_SECONDARY_CODE.at(i) == '0' ? 1.0f : -1.0f); + } + } - for (unsigned int i = 0; i < static_cast(Galileo_E1_C_SECONDARY_CODE_LENGTH); i++) - { - for (unsigned k = 0; k < _samplesPerCode; k++) - { - _signal_E1C_secondary[i*_samplesPerCode + k] = _signal_E1[k] - * (Galileo_E1_C_SECONDARY_CODE.at(i) == '0' - ? 1.0f : -1.0f); - } - } + _samplesPerCode *= static_cast(Galileo_E1_C_SECONDARY_CODE_LENGTH); - _samplesPerCode *= static_cast(Galileo_E1_C_SECONDARY_CODE_LENGTH); - - delete[] _signal_E1; - _signal_E1 = _signal_E1C_secondary; - } + delete[] _signal_E1; + _signal_E1 = _signal_E1C_secondary; + } for (unsigned int i = 0; i < _samplesPerCode; i++) { @@ -227,20 +218,20 @@ void galileo_e1_code_gen_float_sampled(float* _dest, char _Signal[3], delete[] _signal_E1; } + void galileo_e1_code_gen_complex_sampled(std::complex* _dest, char _Signal[3], bool _cboc, unsigned int _prn, signed int _fs, unsigned int _chip_shift, bool _secondary_flag) { std::string _galileo_signal = _Signal; const int _codeFreqBasis = Galileo_E1_CODE_CHIP_RATE_HZ; //Hz - unsigned int _samplesPerCode = static_cast( - static_cast(_fs) / (static_cast(_codeFreqBasis ) - / static_cast(Galileo_E1_B_CODE_LENGTH_CHIPS))); + unsigned int _samplesPerCode = static_cast( static_cast(_fs) / + (static_cast(_codeFreqBasis ) / static_cast(Galileo_E1_B_CODE_LENGTH_CHIPS))); if (_galileo_signal.rfind("1C") != std::string::npos && _galileo_signal.length() >= 2 && _secondary_flag) - { - _samplesPerCode *= static_cast(Galileo_E1_C_SECONDARY_CODE_LENGTH); - } + { + _samplesPerCode *= static_cast(Galileo_E1_C_SECONDARY_CODE_LENGTH); + } float real_code[_samplesPerCode]; @@ -252,12 +243,14 @@ void galileo_e1_code_gen_complex_sampled(std::complex* _dest, char _Signa } } + void galileo_e1_code_gen_float_sampled(float* _dest, char _Signal[3], bool _cboc, unsigned int _prn, signed int _fs, unsigned int _chip_shift) { galileo_e1_code_gen_float_sampled(_dest, _Signal, _cboc, _prn, _fs, _chip_shift, false); } + void galileo_e1_code_gen_complex_sampled(std::complex* _dest, char _Signal[3], bool _cboc, unsigned int _prn, signed int _fs, 
unsigned int _chip_shift) { diff --git a/src/algorithms/libs/gps_sdr_signal_processing.cc b/src/algorithms/libs/gps_sdr_signal_processing.cc index 168f8af97..1ddb8976a 100644 --- a/src/algorithms/libs/gps_sdr_signal_processing.cc +++ b/src/algorithms/libs/gps_sdr_signal_processing.cc @@ -113,6 +113,7 @@ void gps_l1_ca_code_gen_int(int* _dest, signed int _prn, unsigned int _chip_shif } } + void gps_l1_ca_code_gen_float(float* _dest, signed int _prn, unsigned int _chip_shift) { unsigned int _code_length = 1023; @@ -121,12 +122,12 @@ void gps_l1_ca_code_gen_float(float* _dest, signed int _prn, unsigned int _chip_ gps_l1_ca_code_gen_int( ca_code_int, _prn, _chip_shift ); for( unsigned int ii = 0; ii < _code_length; ++ii ) - { - _dest[ii] = static_cast( ca_code_int[ii] ); - } - + { + _dest[ii] = static_cast( ca_code_int[ii] ); + } } + void gps_l1_ca_code_gen_complex(std::complex* _dest, signed int _prn, unsigned int _chip_shift) { unsigned int _code_length = 1023; @@ -135,10 +136,9 @@ void gps_l1_ca_code_gen_complex(std::complex* _dest, signed int _prn, uns gps_l1_ca_code_gen_int( ca_code_int, _prn, _chip_shift ); for( unsigned int ii = 0; ii < _code_length; ++ii ) - { - _dest[ii] = std::complex< float >( static_cast(ca_code_int[ii]), 0.0f ); - } - + { + _dest[ii] = std::complex( static_cast(ca_code_int[ii]), 0.0f ); + } } diff --git a/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_cc.cc b/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_cc.cc index 903aea0d9..32d96c079 100644 --- a/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_cc.cc +++ b/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_cc.cc @@ -127,12 +127,12 @@ Gps_L1_Ca_Dll_Pll_Tracking_cc::Gps_L1_Ca_Dll_Pll_Tracking_cc( // correlator outputs (scalar) d_n_correlator_taps = 3; // Early, Prompt, and Late - d_correlator_outs = static_cast(volk_gnsssdr_malloc(d_n_correlator_taps*sizeof(gr_complex), volk_gnsssdr_get_alignment())); + d_correlator_outs = static_cast(volk_gnsssdr_malloc(d_n_correlator_taps * sizeof(gr_complex), volk_gnsssdr_get_alignment())); for (int n = 0; n < d_n_correlator_taps; n++) - { - d_correlator_outs[n] = gr_complex(0,0); - } - d_local_code_shift_chips = static_cast(volk_gnsssdr_malloc(d_n_correlator_taps*sizeof(float), volk_gnsssdr_get_alignment())); + { + d_correlator_outs[n] = gr_complex(0,0); + } + d_local_code_shift_chips = static_cast(volk_gnsssdr_malloc(d_n_correlator_taps * sizeof(float), volk_gnsssdr_get_alignment())); // Set TAPs delay values [chips] d_local_code_shift_chips[0] = - d_early_late_spc_chips; d_local_code_shift_chips[1] = 0.0; @@ -194,7 +194,7 @@ void Gps_L1_Ca_Dll_Pll_Tracking_cc::start_tracking() long int acq_trk_diff_samples; double acq_trk_diff_seconds; acq_trk_diff_samples = static_cast(d_sample_counter) - static_cast(d_acq_sample_stamp); //-d_vector_length; - DLOG(INFO) << "Number of samples between Acquisition and Tracking =" << acq_trk_diff_samples; + DLOG(INFO) << "Number of samples between Acquisition and Tracking = " << acq_trk_diff_samples; acq_trk_diff_seconds = static_cast(acq_trk_diff_samples) / static_cast(d_fs_in); // Doppler effect // Fd=(C/(C+Vr))*F diff --git a/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_cc.h b/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_cc.h index da408d7d7..9fe80fec7 100644 --- a/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_cc.h +++ b/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_cc.h @@ -132,7 
+132,6 @@ private: gr_complex* d_correlator_outs; cpu_multicorrelator_real_codes multicorrelator_cpu; - // tracking vars double d_code_freq_chips; double d_code_phase_step_chips; diff --git a/src/tests/test_main.cc b/src/tests/test_main.cc index dfdb5f3ea..bae669086 100644 --- a/src/tests/test_main.cc +++ b/src/tests/test_main.cc @@ -110,6 +110,7 @@ DECLARE_string(log_dir); #include "unit-tests/signal-processing-blocks/tracking/galileo_e5a_tracking_test.cc" #include "unit-tests/signal-processing-blocks/tracking/tracking_loop_filter_test.cc" #include "unit-tests/signal-processing-blocks/tracking/cpu_multicorrelator_test.cc" +#include "unit-tests/signal-processing-blocks/tracking/cpu_multicorrelator_real_codes_test.cc" #if CUDA_BLOCKS_TEST #include "unit-tests/signal-processing-blocks/tracking/gpu_multicorrelator_test.cc" diff --git a/src/tests/unit-tests/signal-processing-blocks/tracking/cpu_multicorrelator_real_codes_test.cc b/src/tests/unit-tests/signal-processing-blocks/tracking/cpu_multicorrelator_real_codes_test.cc index 100cc6985..ef3c2a293 100644 --- a/src/tests/unit-tests/signal-processing-blocks/tracking/cpu_multicorrelator_real_codes_test.cc +++ b/src/tests/unit-tests/signal-processing-blocks/tracking/cpu_multicorrelator_real_codes_test.cc @@ -45,7 +45,7 @@ DEFINE_int32(cpu_multicorrelator_real_codes_iterations_test, 1000, "Number of averaged iterations in CPU multicorrelator test timing test"); DEFINE_int32(cpu_multicorrelator_real_codes_max_threads_test, 12, "Number of maximum concurrent correlators in CPU multicorrelator test timing test"); -void run_correlator_cpu(cpu_multicorrelator_real_codes* correlator, +void run_correlator_cpu_real_codes(cpu_multicorrelator_real_codes* correlator, float d_rem_carrier_phase_rad, float d_carrier_phase_step_rad, float d_code_phase_step_chips, @@ -137,7 +137,7 @@ TEST(CpuMulticorrelatorRealCodesTest, MeasureExecutionTime) //create the concurrent correlator threads for (int current_thread = 0; current_thread < current_max_threads; current_thread++) { - thread_pool.push_back(std::thread(run_correlator_cpu, + thread_pool.push_back(std::thread(run_correlator_cpu_real_codes, correlator_pool[current_thread], d_rem_carrier_phase_rad, d_carrier_phase_step_rad, @@ -156,7 +156,6 @@ TEST(CpuMulticorrelatorRealCodesTest, MeasureExecutionTime) execution_times[correlation_sizes_idx] = elapsed_seconds.count() / static_cast(FLAGS_cpu_multicorrelator_real_codes_iterations_test); std::cout << "CPU Multicorrelator (real codes) execution time for length=" << correlation_sizes[correlation_sizes_idx] << " : " << execution_times[correlation_sizes_idx] << " [s]" << std::endl; - } } ); @@ -166,7 +165,7 @@ TEST(CpuMulticorrelatorRealCodesTest, MeasureExecutionTime) volk_gnsssdr_free(d_ca_code); volk_gnsssdr_free(in_cpu); - for (int n = 0; n< max_threads; n++) + for (int n = 0; n < max_threads; n++) { correlator_pool[n]->free(); }
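Editorial note (not part of the patch): pulling the pieces of this series together, the sketch below shows the call sequence with which the new cpu_multicorrelator_real_codes class (PATCH 2/4) is typically driven, mirroring Gps_L1_Ca_Dll_Pll_Tracking_cc and the timing test above: generate a real-valued C/A replica once with gps_l1_ca_code_gen_float(), then run carrier wipe-off plus the three-tap correlation per integration period. The PRN, tap spacing, phase steps and code-phase step are illustrative values only, not taken from the patch.

#include <complex>
#include <vector>
#include "cpu_multicorrelator_real_codes.h"   // added in PATCH 2/4
#include "gps_sdr_signal_processing.h"        // gps_l1_ca_code_gen_float()

// Correlate one block of baseband samples against Early/Prompt/Late real-valued C/A replicas.
void correlate_block(const std::complex<float>* baseband, int signal_length_samples)
{
    const int code_length_chips = 1023;
    const int n_taps = 3;                                 // Early, Prompt, Late

    std::vector<float> ca_code(code_length_chips);
    gps_l1_ca_code_gen_float(ca_code.data(), 1 /* PRN, example */, 0 /* chip shift */);

    float shifts_chips[n_taps] = {-0.5f, 0.0f, 0.5f};     // +/- 0.5 chip tap spacing
    std::complex<float> corr_out[n_taps];

    cpu_multicorrelator_real_codes correlator;
    correlator.init(signal_length_samples, n_taps);
    correlator.set_local_code_and_taps(code_length_chips, ca_code.data(), shifts_chips);
    correlator.set_input_output_vectors(corr_out, baseband);

    // One integration period: carrier wipe-off, local-code resampling and the three dot products.
    correlator.Carrier_wipeoff_multicorrelator_resampler(
        0.0f,                                              // remaining carrier phase [rad] (example)
        0.25f,                                             // carrier phase step per sample [rad] (example)
        0.0f,                                              // remaining code phase [chips] (example)
        static_cast<float>(code_length_chips) / static_cast<float>(signal_length_samples),  // chips/sample
        signal_length_samples);

    correlator.free();
}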