Enable NEON kernels in aarch64 architecture

This commit is contained in:
Carles Fernandez 2021-08-11 13:01:25 +02:00
parent fc85b2df8e
commit 8982e4d7ad
No known key found for this signature in database
GPG Key ID: 4C583C52B0C3877D
34 changed files with 260 additions and 90 deletions

View File

@ -77,6 +77,8 @@ All notable changes to GNSS-SDR will be documented in this file.
x86, improved AMD microarchitecture detection.
- CMake now selects the C++23 standard if the environment allows for it.
- Improved detection of Gnuplot and `gnss_sim` when cross-compiling.
- NEON kernel implementations of the `volk_gnsssdr` library are now enabled in
aarch64 architectures.
### Improvements in Reliability

View File

@ -17,7 +17,8 @@
#include <arm_neon.h>
static inline float32x4_t vdivq_f32(float32x4_t num, float32x4_t den)
/* Division */
static inline float32x4_t _vdivq_f32(float32x4_t num, float32x4_t den)
{
const float32x4_t q_inv0 = vrecpeq_f32(den);
const float32x4_t q_step0 = vrecpsq_f32(q_inv0, den);
@ -26,8 +27,28 @@ static inline float32x4_t vdivq_f32(float32x4_t num, float32x4_t den)
return vmulq_f32(num, q_inv1);
}
/* Inverse */
static inline float32x4_t _vinvq_f32(float32x4_t x)
{
// Newton's method
float32x4_t recip = vrecpeq_f32(x);
recip = vmulq_f32(vrecpsq_f32(x, recip), recip);
recip = vmulq_f32(vrecpsq_f32(x, recip), recip);
return recip;
}
static inline float32x4_t vsqrtq_f32(float32x4_t q_x)
/* Magnitude squared for float32x4x2_t */
static inline float32x4_t _vmagnitudesquaredq_f32(float32x4x2_t cmplxValue)
{
float32x4_t iValue, qValue, result;
iValue = vmulq_f32(cmplxValue.val[0], cmplxValue.val[0]); // Square the values
qValue = vmulq_f32(cmplxValue.val[1], cmplxValue.val[1]); // Square the values
result = vaddq_f32(iValue, qValue); // Add the I2 and Q2 values
return result;
}
/* Square root for float32x4_t */
static inline float32x4_t _vsqrtq_f32(float32x4_t q_x)
{
const float32x4_t q_step_0 = vrsqrteq_f32(q_x);
// step
@ -43,18 +64,6 @@ static inline float32x4_t vsqrtq_f32(float32x4_t q_x)
return vmulq_f32(q_x, q_step_2);
}
/* Magnitude squared for float32x4x2_t */
static inline float32x4_t _vmagnitudesquaredq_f32(float32x4x2_t cmplxValue)
{
float32x4_t iValue, qValue, result;
iValue = vmulq_f32(cmplxValue.val[0], cmplxValue.val[0]); // Square the values
qValue = vmulq_f32(cmplxValue.val[1], cmplxValue.val[1]); // Square the values
result = vaddq_f32(iValue, qValue); // Add the I2 and Q2 values
return result;
}
/* Inverse square root for float32x4_t */
static inline float32x4_t _vinvsqrtq_f32(float32x4_t x)
{
@ -65,7 +74,6 @@ static inline float32x4_t _vinvsqrtq_f32(float32x4_t x)
return sqrt_reciprocal;
}
/* Complex multiplication for float32x4x2_t */
static inline float32x4x2_t _vmultiply_complexq_f32(float32x4x2_t a_val, float32x4x2_t b_val)
{
@ -89,4 +97,164 @@ static inline float32x4x2_t _vmultiply_complexq_f32(float32x4x2_t a_val, float32
return c_val;
}
/* From ARM Compute Library, MIT license */
static inline float32x4_t _vtaylor_polyq_f32(float32x4_t x, const float32x4_t coeffs[8])
{
float32x4_t cA = vmlaq_f32(coeffs[0], coeffs[4], x);
float32x4_t cB = vmlaq_f32(coeffs[2], coeffs[6], x);
float32x4_t cC = vmlaq_f32(coeffs[1], coeffs[5], x);
float32x4_t cD = vmlaq_f32(coeffs[3], coeffs[7], x);
float32x4_t x2 = vmulq_f32(x, x);
float32x4_t x4 = vmulq_f32(x2, x2);
float32x4_t res = vmlaq_f32(vmlaq_f32(cA, cB, x2), vmlaq_f32(cC, cD, x2), x4);
return res;
}
/* Natural logarithm.
* From ARM Compute Library, MIT license */
static inline float32x4_t _vlogq_f32(float32x4_t x)
{
const float32x4_t log_tab[8] = {
vdupq_n_f32(-2.29561495781f),
vdupq_n_f32(-2.47071170807f),
vdupq_n_f32(-5.68692588806f),
vdupq_n_f32(-0.165253549814f),
vdupq_n_f32(5.17591238022f),
vdupq_n_f32(0.844007015228f),
vdupq_n_f32(4.58445882797f),
vdupq_n_f32(0.0141278216615f),
};
const int32x4_t CONST_127 = vdupq_n_s32(127); // 127
const float32x4_t CONST_LN2 = vdupq_n_f32(0.6931471805f); // ln(2)
// Extract exponent
int32x4_t m = vsubq_s32(
vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_f32(x), 23)), CONST_127);
float32x4_t val =
vreinterpretq_f32_s32(vsubq_s32(vreinterpretq_s32_f32(x), vshlq_n_s32(m, 23)));
// Polynomial Approximation
float32x4_t poly = _vtaylor_polyq_f32(val, log_tab);
// Reconstruct
poly = vmlaq_f32(poly, vcvtq_f32_s32(m), CONST_LN2);
return poly;
}
/* Evaluation of 4 sines & cosines at once.
* Optimized from here (zlib license)
* http://gruntthepeon.free.fr/ssemath/ */
static inline float32x4x2_t _vsincosq_f32(float32x4_t x)
{
const float32x4_t c_minus_cephes_DP1 = vdupq_n_f32(-0.78515625);
const float32x4_t c_minus_cephes_DP2 = vdupq_n_f32(-2.4187564849853515625e-4);
const float32x4_t c_minus_cephes_DP3 = vdupq_n_f32(-3.77489497744594108e-8);
const float32x4_t c_sincof_p0 = vdupq_n_f32(-1.9515295891e-4);
const float32x4_t c_sincof_p1 = vdupq_n_f32(8.3321608736e-3);
const float32x4_t c_sincof_p2 = vdupq_n_f32(-1.6666654611e-1);
const float32x4_t c_coscof_p0 = vdupq_n_f32(2.443315711809948e-005);
const float32x4_t c_coscof_p1 = vdupq_n_f32(-1.388731625493765e-003);
const float32x4_t c_coscof_p2 = vdupq_n_f32(4.166664568298827e-002);
const float32x4_t c_cephes_FOPI = vdupq_n_f32(1.27323954473516); // 4 / M_PI
const float32x4_t CONST_1 = vdupq_n_f32(1.f);
const float32x4_t CONST_1_2 = vdupq_n_f32(0.5f);
const float32x4_t CONST_0 = vdupq_n_f32(0.f);
const uint32x4_t CONST_2 = vdupq_n_u32(2);
const uint32x4_t CONST_4 = vdupq_n_u32(4);
uint32x4_t emm2;
uint32x4_t sign_mask_sin, sign_mask_cos;
sign_mask_sin = vcltq_f32(x, CONST_0);
x = vabsq_f32(x);
// scale by 4/pi
float32x4_t y = vmulq_f32(x, c_cephes_FOPI);
// store the integer part of y in mm0
emm2 = vcvtq_u32_f32(y);
/* j=(j+1) & (~1) (see the cephes sources) */
emm2 = vaddq_u32(emm2, vdupq_n_u32(1));
emm2 = vandq_u32(emm2, vdupq_n_u32(~1));
y = vcvtq_f32_u32(emm2);
/* get the polynom selection mask
there is one polynom for 0 <= x <= Pi/4
and another one for Pi/4<x<=Pi/2
Both branches will be computed. */
const uint32x4_t poly_mask = vtstq_u32(emm2, CONST_2);
// The magic pass: "Extended precision modular arithmetic"
x = vmlaq_f32(x, y, c_minus_cephes_DP1);
x = vmlaq_f32(x, y, c_minus_cephes_DP2);
x = vmlaq_f32(x, y, c_minus_cephes_DP3);
sign_mask_sin = veorq_u32(sign_mask_sin, vtstq_u32(emm2, CONST_4));
sign_mask_cos = vtstq_u32(vsubq_u32(emm2, CONST_2), CONST_4);
/* Evaluate the first polynom (0 <= x <= Pi/4) in y1,
and the second polynom (Pi/4 <= x <= 0) in y2 */
float32x4_t y1, y2;
float32x4_t z = vmulq_f32(x, x);
y1 = vmlaq_f32(c_coscof_p1, z, c_coscof_p0);
y1 = vmlaq_f32(c_coscof_p2, z, y1);
y1 = vmulq_f32(y1, z);
y1 = vmulq_f32(y1, z);
y1 = vmlsq_f32(y1, z, CONST_1_2);
y1 = vaddq_f32(y1, CONST_1);
y2 = vmlaq_f32(c_sincof_p1, z, c_sincof_p0);
y2 = vmlaq_f32(c_sincof_p2, z, y2);
y2 = vmulq_f32(y2, z);
y2 = vmlaq_f32(x, x, y2);
/* select the correct result from the two polynoms */
const float32x4_t ys = vbslq_f32(poly_mask, y1, y2);
const float32x4_t yc = vbslq_f32(poly_mask, y2, y1);
float32x4x2_t sincos;
sincos.val[0] = vbslq_f32(sign_mask_sin, vnegq_f32(ys), ys);
sincos.val[1] = vbslq_f32(sign_mask_cos, yc, vnegq_f32(yc));
return sincos;
}
static inline float32x4_t _vsinq_f32(float32x4_t x)
{
const float32x4x2_t sincos = _vsincosq_f32(x);
return sincos.val[0];
}
static inline float32x4_t _vcosq_f32(float32x4_t x)
{
const float32x4x2_t sincos = _vsincosq_f32(x);
return sincos.val[1];
}
static inline float32x4_t _vtanq_f32(float32x4_t x)
{
const float32x4x2_t sincos = _vsincosq_f32(x);
return vmulq_f32(sincos.val[0], _vinvq_f32(sincos.val[1]));
}
static inline float32x4_t _neon_accumulate_square_sum_f32(float32x4_t sq_acc,
float32x4_t acc,
float32x4_t val,
float32x4_t rec,
float32x4_t aux)
{
aux = vmulq_f32(aux, val);
aux = vsubq_f32(aux, acc);
aux = vmulq_f32(aux, aux);
#ifdef LV_HAVE_NEONV8
return vfmaq_f32(sq_acc, aux, rec);
#else
aux = vmulq_f32(aux, rec);
return vaddq_f32(sq_acc, aux);
#endif
}
#endif /* INCLUDED_VOLK_GNSSSDR_NEON_INTRINSICS_H_ */

View File

@ -235,7 +235,7 @@ static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_a_avx(int16_t* result,
#endif
#ifdef LV_HAVE_NEONV7
#ifdef LV_HAVE_NEON
static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_neon(int16_t* result, const int16_t* local_code, unsigned int num_points)
{
int code_length_chips = 2046;

View File

@ -512,7 +512,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_u_avx(int16_t** result,
#endif
#ifdef LV_HAVE_NEONV7
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_neon(int16_t** result, const int16_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, float* shifts_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_points)
{

View File

@ -233,7 +233,7 @@ static inline void volk_gnsssdr_16ic_convert_32fc_u_avx2(lv_32fc_t* outputVector
#endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_NEONV7
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
static inline void volk_gnsssdr_16ic_convert_32fc_neon(lv_32fc_t* outputVector, const lv_16sc_t* inputVector, unsigned int num_points)
@ -263,6 +263,6 @@ static inline void volk_gnsssdr_16ic_convert_32fc_neon(lv_32fc_t* outputVector,
_in++;
}
}
#endif /* LV_HAVE_NEONV7 */
#endif /* LV_HAVE_NEON */
#endif /* INCLUDED_volk_gnsssdr_32fc_convert_16ic_H */

View File

@ -242,7 +242,7 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_u_sse2(lv_16sc_t* resul
#endif /* LV_HAVE_SSE2 */
#ifdef LV_HAVE_NEONV7
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
static inline void volk_gnsssdr_16ic_resampler_fast_16ic_neon(lv_16sc_t* result, const lv_16sc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, int code_length_chips, unsigned int num_output_samples) //, int* scratch_buffer, float* scratch_buffer_float)
@ -328,6 +328,6 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_neon(lv_16sc_t* result,
}
}
#endif /* LV_HAVE_NEONV7 */
#endif /* LV_HAVE_NEON */
#endif /* INCLUDED_volk_gnsssdr_16ic_resampler_fast_16ic_H */

View File

@ -58,7 +58,7 @@ static inline void volk_gnsssdr_16ic_resamplerfastpuppet_16ic_u_sse2(lv_16sc_t*
#endif /* LV_HAVE_SSE2 */
#ifdef LV_HAVE_NEONV7
#ifdef LV_HAVE_NEON
static inline void volk_gnsssdr_16ic_resamplerfastpuppet_16ic_neon(lv_16sc_t* result, const lv_16sc_t* local_code, unsigned int num_points)
{
@ -68,6 +68,6 @@ static inline void volk_gnsssdr_16ic_resamplerfastpuppet_16ic_neon(lv_16sc_t* re
volk_gnsssdr_16ic_resampler_fast_16ic_neon(result, local_code, rem_code_phase_chips, code_phase_step_chips, code_length_chips, num_points);
}
#endif /* LV_HAVE_NEONV7 */
#endif /* LV_HAVE_NEON */
#endif // INCLUDED_volk_gnsssdr_16ic_resamplerfastpuppet_16ic_H

View File

@ -114,7 +114,7 @@ static inline void volk_gnsssdr_16ic_resamplerfastxnpuppet_16ic_u_sse2(lv_16sc_t
#endif
#ifdef LV_HAVE_NEONV7
#ifdef LV_HAVE_NEON
static inline void volk_gnsssdr_16ic_resamplerfastxnpuppet_16ic_neon(lv_16sc_t* result, const lv_16sc_t* local_code, unsigned int num_points)
{
float code_phase_step_chips = 0.1;

View File

@ -236,7 +236,7 @@ static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_a_avx(lv_16sc_t* res
#endif
#ifdef LV_HAVE_NEONV7
#ifdef LV_HAVE_NEON
static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_neon(lv_16sc_t* result, const lv_16sc_t* local_code, unsigned int num_points)
{
int code_length_chips = 2046;

View File

@ -123,7 +123,7 @@ static inline void volk_gnsssdr_16ic_rotatorpuppet_16ic_u_sse3_reload(lv_16sc_t*
#endif /* LV_HAVE_SSE3 */
#ifdef LV_HAVE_NEONV7
#ifdef LV_HAVE_NEON
static inline void volk_gnsssdr_16ic_rotatorpuppet_16ic_neon(lv_16sc_t* outVector, const lv_16sc_t* inVector, unsigned int num_points)
{
// phases must be normalized. Phase rotator expects a complex exponential input!
@ -136,10 +136,10 @@ static inline void volk_gnsssdr_16ic_rotatorpuppet_16ic_neon(lv_16sc_t* outVecto
volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_neon(outVector, inVector, phase_inc[0], phase, num_points);
}
#endif /* LV_HAVE_NEONV7 */
#endif /* LV_HAVE_NEON */
#ifdef LV_HAVE_NEONV7
#ifdef LV_HAVE_NEON
static inline void volk_gnsssdr_16ic_rotatorpuppet_16ic_neon_reload(lv_16sc_t* outVector, const lv_16sc_t* inVector, unsigned int num_points)
{
// phases must be normalized. Phase rotator expects a complex exponential input!
@ -152,7 +152,7 @@ static inline void volk_gnsssdr_16ic_rotatorpuppet_16ic_neon_reload(lv_16sc_t* o
volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_neon_reload(outVector, inVector, phase_inc[0], phase, num_points);
}
#endif /* LV_HAVE_NEONV7 */
#endif /* LV_HAVE_NEON */
#endif /* INCLUDED_volk_gnsssdr_16ic_rotatorpuppet_16ic_H */

View File

@ -631,7 +631,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3_reload(lv_16sc
#endif /* LV_HAVE_SSE3 */
#ifdef LV_HAVE_NEONV7
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_neon(lv_16sc_t* outVector, const lv_16sc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points)
@ -764,10 +764,10 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_neon(lv_16sc_t* outVe
}
}
#endif /* LV_HAVE_NEONV7 */
#endif /* LV_HAVE_NEON */
#ifdef LV_HAVE_NEONV7
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_neon_reload(lv_16sc_t* outVector, const lv_16sc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points)
@ -958,6 +958,6 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_neon_reload(lv_16sc_t
}
}
#endif /* LV_HAVE_NEONV7 */
#endif /* LV_HAVE_NEON */
#endif /* INCLUDED_volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_H */

View File

@ -377,7 +377,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_a_axv2(lv_16sc_t* out, con
#endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_NEONV7
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_neon(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points)
@ -446,10 +446,10 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_neon(lv_16sc_t* out, const
}
}
#endif /* LV_HAVE_NEONV7 */
#endif /* LV_HAVE_NEON */
#ifdef LV_HAVE_NEONV7
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_neon_vma(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points)
@ -499,10 +499,10 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_neon_vma(lv_16sc_t* out, c
}
}
#endif /* LV_HAVE_NEONV7 */
#endif /* LV_HAVE_NEON */
#ifdef LV_HAVE_NEONV7
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_neon_optvma(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points)
@ -553,6 +553,6 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_neon_optvma(lv_16sc_t* out
}
}
#endif /* LV_HAVE_NEONV7 */
#endif /* LV_HAVE_NEON */
#endif /* INCLUDED_volk_gnsssdr_16ic_x2_dot_prod_16ic_H */

View File

@ -473,7 +473,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_avx2(lv_16sc_t* resul
#endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_NEONV7
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points)
@ -559,10 +559,10 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon(lv_16sc_t* result,
}
}
}
#endif /* LV_HAVE_NEONV7 */
#endif /* LV_HAVE_NEON */
#ifdef LV_HAVE_NEONV7
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_vma(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points)
@ -637,10 +637,10 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_vma(lv_16sc_t* res
}
}
}
#endif /* LV_HAVE_NEONV7 */
#endif /* LV_HAVE_NEON */
#ifdef LV_HAVE_NEONV7
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_optvma(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points)
@ -720,6 +720,6 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_optvma(lv_16sc_t*
}
}
}
#endif /* LV_HAVE_NEONV7 */
#endif /* LV_HAVE_NEON */
#endif /* INCLUDED_volk_gnsssdr_16ic_xn_dot_prod_16ic_xn_H */

View File

@ -174,7 +174,7 @@ static inline void volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_u_avx2(lv_16sc_t* r
#endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_NEONV7
#ifdef LV_HAVE_NEON
static inline void volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_neon(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points)
{
@ -199,7 +199,7 @@ static inline void volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_neon(lv_16sc_t* res
#endif // NEON
#ifdef LV_HAVE_NEONV7
#ifdef LV_HAVE_NEON
static inline void volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_neon_vma(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points)
{
@ -223,7 +223,7 @@ static inline void volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_neon_vma(lv_16sc_t*
#endif // NEON
#ifdef LV_HAVE_NEONV7
#ifdef LV_HAVE_NEON
static inline void volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_neon_optvma(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points)
{

View File

@ -278,7 +278,7 @@ static inline void volk_gnsssdr_16ic_x2_multiply_16ic_a_avx2(lv_16sc_t* out, con
#endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_NEONV7
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
static inline void volk_gnsssdr_16ic_x2_multiply_16ic_neon(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points)
@ -324,6 +324,6 @@ static inline void volk_gnsssdr_16ic_x2_multiply_16ic_neon(lv_16sc_t* out, const
*out++ = (*a_ptr++) * (*b_ptr++);
}
}
#endif /* LV_HAVE_NEONV7*/
#endif /* LV_HAVE_NEON*/
#endif /* INCLUDED_volk_gnsssdr_16ic_x2_multiply_16ic_H */

View File

@ -1286,7 +1286,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l
#endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_NEONV7
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points)
@ -1472,10 +1472,10 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon(lv_16sc_t*
}
}
#endif /* LV_HAVE_NEONV7 */
#endif /* LV_HAVE_NEON */
#ifdef LV_HAVE_NEONV7
#ifdef LV_HAVE_NEON
#include <volk_gnsssdr/volk_gnsssdr_neon_intrinsics.h>
#include <arm_neon.h>
@ -1610,11 +1610,11 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_vma(lv_16s
// Round = vmulq_f32(_phase_real, _phase_real);
// Round = vmlaq_f32(Round, _phase_imag, _phase_imag);
// Round = vsqrtq_f32(Round);// printf("sqrt: %f \n", Round[0]);
// Round = _vsqrtq_f32(Round);// printf("sqrt: %f \n", Round[0]);
// Round = vrsqrteq_f32(Round);printf("1/sqtr: %f \n",Round[0]);
// Round = vrecpeq_f32((Round);
// _phase_real = vdivq_f32(_phase_real, Round);
// _phase_imag = vdivq_f32(_phase_imag, Round);
// _phase_real = _vdivq_f32(_phase_real, Round);
// _phase_imag = _vdivq_f32(_phase_imag, Round);
// _phase_real = vmulq_f32(_phase_real, Round);
// _phase_imag = vmulq_f32(_phase_imag, Round);
// printf("After %i: %f,%f, %f\n\n", number, _phase_real[0], _phase_imag[0], sqrt(_phase_real[0]*_phase_real[0]+_phase_imag[0]*_phase_imag[0]));
@ -1669,10 +1669,10 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_vma(lv_16s
}
}
#endif /* LV_HAVE_NEONV7 */
#endif /* LV_HAVE_NEON */
#ifdef LV_HAVE_NEONV7
#ifdef LV_HAVE_NEON
#include <volk_gnsssdr/volk_gnsssdr_neon_intrinsics.h>
#include <arm_neon.h>
@ -1858,6 +1858,6 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_optvma(lv_
}
}
#endif /* LV_HAVE_NEONV7 */
#endif /* LV_HAVE_NEON */
#endif /* INCLUDED_volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_H */

View File

@ -303,7 +303,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_u_avx2_relo
#endif // AVX2
#ifdef LV_HAVE_NEONV7
#ifdef LV_HAVE_NEON
static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_neon(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points)
{
// phases must be normalized. Phase rotator expects a complex exponential input!
@ -334,7 +334,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_neon(lv_16s
#endif // NEON
#ifdef LV_HAVE_NEONV7
#ifdef LV_HAVE_NEON
static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_neon_vma(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points)
{
// phases must be normalized. Phase rotator expects a complex exponential input!

View File

@ -511,7 +511,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_avx(lv_16sc_t** resu
#endif
#ifdef LV_HAVE_NEONV7
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_neon(lv_16sc_t** result, const lv_16sc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, float* shifts_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_points)
{

View File

@ -271,7 +271,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_u_sse2(lv_16sc_t*
#endif /* LV_HAVE_SSE2 */
#ifdef LV_HAVE_NEONV7
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_neon(lv_16sc_t** result, const lv_16sc_t* local_code, float* rem_code_phase_chips, float code_phase_step_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_output_samples)
@ -370,6 +370,6 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_neon(lv_16sc_t**
}
}
#endif /* LV_HAVE_NEONV7 */
#endif /* LV_HAVE_NEON */
#endif /* INCLUDED_volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_H */

View File

@ -467,7 +467,7 @@ static inline void volk_gnsssdr_32f_index_max_32u_generic(uint32_t* target, cons
#endif /*LV_HAVE_GENERIC*/
#ifdef LV_HAVE_NEONV7
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
static inline void volk_gnsssdr_32f_index_max_32u_neon(uint32_t* target, const float* src0, uint32_t num_points)
@ -532,6 +532,6 @@ static inline void volk_gnsssdr_32f_index_max_32u_neon(uint32_t* target, const f
}
}
#endif /*LV_HAVE_NEONV7*/
#endif /* LV_HAVE_NEON */
#endif /*INCLUDED_volk_gnsssdr_32f_index_max_32u_H*/
#endif /* INCLUDED_volk_gnsssdr_32f_index_max_32u_H */

View File

@ -232,7 +232,7 @@ static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_u_avx(float* result, c
}
#endif
#ifdef LV_HAVE_NEONV7
#ifdef LV_HAVE_NEON
static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_neon(float* result, const float* local_code, unsigned int num_points)
{
int code_length_chips = 2046;

View File

@ -627,7 +627,7 @@ static inline void volk_gnsssdr_32f_sincos_32fc_generic_fxpt(lv_32fc_t* out, con
#endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_NEONV7
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
static inline void volk_gnsssdr_32f_sincos_32fc_neon(lv_32fc_t* out, const float* in, unsigned int num_points)
@ -731,7 +731,7 @@ static inline void volk_gnsssdr_32f_sincos_32fc_neon(lv_32fc_t* out, const float
}
}
#endif /* LV_HAVE_NEONV7 */
#endif /* LV_HAVE_NEON */
#endif /* INCLUDED_volk_gnsssdr_32f_sincos_32fc_H */

View File

@ -600,7 +600,7 @@ static inline void volk_gnsssdr_32f_xn_high_dynamics_resampler_32f_xn_u_avx(floa
#endif
//
//
// #ifdef LV_HAVE_NEONV7
// #ifdef LV_HAVE_NEON
// #include <arm_neon.h>
//
// static inline void volk_gnsssdr_32f_xn_high_dynamics_resampler_32f_xn_neon(float** result, const float* local_code, float rem_code_phase_chips, float code_phase_step_chips, float* shifts_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_points)

View File

@ -513,7 +513,7 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_u_avx(float** result, co
#endif
#ifdef LV_HAVE_NEONV7
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_neon(float** result, const float* local_code, float rem_code_phase_chips, float code_phase_step_chips, float* shifts_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_points)

View File

@ -372,7 +372,7 @@ static inline void volk_gnsssdr_32fc_convert_16ic_a_avx2(lv_16sc_t* outputVector
#endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_NEONV7
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
static inline void volk_gnsssdr_32fc_convert_16ic_neon(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
@ -436,7 +436,7 @@ static inline void volk_gnsssdr_32fc_convert_16ic_neon(lv_16sc_t* outputVector,
}
}
#endif /* LV_HAVE_NEONV7 */
#endif /* LV_HAVE_NEON */
#ifdef LV_HAVE_GENERIC

View File

@ -359,7 +359,7 @@ static inline void volk_gnsssdr_32fc_convert_8ic_a_sse2(lv_8sc_t* outputVector,
#endif /* LV_HAVE_SSE2 */
#ifdef LV_HAVE_NEONV7
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
static inline void volk_gnsssdr_32fc_convert_8ic_neon(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points)
@ -450,6 +450,6 @@ static inline void volk_gnsssdr_32fc_convert_8ic_neon(lv_8sc_t* outputVector, co
}
}
#endif /* LV_HAVE_NEONV7 */
#endif /* LV_HAVE_NEON */
#endif /* INCLUDED_volk_gnsssdr_32fc_convert_8ic_H */

View File

@ -292,7 +292,7 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_u_avx2(lv_32fc_t* re
#endif
#ifdef LV_HAVE_NEONV7
#ifdef LV_HAVE_NEON
static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_neon(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points)
{
int code_length_chips = 2046;

View File

@ -633,7 +633,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_t
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_NEONV7
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_neon(lv_32fc_t* result, const lv_32fc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_32fc_t** in_a, int num_a_vectors, unsigned int num_points)
@ -786,6 +786,6 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_neon(lv_32fc_t*
(*phase) = _phase;
}
#endif /* LV_HAVE_NEONV7 */
#endif /* LV_HAVE_NEON */
#endif /* INCLUDED_volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_H */

View File

@ -206,7 +206,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_a_avx(lv_32
#endif // AVX
#ifdef LV_HAVE_NEONV7
#ifdef LV_HAVE_NEON
static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_neon(lv_32fc_t* result, const lv_32fc_t* local_code, const lv_32fc_t* in, unsigned int num_points)
{
// phases must be normalized. Phase rotator expects a complex exponential input!
@ -233,7 +233,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_neon(lv_32f
volk_gnsssdr_free(in_a);
}
#endif // AVX
#endif // LV_HAVE_NEON
#endif // INCLUDED_volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_H

View File

@ -668,7 +668,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx2(lv_32fc_t** res
#endif
#ifdef LV_HAVE_NEONV7
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_neon(lv_32fc_t** result, const lv_32fc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, float* shifts_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_points)

View File

@ -334,7 +334,7 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_u_orc(lv_8sc_t* cVector, const
#endif /* LV_HAVE_ORC */
#ifdef LV_HAVE_NEONV7
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
static inline void volk_gnsssdr_8ic_conjugate_8ic_neon(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points)
@ -360,6 +360,6 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_neon(lv_8sc_t* cVector, const
*c++ = lv_conj(*a++);
}
}
#endif /* LV_HAVE_NEONV7 */
#endif /* LV_HAVE_NEON */
#endif /* INCLUDED_volk_gnsssdr_8ic_conjugate_8ic_H */

View File

@ -422,7 +422,7 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_orc(lv_8sc_t* result, cons
#endif /* LV_HAVE_ORC */
#ifdef LV_HAVE_NEONV7
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_neon(lv_8sc_t* result, const lv_8sc_t* in_a, const lv_8sc_t* in_b, unsigned int num_points)
@ -481,6 +481,6 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_neon(lv_8sc_t* result, const
*result += dotProduct;
}
#endif /* LV_HAVE_NEONV7 */
#endif /* LV_HAVE_NEON */
#endif /*INCLUDED_volk_gnsssdr_8ic_x2_dot_prod_8ic_H*/

View File

@ -816,7 +816,7 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_u_avx2(lv_32fc_t *out, const fl
#endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_NEONV7
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
static inline void volk_gnsssdr_s32f_sincos_32fc_neon(lv_32fc_t *out, const float phase_inc, float *phase, unsigned int num_points)
@ -930,6 +930,6 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_neon(lv_32fc_t *out, const floa
(*phase) = _phase;
}
#endif /* LV_HAVE_NEONV7 */
#endif /* LV_HAVE_NEON */
#endif /* INCLUDED_volk_gnsssdr_s32f_sincos_32fc_H */

View File

@ -89,13 +89,13 @@ static inline void volk_gnsssdr_s32f_sincospuppet_32fc_u_avx2(lv_32fc_t* out, co
#endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_NEONV7
#ifdef LV_HAVE_NEON
static inline void volk_gnsssdr_s32f_sincospuppet_32fc_neon(lv_32fc_t* out, const float phase_inc, unsigned int num_points)
{
float phase[1];
phase[0] = 3;
volk_gnsssdr_s32f_sincos_32fc_neon(out, phase_inc, phase, num_points);
}
#endif /* LV_HAVE_NEONV7 */
#endif /* LV_HAVE_NEON */
#endif /* INCLUDED_volk_gnsssdr_s32f_sincospuppet_32fc_H */