From c4db5476fb84aae24fe856d4ce78497dffea18bd Mon Sep 17 00:00:00 2001
From: Carles Fernandez
Date: Mon, 19 Aug 2019 10:49:37 +0200
Subject: [PATCH 1/3] Always have a space between // and comment

---
 .../volk_gnsssdr/cmake/msvc/sys/time.h        |    6 +-
 .../volk_gnsssdr/volk_gnsssdr_common.h        |   28 +-
 .../volk_gnsssdr_16i_xn_resampler_16i_xn.h    |   18 +-
 ...nsssdr_16ic_16i_rotator_dot_prod_16ic_xn.h |  906 +-----------------
 ...dr_16ic_16i_rotator_dotprodxnpuppet_16ic.h |  152 ---
 .../volk_gnsssdr_16ic_conjugate_16ic.h        |   30 -
 .../volk_gnsssdr_16ic_resampler_fast_16ic.h   |   80 +-
 .../volk_gnsssdr_16ic_s32fc_x2_rotator_16ic.h |  112 +--
 .../volk_gnsssdr_16ic_x2_dot_prod_16ic.h      |   22 +-
 .../volk_gnsssdr_16ic_x2_dot_prod_16ic_xn.h   |   28 +-
 .../volk_gnsssdr_16ic_x2_multiply_16ic.h      |   18 +-
 ...gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn.h |  228 ++---
 .../volk_gnsssdr_16ic_xn_resampler_16ic_xn.h  |   22 +-
 ...k_gnsssdr_16ic_xn_resampler_fast_16ic_xn.h |   94 +-
 ..._32f_high_dynamics_resamplerxnpuppet_32f.h |   29 +-
 .../volk_gnsssdr_32f_sincos_32fc.h            |   11 +-
 ...dr_32f_xn_high_dynamics_resampler_32f_xn.h |   20 +-
 .../volk_gnsssdr_32f_xn_resampler_32f_xn.h    |   20 +-
 ...nsssdr_32fc_32f_rotator_dot_prod_32fc_xn.h |   14 +-
 .../volk_gnsssdr_32fc_convert_16ic.h          |    4 +-
 ...gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn.h |   16 +-
 .../volk_gnsssdr_32fc_xn_resampler_32fc_xn.h  |   34 +-
 .../volk_gnsssdr_8i_index_max_16u.h           |    6 +-
 .../volk_gnsssdr/volk_gnsssdr_8i_max_s8i.h    |    2 +-
 .../volk_gnsssdr_8ic_conjugate_8ic.h          |    4 +-
 .../volk_gnsssdr_8ic_magnitude_squared_8i.h   |   80 --
 .../volk_gnsssdr_s32f_sincos_32fc.h           |   17 +-
 27 files changed, 452 insertions(+), 1549 deletions(-)

diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/cmake/msvc/sys/time.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/cmake/msvc/sys/time.h
index fc62a7041..35cea7efd 100644
--- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/cmake/msvc/sys/time.h
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/cmake/msvc/sys/time.h
@@ -9,9 +9,9 @@
 #define NOMINMAX
 #endif

-//http://social.msdn.microsoft.com/Forums/en/vcgeneral/thread/430449b3-f6dd-4e18-84de-eebd26a8d668
+// http://social.msdn.microsoft.com/Forums/en/vcgeneral/thread/430449b3-f6dd-4e18-84de-eebd26a8d668
 #include < time.h >
-#include <windows.h>  //I've omitted this line.
+#include <windows.h>  // I've omitted this line.
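/* The gettimeofday() shim patched in this hunk converts a Windows FILETIME,
   which counts 100 ns ticks since 1601-01-01, into a Unix-epoch timeval.
   Below is a minimal, dimensionally consistent sketch of that arithmetic
   with a hypothetical helper name; it is condensed, not a quote of the
   file, and the shim itself also fetches the FILETIME from the Win32 API
   and handles the timezone argument. */
#include <stdint.h>

static void filetime_to_timeval(uint64_t ticks_100ns, long *sec, long *usec)
{
    uint64_t us = ticks_100ns / 10u;  /* 100 ns ticks -> microseconds */
    us -= 11644473600000000ULL;       /* DELTA_EPOCH_IN_MICROSECS: 1601 -> 1970 */
    *sec = (long)(us / 1000000UL);
    *usec = (long)(us % 1000000UL);
}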
#if defined(_MSC_VER) || defined(_MSC_EXTENSIONS) #define DELTA_EPOCH_IN_MICROSECS 11644473600000000Ui64 #else @@ -51,7 +51,7 @@ static inline int gettimeofday(struct timeval *tv, struct timezone *tz) tmpres <<= 32; tmpres |= ft.dwLowDateTime; - /*converting file time to unix epoch*/ + /* converting file time to unix epoch*/ tmpres -= DELTA_EPOCH_IN_MICROSECS; tv->tv_sec = (long)(tmpres / 1000000UL); tv->tv_usec = (long)(tmpres % 1000000UL); diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_common.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_common.h index bb2029f9a..f325d7c1f 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_common.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_common.h @@ -24,9 +24,9 @@ #ifndef INCLUDED_LIBVOLK_GNSSSDR_COMMON_H #define INCLUDED_LIBVOLK_GNSSSDR_COMMON_H -//////////////////////////////////////////////////////////////////////// +// // Cross-platform attribute macros not included in VOLK -//////////////////////////////////////////////////////////////////////// +// #if defined __GNUC__ #define __VOLK_GNSSSDR_PREFETCH(addr) __builtin_prefetch(addr) #define __VOLK_GNSSSDR_PREFETCH_LOCALITY(addr, rw, locality) __builtin_prefetch(addr, rw, locality) @@ -41,9 +41,9 @@ #ifndef INCLUDED_LIBVOLK_COMMON_H #define INCLUDED_LIBVOLK_COMMON_H -//////////////////////////////////////////////////////////////////////// +// // Cross-platform attribute macros -//////////////////////////////////////////////////////////////////////// +// #if defined __GNUC__ #define __VOLK_ATTR_ALIGNED(x) __attribute__((aligned(x))) #define __VOLK_ATTR_UNUSED __attribute__((unused)) @@ -78,18 +78,18 @@ #define __VOLK_VOLATILE __volatile__ #endif -//////////////////////////////////////////////////////////////////////// +// // Ignore annoying warnings in MSVC -//////////////////////////////////////////////////////////////////////// +// #if defined(_MSC_VER) -#pragma warning(disable : 4244) //'conversion' conversion from 'type1' to 'type2', possible loss of data -#pragma warning(disable : 4305) //'identifier' : truncation from 'type1' to 'type2' +#pragma warning(disable : 4244) // 'conversion' conversion from 'type1' to 'type2', possible loss of data +#pragma warning(disable : 4305) // 'identifier' : truncation from 'type1' to 'type2' #endif -//////////////////////////////////////////////////////////////////////// +// // C-linkage declaration macros // FIXME: due to the usage of complex.h, require gcc for c-linkage -//////////////////////////////////////////////////////////////////////// +// #if defined(__cplusplus) && (__GNUC__) #define __VOLK_DECL_BEGIN \ extern "C" \ @@ -100,19 +100,19 @@ #define __VOLK_DECL_END #endif -//////////////////////////////////////////////////////////////////////// +// // Define VOLK_API for library symbols // http://gcc.gnu.org/wiki/Visibility -//////////////////////////////////////////////////////////////////////// +// #ifdef volk_gnsssdr_EXPORTS #define VOLK_API __VOLK_ATTR_EXPORT #else #define VOLK_API __VOLK_ATTR_IMPORT #endif -//////////////////////////////////////////////////////////////////////// +// // The bit128 union used by some -//////////////////////////////////////////////////////////////////////// +// #include #ifdef LV_HAVE_SSE diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16i_xn_resampler_16i_xn.h 
b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16i_xn_resampler_16i_xn.h index 6c9320311..c63e5151e 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16i_xn_resampler_16i_xn.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16i_xn_resampler_16i_xn.h @@ -84,7 +84,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_generic(int16_t** result { // resample code for current tap local_code_chip_index = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); - //Take into account that in multitap correlators, the shifts can be negative! + // Take into account that in multitap correlators, the shifts can be negative! if (local_code_chip_index < 0) local_code_chip_index += (int)code_length_chips * (abs(local_code_chip_index) / code_length_chips + 1); local_code_chip_index = local_code_chip_index % code_length_chips; result[current_correlator_tap][n] = local_code[local_code_chip_index]; @@ -150,7 +150,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_a_sse4_1(int16_t** resul { // resample code for current tap local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); - //Take into account that in multitap correlators, the shifts can be negative! + // Take into account that in multitap correlators, the shifts can be negative! if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); local_code_chip_index_ = local_code_chip_index_ % code_length_chips; _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; @@ -217,7 +217,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_u_sse4_1(int16_t** resul { // resample code for current tap local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); - //Take into account that in multitap correlators, the shifts can be negative! + // Take into account that in multitap correlators, the shifts can be negative! if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); local_code_chip_index_ = local_code_chip_index_ % code_length_chips; _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; @@ -288,7 +288,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_a_sse3(int16_t** result, { // resample code for current tap local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); - //Take into account that in multitap correlators, the shifts can be negative! + // Take into account that in multitap correlators, the shifts can be negative! 
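/* These resampler hunks all guard the same computation, so a standalone
   sketch may help. The helper below is hypothetical (assembled from the
   kernel bodies, not a function in the patch); it shows why the index must
   be made positive before the modulo: C's % operator keeps the sign of its
   dividend. */
#include <math.h>
#include <stdlib.h>

static int resampled_code_index(float code_phase_step_chips, int n, float shift_chips,
    float rem_code_phase_chips, int code_length_chips)
{
    int idx = (int)floor(code_phase_step_chips * (float)n + shift_chips - rem_code_phase_chips);
    if (idx < 0)  /* multitap correlators can make shift_chips negative */
        {
            idx += code_length_chips * (abs(idx) / code_length_chips + 1);
        }
    return idx % code_length_chips;  /* now guaranteed to land in [0, code_length_chips) */
}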
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); local_code_chip_index_ = local_code_chip_index_ % code_length_chips; _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; @@ -359,7 +359,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_u_sse3(int16_t** result, { // resample code for current tap local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); - //Take into account that in multitap correlators, the shifts can be negative! + // Take into account that in multitap correlators, the shifts can be negative! if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); local_code_chip_index_ = local_code_chip_index_ % code_length_chips; _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; @@ -437,7 +437,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_a_avx(int16_t** result, { // resample code for current tap local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); - //Take into account that in multitap correlators, the shifts can be negative! + // Take into account that in multitap correlators, the shifts can be negative! if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); local_code_chip_index_ = local_code_chip_index_ % code_length_chips; _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; @@ -515,7 +515,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_u_avx(int16_t** result, { // resample code for current tap local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); - //Take into account that in multitap correlators, the shifts can be negative! + // Take into account that in multitap correlators, the shifts can be negative! if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); local_code_chip_index_ = local_code_chip_index_ % code_length_chips; _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; @@ -568,7 +568,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_neon(int16_t** result, c aux = vmulq_f32(code_phase_step_chips_reg, indexn); aux = vaddq_f32(aux, aux2); - //floor + // floor i = vcvtq_s32_f32(aux); fi = vcvtq_f32_s32(i); igx = vcgtq_f32(fi, aux); @@ -600,7 +600,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_neon(int16_t** result, c __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][n], 1, 0); // resample code for current tap local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); - //Take into account that in multitap correlators, the shifts can be negative! + // Take into account that in multitap correlators, the shifts can be negative! 
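/* The NEON kernel above emulates floorf() with the truncate-then-correct
   sequence tagged "// floor". A scalar restatement (hypothetical helper),
   under the assumption that the igx comparison mask is later used to
   subtract one lane-wise, a step outside the quoted context: the int
   conversion truncates toward zero, so negative non-integer inputs come
   out one too high. */
static int floor_via_truncation(float x)
{
    int i = (int)x;       /* truncates toward zero, like vcvtq_s32_f32 */
    float fi = (float)i;  /* back to float, like vcvtq_f32_s32 */
    return (fi > x) ? i - 1 : i;  /* correct the negative cases (igx) */
}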
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); local_code_chip_index_ = local_code_chip_index_ % code_length_chips; _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn.h index b2a9cee32..496ab809d 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn.h @@ -74,7 +74,7 @@ #include #include #include -//#include +// #include #ifdef LV_HAVE_GENERIC @@ -90,33 +90,33 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_generic(lv_16s } for (n = 0; n < num_points; n++) { - tmp16 = *in_common++; //if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); + tmp16 = *in_common++; // if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase); tmp16 = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32))); // Regenerate phase if (n % 256 == 0) { - //printf("Phase before regeneration %i: %f,%f Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase)); + // printf("Phase before regeneration %i: %f,%f Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase)); #ifdef __cplusplus (*phase) /= std::abs((*phase)); #else (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); #endif - //printf("Phase after regeneration %i: %f,%f Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase)); + // printf("Phase after regeneration %i: %f,%f Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase)); } (*phase) *= phase_inc; for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { lv_16sc_t tmp = tmp16 * in_a[n_vec][n]; - //lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(tmp16), lv_cimag(in_a[n_vec][n]))) , sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(tmp16), lv_creal(in_a[n_vec][n])))); + // lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(tmp16), lv_cimag(in_a[n_vec][n]))) , sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(tmp16), lv_creal(in_a[n_vec][n])))); result[n_vec] = lv_cmake(sat_adds16i(lv_creal(result[n_vec]), lv_creal(tmp)), sat_adds16i(lv_cimag(result[n_vec]), lv_cimag(tmp))); } } } -#endif /*LV_HAVE_GENERIC*/ +#endif /* LV_HAVE_GENERIC */ #ifdef LV_HAVE_GENERIC @@ -138,14 +138,14 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_generic_reload { for (j = 0; j < ROTATOR_RELOAD; j++) { - tmp16 = *in_common++; //if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); + tmp16 = *in_common++; // if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase); tmp16 = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32))); 
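/* The generic kernels in this file share a rotate/accumulate/renormalize
   pattern. The sketch below isolates the phase bookkeeping, assuming a
   plain C99 complex buffer instead of the fixed-point lv_16sc_t path: the
   rotator phase should stay on the unit circle, but repeated multiplication
   by phase_inc lets rounding error grow its modulus, so it is rescaled
   every 256 samples exactly as the kernels do. */
#include <complex.h>
#include <math.h>

static void rotate_block(float complex *buf, unsigned int num_points,
    float complex *phase, float complex phase_inc)
{
    unsigned int n;
    for (n = 0; n < num_points; n++)
        {
            buf[n] *= (*phase);  /* apply the current rotator phase */
            if (n % 256 == 0)
                {
                    /* renormalize: divide the accumulator by its modulus */
                    (*phase) /= hypotf(crealf(*phase), cimagf(*phase));
                }
            (*phase) *= phase_inc;  /* advance to the next sample's phase */
        }
}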
(*phase) *= phase_inc; for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { lv_16sc_t tmp = tmp16 * in_a[n_vec][n * ROTATOR_RELOAD + j]; - //lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(tmp16), lv_cimag(in_a[n_vec][n]))) , sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(tmp16), lv_creal(in_a[n_vec][n])))); + // lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(tmp16), lv_cimag(in_a[n_vec][n]))) , sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(tmp16), lv_creal(in_a[n_vec][n])))); result[n_vec] = lv_cmake(sat_adds16i(lv_creal(result[n_vec]), lv_creal(tmp)), sat_adds16i(lv_cimag(result[n_vec]), lv_cimag(tmp))); } } @@ -153,27 +153,27 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_generic_reload #ifdef __cplusplus (*phase) /= std::abs((*phase)); #else - //(*phase) /= cabsf((*phase)); + // (*phase) /= cabsf((*phase)); (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); #endif } for (j = 0; j < num_points % ROTATOR_RELOAD; j++) { - tmp16 = *in_common++; //if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); + tmp16 = *in_common++; // if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase); tmp16 = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32))); (*phase) *= phase_inc; for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { lv_16sc_t tmp = tmp16 * in_a[n_vec][(num_points / ROTATOR_RELOAD) * ROTATOR_RELOAD + j]; - //lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(tmp16), lv_cimag(in_a[n_vec][n]))) , sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(tmp16), lv_creal(in_a[n_vec][n])))); + // lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(tmp16), lv_cimag(in_a[n_vec][n]))) , sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(tmp16), lv_creal(in_a[n_vec][n])))); result[n_vec] = lv_cmake(sat_adds16i(lv_creal(result[n_vec]), lv_creal(tmp)), sat_adds16i(lv_cimag(result[n_vec]), lv_cimag(tmp))); } } } -#endif /*LV_HAVE_GENERIC*/ +#endif /* LV_HAVE_GENERIC */ #ifdef LV_HAVE_SSE3 @@ -224,9 +224,9 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc for (number = 0; number < sse_iters; number++) { // Phase rotation on operand in_common starts here: - //printf("generic phase %i: %f,%f\n", n*4,lv_creal(*phase),lv_cimag(*phase)); - pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg - //complex 32fc multiplication b=a*two_phase_acc_reg + // printf("generic phase %i: %f,%f\n", n*4,lv_creal(*phase),lv_cimag(*phase)); + pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg + // complex 32fc multiplication b=a*two_phase_acc_reg yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh 
with ci,ci,di,di tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr @@ -235,7 +235,7 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di pc1 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic - //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg + // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr @@ -243,11 +243,11 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - //next two samples + // next two samples _in_common += 2; - pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg __VOLK_GNSSSDR_PREFETCH(_in_common + 8); - //complex 32fc multiplication b=a*two_phase_acc_reg + // complex 32fc multiplication b=a*two_phase_acc_reg yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr @@ -256,7 +256,7 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di pc2 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic - //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg + // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr @@ -267,12 +267,12 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc // store four rotated in_common samples in the register b b = _mm_packs_epi32(pc1, pc2); // convert from 32ic to 16ic - //next two samples + // next two samples _in_common += 2; for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - a = _mm_loadl_epi64((__m128i*)&(_in_a[n_vec][number * 4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg + a = _mm_loadl_epi64((__m128i*)&(_in_a[n_vec][number * 4])); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg a = _mm_unpacklo_epi16(a, a); @@ -312,12 +312,12 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc two_phase_acc_reg = _mm_div_ps(two_phase_acc_reg, tmp2); _mm_store_ps((float*)two_phase_acc, two_phase_acc_reg); - //(*phase) = lv_cmake((float*)two_phase_acc[0], (float*)two_phase_acc[1]); + // (*phase) = lv_cmake((float*)two_phase_acc[0], (float*)two_phase_acc[1]); (*phase) = two_phase_acc[0]; for (n = sse_iters * 4; n < num_points; n++) { - tmp16 = in_common[n]; //printf("a_sse phase %i: %f,%f\n", 
n,lv_creal(*phase),lv_cimag(*phase)); + tmp16 = in_common[n]; // printf("a_sse phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase); tmp16 = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32))); (*phase) *= phase_inc; @@ -325,7 +325,7 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { lv_16sc_t tmp = tmp16 * in_a[n_vec][n]; - //lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(tmp16), lv_cimag(in_a[n_vec][n]))) , sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(tmp16), lv_creal(in_a[n_vec][n])))); + // lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(tmp16), lv_cimag(in_a[n_vec][n]))) , sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(tmp16), lv_creal(in_a[n_vec][n])))); _out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); } @@ -334,254 +334,6 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc #endif /* LV_HAVE_SSE3 */ -//#ifdef LV_HAVE_SSE3 -//#include - -//static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_sse3_reload(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const int16_t** in_a, int num_a_vectors, unsigned int num_points) -//{ -//lv_16sc_t dotProduct = lv_cmake(0,0); - -//const unsigned int sse_iters = num_points / 4; -//const unsigned int ROTATOR_RELOAD = 128; -//int n_vec; -//int i; -//unsigned int number; -//unsigned int j; -//unsigned int n; - -//const int16_t** _in_a = in_a; -//const lv_16sc_t* _in_common = in_common; -//lv_16sc_t* _out = result; - -//__VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; - -//__m128i* realcacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment()); -//__m128i* imagcacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment()); - -//for (n_vec = 0; n_vec < num_a_vectors; n_vec++) -//{ -//realcacc[n_vec] = _mm_setzero_si128(); -//imagcacc[n_vec] = _mm_setzero_si128(); -//} - -//__m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl; - -//mask_imag = _mm_set_epi8(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0); -//mask_real = _mm_set_epi8(0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255); - -//// phase rotation registers -//__m128 pa, pb, two_phase_acc_reg, two_phase_inc_reg; -//__m128i pc1, pc2; -//__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_inc[2]; -//two_phase_inc[0] = phase_inc * phase_inc; -//two_phase_inc[1] = phase_inc * phase_inc; -//two_phase_inc_reg = _mm_load_ps((float*) two_phase_inc); -//__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_acc[2]; -//two_phase_acc[0] = (*phase); -//two_phase_acc[1] = (*phase) * phase_inc; -//two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc); -//__m128 yl, yh, tmp1, tmp2, tmp3; -//lv_16sc_t tmp16; -//lv_32fc_t tmp32; - -//for (number = 0; number < sse_iters / ROTATOR_RELOAD; ++number) -//{ -//for (j = 0; j < ROTATOR_RELOAD; j++) -//{ -//// Phase rotation on operand in_common starts here: -////printf("generic phase %i: %f,%f\n", n*4,lv_creal(*phase),lv_cimag(*phase)); -//pa = 
_mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg -////complex 32fc multiplication b=a*two_phase_acc_reg -//yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr -//yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di -//tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr -//pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br -//tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di -//pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di -//pc1 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic - -////complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg -//yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr -//yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di -//tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr -//tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br -//tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di -//two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - -////next two samples -//_in_common += 2; -//pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg -//__VOLK_GNSSSDR_PREFETCH(_in_common + 8); -////complex 32fc multiplication b=a*two_phase_acc_reg -//yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr -//yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di -//tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr -//pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br -//tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di -//pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di -//pc2 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic - -////complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg -//yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr -//yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di -//tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr -//tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br -//tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di -//two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - -//// store four rotated in_common samples in the register b -//b = _mm_packs_epi32(pc1, pc2);// convert from 32ic to 16ic - -////next two samples -//_in_common += 2; - -//for (n_vec = 0; n_vec < num_a_vectors; n_vec++) -//{ -//a = _mm_load_si128((__m128i*)&(_in_a[n_vec][(number * ROTATOR_RELOAD + j) * 4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg - -//c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... - -//c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. -//real = _mm_subs_epi16(c, c_sr); - -//b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i .... -//a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i .... - -//imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... 
-//imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... - -//imag = _mm_adds_epi16(imag1, imag2); - -//realcacc[n_vec] = _mm_adds_epi16(realcacc[n_vec], real); -//imagcacc[n_vec] = _mm_adds_epi16(imagcacc[n_vec], imag); -//} -//} -//// regenerate phase -//tmp1 = _mm_mul_ps(two_phase_acc_reg, two_phase_acc_reg); -//tmp2 = _mm_hadd_ps(tmp1, tmp1); -//tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8); -//tmp2 = _mm_sqrt_ps(tmp1); -//two_phase_acc_reg = _mm_div_ps(two_phase_acc_reg, tmp2); -//} - -//for (j = 0; j < sse_iters % ROTATOR_RELOAD; j++) -//{ -//pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg -////complex 32fc multiplication b=a*two_phase_acc_reg -//yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr -//yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di -//tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr -//pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br -//tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di -//pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di -//pc1 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic - -////complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg -//yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr -//yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di -//tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr -//tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br -//tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di -//two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - -////next two samples -//_in_common += 2; -//pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg -//__VOLK_GNSSSDR_PREFETCH(_in_common + 8); -////complex 32fc multiplication b=a*two_phase_acc_reg -//yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr -//yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di -//tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr -//pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br -//tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di -//pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di -//pc2 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic - -////complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg -//yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr -//yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di -//tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr -//tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br -//tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di -//two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - -//// store four rotated in_common samples in the register b -//b = _mm_packs_epi32(pc1, pc2);// convert from 32ic to 16ic - -////next two samples -//_in_common += 2; - -//for (n_vec = 0; n_vec < num_a_vectors; n_vec++) 
-//{ -//a = _mm_load_si128((__m128i*)&(_in_a[n_vec][((sse_iters / ROTATOR_RELOAD) * ROTATOR_RELOAD + j) * 4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg - -//c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... - -//c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. -//real = _mm_subs_epi16(c, c_sr); - -//b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i .... -//a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i .... - -//imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... -//imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... - -//imag = _mm_adds_epi16(imag1, imag2); - -//realcacc[n_vec] = _mm_adds_epi16(realcacc[n_vec], real); -//imagcacc[n_vec] = _mm_adds_epi16(imagcacc[n_vec], imag); -//} -//} - -//for (n_vec = 0; n_vec < num_a_vectors; n_vec++) -//{ -//realcacc[n_vec] = _mm_and_si128(realcacc[n_vec], mask_real); -//imagcacc[n_vec] = _mm_and_si128(imagcacc[n_vec], mask_imag); - -//a = _mm_or_si128(realcacc[n_vec], imagcacc[n_vec]); - -//_mm_store_si128((__m128i*)dotProductVector, a); // Store the results back into the dot product vector -//dotProduct = lv_cmake(0,0); -//for (i = 0; i < 4; ++i) -//{ -//dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[i])), -//sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i]))); -//} -//_out[n_vec] = dotProduct; -//} - -//volk_gnsssdr_free(realcacc); -//volk_gnsssdr_free(imagcacc); - -//tmp1 = _mm_mul_ps(two_phase_acc_reg, two_phase_acc_reg); -//tmp2 = _mm_hadd_ps(tmp1, tmp1); -//tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8); -//tmp2 = _mm_sqrt_ps(tmp1); -//two_phase_acc_reg = _mm_div_ps(two_phase_acc_reg, tmp2); - -//_mm_store_ps((float*)two_phase_acc, two_phase_acc_reg); -////(*phase) = lv_cmake((float*)two_phase_acc[0], (float*)two_phase_acc[1]); -//(*phase) = two_phase_acc[0]; - -//for(n = sse_iters * 4; n < num_points; n++) -//{ -//tmp16 = in_common[n]; //printf("a_sse phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); -//tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase); -//tmp16 = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32))); -//(*phase) *= phase_inc; - -//for (n_vec = 0; n_vec < num_a_vectors; n_vec++) -//{ -//lv_16sc_t tmp = tmp16 * in_a[n_vec][n]; -////lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(tmp16), lv_cimag(in_a[n_vec][n]))) , sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(tmp16), lv_creal(in_a[n_vec][n])))); -//_out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), -//sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); -//} -//} - -//} -//#endif [> LV_HAVE_SSE3 <] - #ifdef LV_HAVE_SSE3 #include @@ -631,9 +383,9 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc for (number = 0; number < sse_iters; number++) { // Phase rotation on operand in_common starts here: - //printf("generic phase %i: %f,%f\n", n*4,lv_creal(*phase),lv_cimag(*phase)); - pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg - //complex 32fc multiplication b=a*two_phase_acc_reg + // printf("generic phase %i: %f,%f\n", n*4,lv_creal(*phase),lv_cimag(*phase)); + pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), 
(float)(lv_creal(_in_common[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg + // complex 32fc multiplication b=a*two_phase_acc_reg yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr @@ -642,7 +394,7 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di pc1 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic - //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg + // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr @@ -650,11 +402,11 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - //next two samples + // next two samples _in_common += 2; - pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg __VOLK_GNSSSDR_PREFETCH(_in_common + 8); - //complex 32fc multiplication b=a*two_phase_acc_reg + // complex 32fc multiplication b=a*two_phase_acc_reg yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr @@ -663,7 +415,7 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di pc2 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic - //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg + // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr @@ -674,12 +426,12 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc // store four rotated in_common samples in the register b b = _mm_packs_epi32(pc1, pc2); // convert from 32ic to 16ic - //next two samples + // next two samples _in_common += 2; for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - a = _mm_loadl_epi64((__m128i*)&(_in_a[n_vec][number * 4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg + a = _mm_loadl_epi64((__m128i*)&(_in_a[n_vec][number * 4])); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg a = _mm_unpacklo_epi16(a, a); @@ -719,12 +471,12 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc two_phase_acc_reg = _mm_div_ps(two_phase_acc_reg, tmp2); _mm_store_ps((float*)two_phase_acc, two_phase_acc_reg); - //(*phase) = 
lv_cmake((float*)two_phase_acc[0], (float*)two_phase_acc[1]); + // (*phase) = lv_cmake((float*)two_phase_acc[0], (float*)two_phase_acc[1]); (*phase) = two_phase_acc[0]; for (n = sse_iters * 4; n < num_points; n++) { - tmp16 = in_common[n]; //printf("a_sse phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); + tmp16 = in_common[n]; // printf("a_sse phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase); tmp16 = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32))); (*phase) *= phase_inc; @@ -732,7 +484,7 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { lv_16sc_t tmp = tmp16 * in_a[n_vec][n]; - //lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(tmp16), lv_cimag(in_a[n_vec][n]))) , sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(tmp16), lv_creal(in_a[n_vec][n])))); + // lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(tmp16), lv_cimag(in_a[n_vec][n]))) , sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(tmp16), lv_creal(in_a[n_vec][n])))); _out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); } @@ -801,7 +553,7 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc __m256i a2, b2, c, c1, c2, perm_idx; perm_idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); - //perm_idx = _mm256_set_epi32( 0, 1, 4, 5, 2, 3, 6, 7); + // perm_idx = _mm256_set_epi32( 0, 1, 4, 5, 2, 3, 6, 7); for (number = 0; number < avx2_iters; number++) { @@ -809,24 +561,24 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc ai = _mm256_cvtepi16_epi32(a128); a = _mm256_cvtepi32_ps(ai); - //complex 32fc multiplication b=a*two_phase_acc_reg + // complex 32fc multiplication b=a*two_phase_acc_reg b = _mm256_complexmul_ps(a, four_phase_acc_reg); c1 = _mm256_cvtps_epi32(b); // convert from 32fc to 32ic - //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg + // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg four_phase_acc_reg = _mm256_complexmul_ps(four_phase_inc_reg, four_phase_acc_reg); - //next four samples + // next four samples _in_common += 4; a128 = _mm_load_si128((__m128i*)_in_common); ai = _mm256_cvtepi16_epi32(a128); a = _mm256_cvtepi32_ps(ai); - //complex 32fc multiplication b=a*two_phase_acc_reg + // complex 32fc multiplication b=a*two_phase_acc_reg b = _mm256_complexmul_ps(a, four_phase_acc_reg); c2 = _mm256_cvtps_epi32(b); // convert from 32fc to 32ic - //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg + // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg four_phase_acc_reg = _mm256_complexmul_ps(four_phase_inc_reg, four_phase_acc_reg); __VOLK_GNSSSDR_PREFETCH(_in_common + 16); @@ -955,7 +707,7 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_u_avx2(lv_16sc __m256i a2, b2, c, c1, c2, perm_idx; perm_idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); - //perm_idx = _mm256_set_epi32( 0, 1, 4, 5, 2, 3, 6, 7); + // perm_idx = _mm256_set_epi32( 0, 1, 4, 5, 2, 3, 6, 7); for (number = 0; number < avx2_iters; number++) { @@ -963,35 +715,33 @@ static inline 
void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_u_avx2(lv_16sc ai = _mm256_cvtepi16_epi32(a128); a = _mm256_cvtepi32_ps(ai); - //complex 32fc multiplication b=a*two_phase_acc_reg + // complex 32fc multiplication b=a*two_phase_acc_reg b = _mm256_complexmul_ps(a, four_phase_acc_reg); c1 = _mm256_cvtps_epi32(b); // convert from 32fc to 32ic - //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg + // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg four_phase_acc_reg = _mm256_complexmul_ps(four_phase_inc_reg, four_phase_acc_reg); - //next four samples + // next four samples _in_common += 4; a128 = _mm_loadu_si128((__m128i*)_in_common); ai = _mm256_cvtepi16_epi32(a128); a = _mm256_cvtepi32_ps(ai); - //complex 32fc multiplication b=a*two_phase_acc_reg + // complex 32fc multiplication b=a*two_phase_acc_reg b = _mm256_complexmul_ps(a, four_phase_acc_reg); c2 = _mm256_cvtps_epi32(b); // convert from 32fc to 32ic - //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg + // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg four_phase_acc_reg = _mm256_complexmul_ps(four_phase_inc_reg, four_phase_acc_reg); __VOLK_GNSSSDR_PREFETCH(_in_common + 16); - // Store and convert 32ic to 16ic: b2 = _mm256_packs_epi32(c1, c2); b2 = _mm256_permutevar8x32_epi32(b2, perm_idx); - _in_common += 4; for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { @@ -1049,558 +799,4 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_u_avx2(lv_16sc } #endif /* LV_HAVE_AVX2 */ -//#ifdef LV_HAVE_NEONV7 -//#include - -//static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_neon(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const int16_t** in_a, int num_a_vectors, unsigned int num_points) -//{ -//const unsigned int neon_iters = num_points / 4; - -//const int16_t** _in_a = in_a; -//const lv_16sc_t* _in_common = in_common; -//lv_16sc_t* _out = result; -//int n_vec; -//int i; -//unsigned int number; -//unsigned int n; -//lv_16sc_t tmp16_, tmp; -//lv_32fc_t tmp32_; - -//if (neon_iters > 0) -//{ -//lv_16sc_t dotProduct = lv_cmake(0,0); -//float arg_phase0 = cargf(*phase); -//float arg_phase_inc = cargf(phase_inc); -//float phase_est; - -//lv_32fc_t ___phase4 = phase_inc * phase_inc * phase_inc * phase_inc; -//__VOLK_ATTR_ALIGNED(16) float32_t __phase4_real[4] = { lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4) }; -//__VOLK_ATTR_ALIGNED(16) float32_t __phase4_imag[4] = { lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4) }; - -//float32x4_t _phase4_real = vld1q_f32(__phase4_real); -//float32x4_t _phase4_imag = vld1q_f32(__phase4_imag); - -//lv_32fc_t phase2 = (lv_32fc_t)(*phase) * phase_inc; -//lv_32fc_t phase3 = phase2 * phase_inc; -//lv_32fc_t phase4 = phase3 * phase_inc; - -//__VOLK_ATTR_ALIGNED(16) float32_t __phase_real[4] = { lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) }; -//__VOLK_ATTR_ALIGNED(16) float32_t __phase_imag[4] = { lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) }; - -//float32x4_t _phase_real = vld1q_f32(__phase_real); -//float32x4_t _phase_imag = vld1q_f32(__phase_imag); - -//int16x4x2_t a_val, b_val, c_val; -//__VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; -//float32x4_t half = vdupq_n_f32(0.5f); -//int16x4x2_t tmp16; -//int32x4x2_t tmp32i; - -//float32x4x2_t tmp32f, tmp32_real, tmp32_imag; 
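/* The commented-out NEON kernels being removed below all round float to int
   with a sign/PlusHalf/Round sequence. A scalar restatement (hypothetical
   helper): vcvtq_s32_f32 truncates toward zero, so the code adds 0.5 and
   then subtracts 1.0 for negative inputs (the sign bit shifted down and
   converted back to float), which yields round-half-away-from-zero; as the
   removed comments note, on __aarch64__ a single vcvtaq_s32_f32 would do
   the same job. */
static int round_half_away_from_zero(float x)
{
    float sign = (x < 0.0f) ? 1.0f : 0.0f;  /* mirrors the sign-bit shift */
    return (int)(x + 0.5f - sign);          /* truncation completes the rounding */
}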
-//float32x4_t sign, PlusHalf, Round; - -//int16x4x2_t* accumulator = (int16x4x2_t*)volk_gnsssdr_malloc(num_a_vectors * sizeof(int16x4x2_t), volk_gnsssdr_get_alignment()); - -//for(n_vec = 0; n_vec < num_a_vectors; n_vec++) -//{ -//accumulator[n_vec].val[0] = vdup_n_s16(0); -//accumulator[n_vec].val[1] = vdup_n_s16(0); -//} - -//for(number = 0; number < neon_iters; number++) -//{ -//[> load 4 complex numbers (int 16 bits each component) <] -//tmp16 = vld2_s16((int16_t*)_in_common); -//__VOLK_GNSSSDR_PREFETCH(_in_common + 8); -//_in_common += 4; - -//[> promote them to int 32 bits <] -//tmp32i.val[0] = vmovl_s16(tmp16.val[0]); -//tmp32i.val[1] = vmovl_s16(tmp16.val[1]); - -//[> promote them to float 32 bits <] -//tmp32f.val[0] = vcvtq_f32_s32(tmp32i.val[0]); -//tmp32f.val[1] = vcvtq_f32_s32(tmp32i.val[1]); - -//[> complex multiplication of four complex samples (float 32 bits each component) <] -//tmp32_real.val[0] = vmulq_f32(tmp32f.val[0], _phase_real); -//tmp32_real.val[1] = vmulq_f32(tmp32f.val[1], _phase_imag); -//tmp32_imag.val[0] = vmulq_f32(tmp32f.val[0], _phase_imag); -//tmp32_imag.val[1] = vmulq_f32(tmp32f.val[1], _phase_real); - -//tmp32f.val[0] = vsubq_f32(tmp32_real.val[0], tmp32_real.val[1]); -//tmp32f.val[1] = vaddq_f32(tmp32_imag.val[0], tmp32_imag.val[1]); - -//[> downcast results to int32 <] -//[> in __aarch64__ we can do that with vcvtaq_s32_f32(ret1); vcvtaq_s32_f32(ret2); <] -//sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(tmp32f.val[0]), 31))); -//PlusHalf = vaddq_f32(tmp32f.val[0], half); -//Round = vsubq_f32(PlusHalf, sign); -//tmp32i.val[0] = vcvtq_s32_f32(Round); - -//sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(tmp32f.val[1]), 31))); -//PlusHalf = vaddq_f32(tmp32f.val[1], half); -//Round = vsubq_f32(PlusHalf, sign); -//tmp32i.val[1] = vcvtq_s32_f32(Round); - -//[> downcast results to int16 <] -//tmp16.val[0] = vqmovn_s32(tmp32i.val[0]); -//tmp16.val[1] = vqmovn_s32(tmp32i.val[1]); - -//[> compute next four phases <] -//tmp32_real.val[0] = vmulq_f32(_phase_real, _phase4_real); -//tmp32_real.val[1] = vmulq_f32(_phase_imag, _phase4_imag); -//tmp32_imag.val[0] = vmulq_f32(_phase_real, _phase4_imag); -//tmp32_imag.val[1] = vmulq_f32(_phase_imag, _phase4_real); - -//_phase_real = vsubq_f32(tmp32_real.val[0], tmp32_real.val[1]); -//_phase_imag = vaddq_f32(tmp32_imag.val[0], tmp32_imag.val[1]); - -//for (n_vec = 0; n_vec < num_a_vectors; n_vec++) -//{ -//a_val = vld2_s16((int16_t*)&(_in_a[n_vec][number*4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg -////__VOLK_GNSSSDR_PREFETCH(&_in_a[n_vec][number*4] + 8); - -//// multiply the real*real and imag*imag to get real result -//// a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r -//b_val.val[0] = vmul_s16(a_val.val[0], tmp16.val[0]); -//// a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i -//b_val.val[1] = vmul_s16(a_val.val[1], tmp16.val[1]); -//c_val.val[0] = vqsub_s16(b_val.val[0], b_val.val[1]); - -//// Multiply cross terms to get the imaginary result -//// a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i -//b_val.val[0] = vmul_s16(a_val.val[0], tmp16.val[1]); -//// a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r -//b_val.val[1] = vmul_s16(a_val.val[1], tmp16.val[0]); -//c_val.val[1] = vqadd_s16(b_val.val[0], b_val.val[1]); - -//accumulator[n_vec].val[0] = vqadd_s16(accumulator[n_vec].val[0], c_val.val[0]); -//accumulator[n_vec].val[1] = vqadd_s16(accumulator[n_vec].val[1], c_val.val[1]); -//} -//// Regenerate phase -//if ((number % 256) == 0) -//{ -//phase_est = arg_phase0 + (number + 1) * 4 * arg_phase_inc; - -//*phase = lv_cmake(cos(phase_est), 
sin(phase_est)); -//phase2 = (lv_32fc_t)(*phase) * phase_inc; -//phase3 = phase2 * phase_inc; -//phase4 = phase3 * phase_inc; - -//__VOLK_ATTR_ALIGNED(16) float32_t ____phase_real[4] = { lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) }; -//__VOLK_ATTR_ALIGNED(16) float32_t ____phase_imag[4] = { lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) }; - -//_phase_real = vld1q_f32(____phase_real); -//_phase_imag = vld1q_f32(____phase_imag); -//} -//} - -//for (n_vec = 0; n_vec < num_a_vectors; n_vec++) -//{ -//vst2_s16((int16_t*)dotProductVector, accumulator[n_vec]); // Store the results back into the dot product vector -//dotProduct = lv_cmake(0,0); -//for (i = 0; i < 4; ++i) -//{ -//dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[i])), -//sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i]))); -//} -//_out[n_vec] = dotProduct; -//} -//volk_gnsssdr_free(accumulator); -//vst1q_f32((float32_t*)__phase_real, _phase_real); -//vst1q_f32((float32_t*)__phase_imag, _phase_imag); - -//(*phase) = lv_cmake((float32_t)__phase_real[0], (float32_t)__phase_imag[0]); -//} - -//for (n = neon_iters * 4; n < num_points; n++) -//{ -//tmp16_ = in_common[n]; //printf("neon phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); -//tmp32_ = lv_cmake((float32_t)lv_creal(tmp16_), (float32_t)lv_cimag(tmp16_)) * (*phase); -//tmp16_ = lv_cmake((int16_t)rintf(lv_creal(tmp32_)), (int16_t)rintf(lv_cimag(tmp32_))); -//(*phase) *= phase_inc; -//for (n_vec = 0; n_vec < num_a_vectors; n_vec++) -//{ -//tmp = tmp16_ * in_a[n_vec][n]; -//_out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); -//} -//} -//} - -//#endif [> LV_HAVE_NEONV7 <] - - -//#ifdef LV_HAVE_NEONV7 -//#include -//#include - -//static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_neon_vma(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const int16_t** in_a, int num_a_vectors, unsigned int num_points) -//{ -//const unsigned int neon_iters = num_points / 4; - -//const int16_t** _in_a = in_a; -//const lv_16sc_t* _in_common = in_common; -//lv_16sc_t* _out = result; -//int n_vec; -//int i; -//unsigned int number; -//unsigned int n; -//lv_16sc_t tmp16_, tmp; -//lv_32fc_t tmp32_; - -//if (neon_iters > 0) -//{ -//lv_16sc_t dotProduct = lv_cmake(0,0); -//float arg_phase0 = cargf(*phase); -//float arg_phase_inc = cargf(phase_inc); -//float phase_est; -////printf("arg phase0: %f", arg_phase0); -//lv_32fc_t ___phase4 = phase_inc * phase_inc * phase_inc * phase_inc; -//__VOLK_ATTR_ALIGNED(16) float32_t __phase4_real[4] = { lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4) }; -//__VOLK_ATTR_ALIGNED(16) float32_t __phase4_imag[4] = { lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4) }; - -//float32x4_t _phase4_real = vld1q_f32(__phase4_real); -//float32x4_t _phase4_imag = vld1q_f32(__phase4_imag); - -//lv_32fc_t phase2 = (lv_32fc_t)(*phase) * phase_inc; -//lv_32fc_t phase3 = phase2 * phase_inc; -//lv_32fc_t phase4 = phase3 * phase_inc; - -//__VOLK_ATTR_ALIGNED(16) float32_t __phase_real[4] = { lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) }; -//__VOLK_ATTR_ALIGNED(16) float32_t __phase_imag[4] = { lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) }; - -//float32x4_t _phase_real = vld1q_f32(__phase_real); -//float32x4_t _phase_imag = 
vld1q_f32(__phase_imag); - -//int16x4x2_t a_val, b_val; -//__VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; -//float32x4_t half = vdupq_n_f32(0.5f); -//int16x4x2_t tmp16; -//int32x4x2_t tmp32i; - -//float32x4x2_t tmp32f, tmp32_real, tmp32_imag; -//float32x4_t sign, PlusHalf, Round; - -//int16x4x2_t* accumulator = (int16x4x2_t*)volk_gnsssdr_malloc(num_a_vectors * sizeof(int16x4x2_t), volk_gnsssdr_get_alignment()); - -//for(n_vec = 0; n_vec < num_a_vectors; n_vec++) -//{ -//accumulator[n_vec].val[0] = vdup_n_s16(0); -//accumulator[n_vec].val[1] = vdup_n_s16(0); -//} - -//for(number = 0; number < neon_iters; number++) -//{ -//[> load 4 complex numbers (int 16 bits each component) <] -//tmp16 = vld2_s16((int16_t*)_in_common); -//__VOLK_GNSSSDR_PREFETCH(_in_common + 8); -//_in_common += 4; - -//[> promote them to int 32 bits <] -//tmp32i.val[0] = vmovl_s16(tmp16.val[0]); -//tmp32i.val[1] = vmovl_s16(tmp16.val[1]); - -//[> promote them to float 32 bits <] -//tmp32f.val[0] = vcvtq_f32_s32(tmp32i.val[0]); -//tmp32f.val[1] = vcvtq_f32_s32(tmp32i.val[1]); - -//[> complex multiplication of four complex samples (float 32 bits each component) <] -//tmp32_real.val[0] = vmulq_f32(tmp32f.val[0], _phase_real); -//tmp32_real.val[1] = vmulq_f32(tmp32f.val[1], _phase_imag); -//tmp32_imag.val[0] = vmulq_f32(tmp32f.val[0], _phase_imag); -//tmp32_imag.val[1] = vmulq_f32(tmp32f.val[1], _phase_real); - -//tmp32f.val[0] = vsubq_f32(tmp32_real.val[0], tmp32_real.val[1]); -//tmp32f.val[1] = vaddq_f32(tmp32_imag.val[0], tmp32_imag.val[1]); - -//[> downcast results to int32 <] -//[> in __aarch64__ we can do that with vcvtaq_s32_f32(ret1); vcvtaq_s32_f32(ret2); <] -//sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(tmp32f.val[0]), 31))); -//PlusHalf = vaddq_f32(tmp32f.val[0], half); -//Round = vsubq_f32(PlusHalf, sign); -//tmp32i.val[0] = vcvtq_s32_f32(Round); - -//sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(tmp32f.val[1]), 31))); -//PlusHalf = vaddq_f32(tmp32f.val[1], half); -//Round = vsubq_f32(PlusHalf, sign); -//tmp32i.val[1] = vcvtq_s32_f32(Round); - -//[> downcast results to int16 <] -//tmp16.val[0] = vqmovn_s32(tmp32i.val[0]); -//tmp16.val[1] = vqmovn_s32(tmp32i.val[1]); - -//[> compute next four phases <] -//tmp32_real.val[0] = vmulq_f32(_phase_real, _phase4_real); -//tmp32_real.val[1] = vmulq_f32(_phase_imag, _phase4_imag); -//tmp32_imag.val[0] = vmulq_f32(_phase_real, _phase4_imag); -//tmp32_imag.val[1] = vmulq_f32(_phase_imag, _phase4_real); - -//_phase_real = vsubq_f32(tmp32_real.val[0], tmp32_real.val[1]); -//_phase_imag = vaddq_f32(tmp32_imag.val[0], tmp32_imag.val[1]); - -//// Regenerate phase -//if ((number % 256) == 0) -//{ -////printf("computed phase: %f\n", cos(cargf(lv_cmake(_phase_real[0],_phase_imag[0])))); -//phase_est = arg_phase0 + (number + 1) * 4 * arg_phase_inc; -////printf("Estimated phase: %f\n\n", cos(phase_est)); - -//*phase = lv_cmake(cos(phase_est), sin(phase_est)); -//phase2 = (lv_32fc_t)(*phase) * phase_inc; -//phase3 = phase2 * phase_inc; -//phase4 = phase3 * phase_inc; - -//__VOLK_ATTR_ALIGNED(16) float32_t ____phase_real[4] = { lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) }; -//__VOLK_ATTR_ALIGNED(16) float32_t ____phase_imag[4] = { lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) }; - -//_phase_real = vld1q_f32(____phase_real); -//_phase_imag = vld1q_f32(____phase_imag); - -//// Round = vmulq_f32(_phase_real, _phase_real); -//// Round = vmlaq_f32(Round, _phase_imag, _phase_imag); -//// Round = 
vsqrtq_f32(Round);//printf("sqrt: %f \n", Round[0]); -////Round = vrsqrteq_f32(Round);printf("1/sqtr: %f \n",Round[0]); -////Round = vrecpeq_f32((Round); -//// _phase_real = vdivq_f32(_phase_real, Round); -//// _phase_imag = vdivq_f32(_phase_imag, Round); -////_phase_real = vmulq_f32(_phase_real, Round); -////_phase_imag = vmulq_f32(_phase_imag, Round); -////printf("After %i: %f,%f, %f\n\n", number, _phase_real[0], _phase_imag[0], sqrt(_phase_real[0]*_phase_real[0]+_phase_imag[0]*_phase_imag[0])); - -//} - -//for (n_vec = 0; n_vec < num_a_vectors; n_vec++) -//{ -//a_val = vld2_s16((int16_t*)&(_in_a[n_vec][number*4])); - -//b_val.val[0] = vmul_s16(a_val.val[0], tmp16.val[0]); -//b_val.val[1] = vmul_s16(a_val.val[1], tmp16.val[0]); - -//// use multiply accumulate/subtract to get result -//b_val.val[0] = vmls_s16(b_val.val[0], a_val.val[1], tmp16.val[1]); -//b_val.val[1] = vmla_s16(b_val.val[1], a_val.val[0], tmp16.val[1]); - -//accumulator[n_vec].val[0] = vqadd_s16(accumulator[n_vec].val[0], b_val.val[0]); -//accumulator[n_vec].val[1] = vqadd_s16(accumulator[n_vec].val[1], b_val.val[1]); -//} -//} - -//for (n_vec = 0; n_vec < num_a_vectors; n_vec++) -//{ -//vst2_s16((int16_t*)dotProductVector, accumulator[n_vec]); // Store the results back into the dot product vector -//dotProduct = lv_cmake(0,0); -//for (i = 0; i < 4; ++i) -//{ -//dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[i])), -//sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i]))); -//} -//_out[n_vec] = dotProduct; -//} -//volk_gnsssdr_free(accumulator); - -//vst1q_f32((float32_t*)__phase_real, _phase_real); -//vst1q_f32((float32_t*)__phase_imag, _phase_imag); - -//(*phase) = lv_cmake((float32_t)__phase_real[0], (float32_t)__phase_imag[0]); -//} - -//for (n = neon_iters * 4; n < num_points; n++) -//{ -//tmp16_ = in_common[n]; //printf("neon phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); -//tmp32_ = lv_cmake((float32_t)lv_creal(tmp16_), (float32_t)lv_cimag(tmp16_)) * (*phase); -//tmp16_ = lv_cmake((int16_t)rintf(lv_creal(tmp32_)), (int16_t)rintf(lv_cimag(tmp32_))); -//(*phase) *= phase_inc; -//for (n_vec = 0; n_vec < num_a_vectors; n_vec++) -//{ -//tmp = tmp16_ * in_a[n_vec][n]; -//_out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); -//} -//} -//} - -//#endif [> LV_HAVE_NEONV7 <] - - -//#ifdef LV_HAVE_NEONV7 -//#include -//#include - -//static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_neon_optvma(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const int16_t** in_a, int num_a_vectors, unsigned int num_points) -//{ -//const unsigned int neon_iters = num_points / 4; - -//const int16_t** _in_a = in_a; -//const lv_16sc_t* _in_common = in_common; -//lv_16sc_t* _out = result; -//int n_vec; -//int i; -//unsigned int number; -//unsigned int n; -//lv_16sc_t tmp16_, tmp; -//lv_32fc_t tmp32_; - -//if (neon_iters > 0) -//{ -//lv_16sc_t dotProduct = lv_cmake(0,0); -//float arg_phase0 = cargf(*phase); -//float arg_phase_inc = cargf(phase_inc); -//float phase_est; - -//lv_32fc_t ___phase4 = phase_inc * phase_inc * phase_inc * phase_inc; -//__VOLK_ATTR_ALIGNED(16) float32_t __phase4_real[4] = { lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4) }; -//__VOLK_ATTR_ALIGNED(16) float32_t __phase4_imag[4] = { lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4) }; - -//float32x4_t _phase4_real = 
vld1q_f32(__phase4_real); -//float32x4_t _phase4_imag = vld1q_f32(__phase4_imag); - -//lv_32fc_t phase2 = (lv_32fc_t)(*phase) * phase_inc; -//lv_32fc_t phase3 = phase2 * phase_inc; -//lv_32fc_t phase4 = phase3 * phase_inc; - -//__VOLK_ATTR_ALIGNED(16) float32_t __phase_real[4] = { lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) }; -//__VOLK_ATTR_ALIGNED(16) float32_t __phase_imag[4] = { lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) }; - -//float32x4_t _phase_real = vld1q_f32(__phase_real); -//float32x4_t _phase_imag = vld1q_f32(__phase_imag); - -//int16x4x2_t a_val, b_val; -//__VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; -//float32x4_t half = vdupq_n_f32(0.5f); -//int32x4x2_t tmp32i; - -//float32x4x2_t tmp32f, tmp32_real, tmp32_imag; -//float32x4_t sign, PlusHalf, Round; - -//int16x4x2_t* accumulator1 = (int16x4x2_t*)volk_gnsssdr_malloc(num_a_vectors * sizeof(int16x4x2_t), volk_gnsssdr_get_alignment()); -//int16x4x2_t* accumulator2 = (int16x4x2_t*)volk_gnsssdr_malloc(num_a_vectors * sizeof(int16x4x2_t), volk_gnsssdr_get_alignment()); - -//for(n_vec = 0; n_vec < num_a_vectors; n_vec++) -//{ -//accumulator1[n_vec].val[0] = vdup_n_s16(0); -//accumulator1[n_vec].val[1] = vdup_n_s16(0); -//accumulator2[n_vec].val[0] = vdup_n_s16(0); -//accumulator2[n_vec].val[1] = vdup_n_s16(0); -//} - -//for(number = 0; number < neon_iters; number++) -//{ -//[> load 4 complex numbers (int 16 bits each component) <] -//b_val = vld2_s16((int16_t*)_in_common); -//__VOLK_GNSSSDR_PREFETCH(_in_common + 8); -//_in_common += 4; - -//[> promote them to int 32 bits <] -//tmp32i.val[0] = vmovl_s16(b_val.val[0]); -//tmp32i.val[1] = vmovl_s16(b_val.val[1]); - -//[> promote them to float 32 bits <] -//tmp32f.val[0] = vcvtq_f32_s32(tmp32i.val[0]); -//tmp32f.val[1] = vcvtq_f32_s32(tmp32i.val[1]); - -//[> complex multiplication of four complex samples (float 32 bits each component) <] -//tmp32_real.val[0] = vmulq_f32(tmp32f.val[0], _phase_real); -//tmp32_real.val[1] = vmulq_f32(tmp32f.val[1], _phase_imag); -//tmp32_imag.val[0] = vmulq_f32(tmp32f.val[0], _phase_imag); -//tmp32_imag.val[1] = vmulq_f32(tmp32f.val[1], _phase_real); - -//tmp32f.val[0] = vsubq_f32(tmp32_real.val[0], tmp32_real.val[1]); -//tmp32f.val[1] = vaddq_f32(tmp32_imag.val[0], tmp32_imag.val[1]); - -//[> downcast results to int32 <] -//[> in __aarch64__ we can do that with vcvtaq_s32_f32(ret1); vcvtaq_s32_f32(ret2); <] -//sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(tmp32f.val[0]), 31))); -//PlusHalf = vaddq_f32(tmp32f.val[0], half); -//Round = vsubq_f32(PlusHalf, sign); -//tmp32i.val[0] = vcvtq_s32_f32(Round); - -//sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(tmp32f.val[1]), 31))); -//PlusHalf = vaddq_f32(tmp32f.val[1], half); -//Round = vsubq_f32(PlusHalf, sign); -//tmp32i.val[1] = vcvtq_s32_f32(Round); - -//[> downcast results to int16 <] -//b_val.val[0] = vqmovn_s32(tmp32i.val[0]); -//b_val.val[1] = vqmovn_s32(tmp32i.val[1]); - -//[> compute next four phases <] -//tmp32_real.val[0] = vmulq_f32(_phase_real, _phase4_real); -//tmp32_real.val[1] = vmulq_f32(_phase_imag, _phase4_imag); -//tmp32_imag.val[0] = vmulq_f32(_phase_real, _phase4_imag); -//tmp32_imag.val[1] = vmulq_f32(_phase_imag, _phase4_real); - -//_phase_real = vsubq_f32(tmp32_real.val[0], tmp32_real.val[1]); -//_phase_imag = vaddq_f32(tmp32_imag.val[0], tmp32_imag.val[1]); - -//// Regenerate phase -//if ((number % 256) == 0) -//{ -////printf("computed phase: %f\n", 
cos(cargf(lv_cmake(_phase_real[0],_phase_imag[0])))); -//phase_est = arg_phase0 + (number + 1) * 4 * arg_phase_inc; -////printf("Estimated phase: %f\n\n", cos(phase_est)); - -//*phase = lv_cmake(cos(phase_est), sin(phase_est)); -//phase2 = (lv_32fc_t)(*phase) * phase_inc; -//phase3 = phase2 * phase_inc; -//phase4 = phase3 * phase_inc; - -//__VOLK_ATTR_ALIGNED(16) float32_t ____phase_real[4] = { lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) }; -//__VOLK_ATTR_ALIGNED(16) float32_t ____phase_imag[4] = { lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) }; - -//_phase_real = vld1q_f32(____phase_real); -//_phase_imag = vld1q_f32(____phase_imag); -//} - -//for (n_vec = 0; n_vec < num_a_vectors; n_vec++) -//{ -//a_val = vld2_s16((int16_t*)&(_in_a[n_vec][number*4])); - -//// use 2 accumulators to remove inter-instruction data dependencies -//accumulator1[n_vec].val[0] = vmla_s16(accumulator1[n_vec].val[0], a_val.val[0], b_val.val[0]); -//accumulator1[n_vec].val[1] = vmla_s16(accumulator1[n_vec].val[1], a_val.val[0], b_val.val[1]); -//accumulator2[n_vec].val[0] = vmls_s16(accumulator2[n_vec].val[0], a_val.val[1], b_val.val[1]); -//accumulator2[n_vec].val[1] = vmla_s16(accumulator2[n_vec].val[1], a_val.val[1], b_val.val[0]); -//} -//} -//for (n_vec = 0; n_vec < num_a_vectors; n_vec++) -//{ -//accumulator1[n_vec].val[0] = vqadd_s16(accumulator1[n_vec].val[0], accumulator2[n_vec].val[0]); -//accumulator1[n_vec].val[1] = vqadd_s16(accumulator1[n_vec].val[1], accumulator2[n_vec].val[1]); -//} -//for (n_vec = 0; n_vec < num_a_vectors; n_vec++) -//{ -//vst2_s16((int16_t*)dotProductVector, accumulator1[n_vec]); // Store the results back into the dot product vector -//dotProduct = lv_cmake(0,0); -//for (i = 0; i < 4; ++i) -//{ -//dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[i])), -//sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i]))); -//} -//_out[n_vec] = dotProduct; -//} -//volk_gnsssdr_free(accumulator1); -//volk_gnsssdr_free(accumulator2); - -//vst1q_f32((float32_t*)__phase_real, _phase_real); -//vst1q_f32((float32_t*)__phase_imag, _phase_imag); - -//(*phase) = lv_cmake((float32_t)__phase_real[0], (float32_t)__phase_imag[0]); -//} - -//for (n = neon_iters * 4; n < num_points; n++) -//{ -//tmp16_ = in_common[n]; //printf("neon phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); -//tmp32_ = lv_cmake((float32_t)lv_creal(tmp16_), (float32_t)lv_cimag(tmp16_)) * (*phase); -//tmp16_ = lv_cmake((int16_t)rintf(lv_creal(tmp32_)), (int16_t)rintf(lv_cimag(tmp32_))); -//(*phase) *= phase_inc; -//for (n_vec = 0; n_vec < num_a_vectors; n_vec++) -//{ -//tmp = tmp16_ * in_a[n_vec][n]; -//_out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); -//} -//} -//} - -//#endif [> LV_HAVE_NEONV7 <] - -#endif /*INCLUDED_volk_gnsssdr_16ic_16i_dot_prod_16ic_xn_H*/ +#endif /* INCLUDED_volk_gnsssdr_16ic_16i_dot_prod_16ic_xn_H */ diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic.h index 70ad1f6b6..436988087 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic.h +++ 
b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic.h @@ -131,36 +131,6 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_a_sse3(lv_ #endif // SSE3 -//#ifdef LV_HAVE_SSE3 -//static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_a_sse3_reload(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points) -//{ -//// phases must be normalized. Phase rotator expects a complex exponential input! -//float rem_carrier_phase_in_rad = 0.345; -//float phase_step_rad = 0.1; -//lv_32fc_t phase[1]; -//phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad)); -//lv_32fc_t phase_inc[1]; -//phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad)); -//int n; -//int num_a_vectors = 3; -//int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); -//for(n = 0; n < num_a_vectors; n++) -//{ -//in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); -//memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points); -//} - -//volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_sse3_reload(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, num_a_vectors, num_points); - -//for(n = 0; n < num_a_vectors; n++) -//{ -//volk_gnsssdr_free(in_a[n]); -//} -//volk_gnsssdr_free(in_a); -//} - -//#endif // SSE3 - #ifdef LV_HAVE_SSE3 static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_u_sse3(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points) @@ -224,36 +194,6 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_a_avx2(lv_ #endif // AVX2 -//#ifdef LV_HAVE_AVX2 -//static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_a_avx2_reload(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points) -//{ -//// phases must be normalized. Phase rotator expects a complex exponential input! 
-//float rem_carrier_phase_in_rad = 0.345; -//float phase_step_rad = 0.1; -//lv_32fc_t phase[1]; -//phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad)); -//lv_32fc_t phase_inc[1]; -//phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad)); -//int n; -//int num_a_vectors = 3; -//int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); -//for(n = 0; n < num_a_vectors; n++) -//{ -//in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); -//memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points); -//} - -//volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_avx2_reload(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, num_a_vectors, num_points); - -//for(n = 0; n < num_a_vectors; n++) -//{ -//volk_gnsssdr_free(in_a[n]); -//} -//volk_gnsssdr_free(in_a); -//} - -//#endif // AVX2 - #ifdef LV_HAVE_AVX2 static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_u_avx2(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points) @@ -286,96 +226,4 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_u_avx2(lv_ #endif // AVX2 -//#ifdef LV_HAVE_AVX2 -//static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_u_avx2_reload(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points) -//{ -//// phases must be normalized. Phase rotator expects a complex exponential input! -//float rem_carrier_phase_in_rad = 0.345; -//float phase_step_rad = 0.1; -//lv_32fc_t phase[1]; -//phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad)); -//lv_32fc_t phase_inc[1]; -//phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad)); -//int n; -//int num_a_vectors = 3; -//int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); -//for(n = 0; n < num_a_vectors; n++) -//{ -//in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); -//memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points); -//} - -//volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_avx2_reload(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, num_a_vectors, num_points); - -//for(n = 0; n < num_a_vectors; n++) -//{ -//volk_gnsssdr_free(in_a[n]); -//} -//volk_gnsssdr_free(in_a); -//} - -//#endif // AVX2 - - -//#ifdef LV_HAVE_NEONV7 -//static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_neon(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points) -//{ -//// phases must be normalized. Phase rotator expects a complex exponential input! 
-//float rem_carrier_phase_in_rad = 0.345; -//float phase_step_rad = 0.1; -//lv_32fc_t phase[1]; -//phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad)); -//lv_32fc_t phase_inc[1]; -//phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad)); -//int n; -//int num_a_vectors = 3; -//int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); -//for(n = 0; n < num_a_vectors; n++) -//{ -//in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); -//memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points); -//} - -//volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_neon(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, num_a_vectors, num_points); - -//for(n = 0; n < num_a_vectors; n++) -//{ -//volk_gnsssdr_free(in_a[n]); -//} -//volk_gnsssdr_free(in_a); -//} - -//#endif // NEON - - -//#ifdef LV_HAVE_NEONV7 -//static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_neon_vma(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points) -//{ -//// phases must be normalized. Phase rotator expects a complex exponential input! -//float rem_carrier_phase_in_rad = 0.345; -//float phase_step_rad = 0.1; -//lv_32fc_t phase[1]; -//phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad)); -//lv_32fc_t phase_inc[1]; -//phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad)); -//int n; -//int num_a_vectors = 3; -//int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); -//for(n = 0; n < num_a_vectors; n++) -//{ -//in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); -//memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points); -//} - -//volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_neon_vma(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, num_a_vectors, num_points); - -//for(n = 0; n < num_a_vectors; n++) -//{ -//volk_gnsssdr_free(in_a[n]); -//} -//volk_gnsssdr_free(in_a); -//} - -//#endif // NEON - #endif // INCLUDED_volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_H diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_conjugate_16ic.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_conjugate_16ic.h index 7ff27cccf..1d2188bc1 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_conjugate_16ic.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_conjugate_16ic.h @@ -200,34 +200,4 @@ static inline void volk_gnsssdr_16ic_conjugate_16ic_u_avx2(lv_16sc_t* cVector, c } #endif /* LV_HAVE_AVX2 */ -// -// -//#ifdef LV_HAVE_NEONV7 -//#include -// -//static inline void volk_gnsssdr_16ic_conjugate_16ic_neon(lv_16sc_t* cVector, const lv_16sc_t* aVector, unsigned int num_points) -//{ -// const unsigned int sse_iters = num_points / 4; -// unsigned int i; -// lv_16sc_t* c = cVector; -// const lv_16sc_t* a = aVector; -// int16x4x2_t a_val; -// -// for (i = 0; i < sse_iters; ++i) -// { -// a_val = vld2_s16((const int16_t*)a); -// __VOLK_GNSSSDR_PREFETCH(a + 4); -// a_val.val[1] = vneg_s16(a_val.val[1]); -// vst2_s16((int16_t*)c, a_val); -// a += 4; -// c += 4; -// } -// -// for (i = sse_iters * 4; i < num_points; ++i) -// { -// *c++ = lv_conj(*a++); -// } -//} 
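The removed puppet wrappers above all repeat the setup step flagged by their first comment: the rotator kernels take the carrier phase and its per-sample increment as unit-modulus complex exponentials, not as angles in radians. A minimal sketch of that conversion and of one rotation step, using plain C99 complex arithmetic in place of the lv_32fc_t/lv_cmake helpers from the volk_gnsssdr headers (the test values match the removed wrappers; the sketch is illustrative and not part of the patch):

    #include <complex.h>
    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        float rem_carrier_phase_in_rad = 0.345f; /* starting carrier phase, radians */
        float phase_step_rad = 0.1f;             /* per-sample phase increment, radians */

        /* "normalize" the phases: map each angle to a point on the unit circle */
        float complex phase = cosf(rem_carrier_phase_in_rad) + I * sinf(rem_carrier_phase_in_rad);
        float complex phase_inc = cosf(phase_step_rad) + I * sinf(phase_step_rad);

        /* one rotator step: multiply the sample by the current phase,
           then advance the oscillator by multiplying the phases */
        float complex sample = 1.0f + 0.0f * I;
        sample *= phase;
        phase *= phase_inc;

        printf("rotated sample: %f%+fi\n", crealf(sample), cimagf(sample));
        return 0;
    }

Because phase and phase_inc both have modulus one, repeated multiplication keeps the oscillator on the unit circle up to floating-point rounding, which is why the kernels can advance the phase recursively instead of calling cosf/sinf per sample, and why they periodically renormalize the phase as seen elsewhere in this patch.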
-//#endif /* LV_HAVE_NEONV7 */ - #endif /* INCLUDED_volk_gnsssdr_16ic_conjugate_16ic_H */ diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_resampler_fast_16ic.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_resampler_fast_16ic.h index 2841d2f18..40c5bf356 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_resampler_fast_16ic.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_resampler_fast_16ic.h @@ -72,7 +72,7 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_generic(lv_16sc_t* resu { int local_code_chip_index; unsigned int n; - //fesetround(FE_TONEAREST); + // fesetround(FE_TONEAREST); for (n = 0; n < num_output_samples; n++) { // resample code for current tap @@ -83,7 +83,7 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_generic(lv_16sc_t* resu } } -#endif /*LV_HAVE_GENERIC*/ +#endif /* LV_HAVE_GENERIC */ #ifdef LV_HAVE_SSE2 @@ -91,7 +91,7 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_generic(lv_16sc_t* resu static inline void volk_gnsssdr_16ic_resampler_fast_16ic_a_sse2(lv_16sc_t* result, const lv_16sc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, int code_length_chips, unsigned int num_output_samples) //, int* scratch_buffer, float* scratch_buffer_float) { - _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); //_MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO + _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); // _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO unsigned int number; const unsigned int quarterPoints = num_output_samples / 4; @@ -104,8 +104,8 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_a_sse2(lv_16sc_t* resul __m128 _code_phase_out, _code_phase_out_with_offset; rem_code_phase_chips = rem_code_phase_chips - 0.5f; - _rem_code_phase = _mm_load1_ps(&rem_code_phase_chips); //load float to all four float values in m128 register - _code_phase_step_chips = _mm_load1_ps(&code_phase_step_chips); //load float to all four float values in m128 register + _rem_code_phase = _mm_load1_ps(&rem_code_phase_chips); // load float to all four float values in m128 register + _code_phase_step_chips = _mm_load1_ps(&code_phase_step_chips); // load float to all four float values in m128 register __VOLK_ATTR_ALIGNED(16) int four_times_code_length_chips_minus1[4]; four_times_code_length_chips_minus1[0] = code_length_chips - 1; @@ -120,8 +120,8 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_a_sse2(lv_16sc_t* resul four_times_code_length_chips[2] = code_length_chips; four_times_code_length_chips[3] = code_length_chips; - _code_length_chips = _mm_load_si128((__m128i*)&four_times_code_length_chips); //load float to all four float values in m128 register - _code_length_chips_minus1 = _mm_load_si128((__m128i*)&four_times_code_length_chips_minus1); //load float to all four float values in m128 register + _code_length_chips = _mm_load_si128((__m128i*)&four_times_code_length_chips); // load float to all four float values in m128 register + _code_length_chips_minus1 = _mm_load_si128((__m128i*)&four_times_code_length_chips_minus1); // load float to all four float values in m128 register __m128i negative_indexes, overflow_indexes, _code_phase_out_int, _code_phase_out_int_neg, _code_phase_out_int_over; @@ -136,21 +136,21 @@ static inline void 
volk_gnsssdr_16ic_resampler_fast_16ic_a_sse2(lv_16sc_t* resul for (number = 0; number < quarterPoints; number++) { - _code_phase_out = _mm_mul_ps(_code_phase_step_chips, _4output_index); //compute the code phase point with the phase step - _code_phase_out_with_offset = _mm_add_ps(_code_phase_out, _rem_code_phase); //add the phase offset - _code_phase_out_int = _mm_cvtps_epi32(_code_phase_out_with_offset); //convert to integer + _code_phase_out = _mm_mul_ps(_code_phase_step_chips, _4output_index); // compute the code phase point with the phase step + _code_phase_out_with_offset = _mm_add_ps(_code_phase_out, _rem_code_phase); // add the phase offset + _code_phase_out_int = _mm_cvtps_epi32(_code_phase_out_with_offset); // convert to integer - negative_indexes = _mm_cmplt_epi32(_code_phase_out_int, zero); //test for negative values - _code_phase_out_int_neg = _mm_add_epi32(_code_phase_out_int, _code_length_chips); //the negative values branch + negative_indexes = _mm_cmplt_epi32(_code_phase_out_int, zero); // test for negative values + _code_phase_out_int_neg = _mm_add_epi32(_code_phase_out_int, _code_length_chips); // the negative values branch _code_phase_out_int_neg = _mm_xor_si128(_code_phase_out_int, _mm_and_si128(negative_indexes, _mm_xor_si128(_code_phase_out_int_neg, _code_phase_out_int))); - overflow_indexes = _mm_cmpgt_epi32(_code_phase_out_int_neg, _code_length_chips_minus1); //test for overflow values - _code_phase_out_int_over = _mm_sub_epi32(_code_phase_out_int_neg, _code_length_chips); //the negative values branch + overflow_indexes = _mm_cmpgt_epi32(_code_phase_out_int_neg, _code_length_chips_minus1); // test for overflow values + _code_phase_out_int_over = _mm_sub_epi32(_code_phase_out_int_neg, _code_length_chips); // the negative values branch _code_phase_out_int_over = _mm_xor_si128(_code_phase_out_int_neg, _mm_and_si128(overflow_indexes, _mm_xor_si128(_code_phase_out_int_over, _code_phase_out_int_neg))); _mm_store_si128((__m128i*)local_code_chip_index, _code_phase_out_int_over); // Store the results back - //todo: optimize the local code lookup table with intrinsics, if possible + // todo: optimize the local code lookup table with intrinsics, if possible *_result++ = local_code[local_code_chip_index[0]]; *_result++ = local_code[local_code_chip_index[1]]; *_result++ = local_code[local_code_chip_index[2]]; @@ -176,7 +176,7 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_a_sse2(lv_16sc_t* resul static inline void volk_gnsssdr_16ic_resampler_fast_16ic_u_sse2(lv_16sc_t* result, const lv_16sc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, int code_length_chips, unsigned int num_output_samples) //, int* scratch_buffer, float* scratch_buffer_float) { - _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); //_MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO + _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); // _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO unsigned int number; const unsigned int quarterPoints = num_output_samples / 4; @@ -189,8 +189,8 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_u_sse2(lv_16sc_t* resul __m128 _code_phase_out, _code_phase_out_with_offset; rem_code_phase_chips = rem_code_phase_chips - 0.5f; - _rem_code_phase = _mm_load1_ps(&rem_code_phase_chips); //load float to all four float values in m128 register - _code_phase_step_chips = _mm_load1_ps(&code_phase_step_chips); //load float to all four float values in m128 register + _rem_code_phase = _mm_load1_ps(&rem_code_phase_chips); 
// load float to all four float values in m128 register + _code_phase_step_chips = _mm_load1_ps(&code_phase_step_chips); // load float to all four float values in m128 register __VOLK_ATTR_ALIGNED(16) int four_times_code_length_chips_minus1[4]; four_times_code_length_chips_minus1[0] = code_length_chips - 1; @@ -205,8 +205,8 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_u_sse2(lv_16sc_t* resul four_times_code_length_chips[2] = code_length_chips; four_times_code_length_chips[3] = code_length_chips; - _code_length_chips = _mm_loadu_si128((__m128i*)&four_times_code_length_chips); //load float to all four float values in m128 register - _code_length_chips_minus1 = _mm_loadu_si128((__m128i*)&four_times_code_length_chips_minus1); //load float to all four float values in m128 register + _code_length_chips = _mm_loadu_si128((__m128i*)&four_times_code_length_chips); // load float to all four float values in m128 register + _code_length_chips_minus1 = _mm_loadu_si128((__m128i*)&four_times_code_length_chips_minus1); // load float to all four float values in m128 register __m128i negative_indexes, overflow_indexes, _code_phase_out_int, _code_phase_out_int_neg, _code_phase_out_int_over; @@ -221,21 +221,21 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_u_sse2(lv_16sc_t* resul for (number = 0; number < quarterPoints; number++) { - _code_phase_out = _mm_mul_ps(_code_phase_step_chips, _4output_index); //compute the code phase point with the phase step - _code_phase_out_with_offset = _mm_add_ps(_code_phase_out, _rem_code_phase); //add the phase offset - _code_phase_out_int = _mm_cvtps_epi32(_code_phase_out_with_offset); //convert to integer + _code_phase_out = _mm_mul_ps(_code_phase_step_chips, _4output_index); // compute the code phase point with the phase step + _code_phase_out_with_offset = _mm_add_ps(_code_phase_out, _rem_code_phase); // add the phase offset + _code_phase_out_int = _mm_cvtps_epi32(_code_phase_out_with_offset); // convert to integer - negative_indexes = _mm_cmplt_epi32(_code_phase_out_int, zero); //test for negative values - _code_phase_out_int_neg = _mm_add_epi32(_code_phase_out_int, _code_length_chips); //the negative values branch + negative_indexes = _mm_cmplt_epi32(_code_phase_out_int, zero); // test for negative values + _code_phase_out_int_neg = _mm_add_epi32(_code_phase_out_int, _code_length_chips); // the negative values branch _code_phase_out_int_neg = _mm_xor_si128(_code_phase_out_int, _mm_and_si128(negative_indexes, _mm_xor_si128(_code_phase_out_int_neg, _code_phase_out_int))); - overflow_indexes = _mm_cmpgt_epi32(_code_phase_out_int_neg, _code_length_chips_minus1); //test for overflow values - _code_phase_out_int_over = _mm_sub_epi32(_code_phase_out_int_neg, _code_length_chips); //the negative values branch + overflow_indexes = _mm_cmpgt_epi32(_code_phase_out_int_neg, _code_length_chips_minus1); // test for overflow values + _code_phase_out_int_over = _mm_sub_epi32(_code_phase_out_int_neg, _code_length_chips); // the negative values branch _code_phase_out_int_over = _mm_xor_si128(_code_phase_out_int_neg, _mm_and_si128(overflow_indexes, _mm_xor_si128(_code_phase_out_int_over, _code_phase_out_int_neg))); _mm_storeu_si128((__m128i*)local_code_chip_index, _code_phase_out_int_over); // Store the results back - //todo: optimize the local code lookup table with intrinsics, if possible + // todo: optimize the local code lookup table with intrinsics, if possible *_result++ = local_code[local_code_chip_index[0]]; *_result++ = 
local_code[local_code_chip_index[1]]; *_result++ = local_code[local_code_chip_index[2]]; @@ -275,8 +275,8 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_neon(lv_16sc_t* result, rem_code_phase_chips = rem_code_phase_chips - 0.5f; float32x4_t sign, PlusHalf, Round; - _rem_code_phase = vld1q_dup_f32(&rem_code_phase_chips); //load float to all four float values in m128 register - _code_phase_step_chips = vld1q_dup_f32(&code_phase_step_chips); //load float to all four float values in m128 register + _rem_code_phase = vld1q_dup_f32(&rem_code_phase_chips); // load float to all four float values in m128 register + _code_phase_step_chips = vld1q_dup_f32(&code_phase_step_chips); // load float to all four float values in m128 register __VOLK_ATTR_ALIGNED(16) int four_times_code_length_chips_minus1[4]; four_times_code_length_chips_minus1[0] = code_length_chips - 1; @@ -291,8 +291,8 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_neon(lv_16sc_t* result, four_times_code_length_chips[2] = code_length_chips; four_times_code_length_chips[3] = code_length_chips; - _code_length_chips = vld1q_s32((int32_t*)&four_times_code_length_chips); //load float to all four float values in m128 register - _code_length_chips_minus1 = vld1q_s32((int32_t*)&four_times_code_length_chips_minus1); //load float to all four float values in m128 register + _code_length_chips = vld1q_s32((int32_t*)&four_times_code_length_chips); // load float to all four float values in m128 register + _code_length_chips_minus1 = vld1q_s32((int32_t*)&four_times_code_length_chips_minus1); // load float to all four float values in m128 register int32x4_t _code_phase_out_int, _code_phase_out_int_neg, _code_phase_out_int_over; uint32x4_t negative_indexes, overflow_indexes; @@ -307,24 +307,24 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_neon(lv_16sc_t* result, for (number = 0; number < quarterPoints; number++) { - _code_phase_out = vmulq_f32(_code_phase_step_chips, _4output_index); //compute the code phase point with the phase step - _code_phase_out_with_offset = vaddq_f32(_code_phase_out, _rem_code_phase); //add the phase offset + _code_phase_out = vmulq_f32(_code_phase_step_chips, _4output_index); // compute the code phase point with the phase step + _code_phase_out_with_offset = vaddq_f32(_code_phase_out, _rem_code_phase); // add the phase offset sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(_code_phase_out_with_offset), 31))); PlusHalf = vaddq_f32(_code_phase_out_with_offset, half); Round = vsubq_f32(PlusHalf, sign); _code_phase_out_int = vcvtq_s32_f32(Round); - negative_indexes = vcltq_s32(_code_phase_out_int, zero); //test for negative values - _code_phase_out_int_neg = vaddq_s32(_code_phase_out_int, _code_length_chips); //the negative values branch + negative_indexes = vcltq_s32(_code_phase_out_int, zero); // test for negative values + _code_phase_out_int_neg = vaddq_s32(_code_phase_out_int, _code_length_chips); // the negative values branch _code_phase_out_int_neg = veorq_s32(_code_phase_out_int, vandq_s32((int32x4_t)negative_indexes, veorq_s32(_code_phase_out_int_neg, _code_phase_out_int))); - overflow_indexes = vcgtq_s32(_code_phase_out_int_neg, _code_length_chips_minus1); //test for overflow values - _code_phase_out_int_over = vsubq_s32(_code_phase_out_int_neg, _code_length_chips); //the negative values branch + overflow_indexes = vcgtq_s32(_code_phase_out_int_neg, _code_length_chips_minus1); // test for overflow values + _code_phase_out_int_over = vsubq_s32(_code_phase_out_int_neg, 
_code_length_chips); // the negative values branch _code_phase_out_int_over = veorq_s32(_code_phase_out_int_neg, vandq_s32((int32x4_t)overflow_indexes, veorq_s32(_code_phase_out_int_over, _code_phase_out_int_neg))); vst1q_s32((int32_t*)local_code_chip_index, _code_phase_out_int_over); // Store the results back - //todo: optimize the local code lookup table with intrinsics, if possible + // todo: optimize the local code lookup table with intrinsics, if possible *_result++ = local_code[local_code_chip_index[0]]; *_result++ = local_code[local_code_chip_index[1]]; *_result++ = local_code[local_code_chip_index[2]]; @@ -344,4 +344,4 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_neon(lv_16sc_t* result, #endif /* LV_HAVE_NEONV7 */ -#endif /*INCLUDED_volk_gnsssdr_16ic_resampler_fast_16ic_H*/ +#endif /* INCLUDED_volk_gnsssdr_16ic_resampler_fast_16ic_H */ diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_s32fc_x2_rotator_16ic.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_s32fc_x2_rotator_16ic.h index e18746558..8f1e267ae 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_s32fc_x2_rotator_16ic.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_s32fc_x2_rotator_16ic.h @@ -79,13 +79,13 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_generic(lv_16sc_t* ou // Regenerate phase if (i % 512 == 0) { - //printf("Phase before regeneration %i: %f,%f Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase)); + // printf("Phase before regeneration %i: %f,%f Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase)); #ifdef __cplusplus (*phase) /= std::abs((*phase)); #else (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); #endif - //printf("Phase after regeneration %i: %f,%f Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase)); + // printf("Phase after regeneration %i: %f,%f Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase)); } } } @@ -112,13 +112,13 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_generic_reload(lv_16s (*phase) *= phase_inc; } // Regenerate phase - //printf("Phase before regeneration %i: %f,%f Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase)); + // printf("Phase before regeneration %i: %f,%f Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase)); #ifdef __cplusplus (*phase) /= std::abs((*phase)); #else (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); #endif - //printf("Phase after regeneration %i: %f,%f Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase)); + // printf("Phase after regeneration %i: %f,%f Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase)); } for (j = 0; j < num_points % ROTATOR_RELOAD; j++) { @@ -161,8 +161,8 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3(lv_16sc_t* out for (number = 0; number < sse_iters; number++) { - a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg - //complex 32fc multiplication b=a*two_phase_acc_reg + a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg + // complex 32fc multiplication b=a*two_phase_acc_reg yl 
= _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr @@ -171,7 +171,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3(lv_16sc_t* out b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic - //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg + // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr @@ -179,11 +179,11 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3(lv_16sc_t* out tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - //next two samples + // next two samples _in += 2; - a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg __VOLK_GNSSSDR_PREFETCH(_in + 8); - //complex 32fc multiplication b=a*two_phase_acc_reg + // complex 32fc multiplication b=a*two_phase_acc_reg yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr @@ -192,7 +192,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3(lv_16sc_t* out b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic - //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg + // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr @@ -214,7 +214,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3(lv_16sc_t* out two_phase_acc_reg = _mm_div_ps(two_phase_acc_reg, tmp2); } - //next two samples + // next two samples _in += 2; _out += 4; } @@ -268,8 +268,8 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3_reload(lv_16sc { for (j = 0; j < ROTATOR_RELOAD; j++) { - a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg - //complex 32fc multiplication b=a*two_phase_acc_reg + a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg + // complex 32fc multiplication b=a*two_phase_acc_reg yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr @@ -278,7 +278,7 @@ static 
inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3_reload(lv_16sc b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic - //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg + // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr @@ -286,11 +286,11 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3_reload(lv_16sc tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - //next two samples + // next two samples _in += 2; - a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg __VOLK_GNSSSDR_PREFETCH(_in + 8); - //complex 32fc multiplication b=a*two_phase_acc_reg + // complex 32fc multiplication b=a*two_phase_acc_reg yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr @@ -299,7 +299,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3_reload(lv_16sc b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic - //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg + // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr @@ -311,7 +311,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3_reload(lv_16sc result = _mm_packs_epi32(c1, c2); // convert from 32ic to 16ic _mm_store_si128((__m128i*)_out, result); - //next two samples + // next two samples _in += 2; _out += 4; } @@ -325,8 +325,8 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3_reload(lv_16sc for (j = 0; j < sse_iters % ROTATOR_RELOAD; j++) { - a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg - //complex 32fc multiplication b=a*two_phase_acc_reg + a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg + // complex 32fc multiplication b=a*two_phase_acc_reg yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr @@ -335,7 +335,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3_reload(lv_16sc b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di c1 = 
_mm_cvtps_epi32(b); // convert from 32fc to 32ic - //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg + // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr @@ -343,11 +343,11 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3_reload(lv_16sc tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - //next two samples + // next two samples _in += 2; - a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg __VOLK_GNSSSDR_PREFETCH(_in + 8); - //complex 32fc multiplication b=a*two_phase_acc_reg + // complex 32fc multiplication b=a*two_phase_acc_reg yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr @@ -356,7 +356,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3_reload(lv_16sc b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic - //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg + // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr @@ -368,7 +368,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3_reload(lv_16sc result = _mm_packs_epi32(c1, c2); // convert from 32ic to 16ic _mm_store_si128((__m128i*)_out, result); - //next two samples + // next two samples _in += 2; _out += 4; } @@ -418,8 +418,8 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3(lv_16sc_t* out for (number = 0; number < sse_iters; number++) { - a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg - //complex 32fc multiplication b=a*two_phase_acc_reg + a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg + // complex 32fc multiplication b=a*two_phase_acc_reg yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr @@ -428,7 +428,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3(lv_16sc_t* out b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic - //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg + // complex 32fc multiplication 
two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr @@ -436,11 +436,11 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3(lv_16sc_t* out tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - //next two samples + // next two samples _in += 2; - a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg __VOLK_GNSSSDR_PREFETCH(_in + 8); - //complex 32fc multiplication b=a*two_phase_acc_reg + // complex 32fc multiplication b=a*two_phase_acc_reg yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr @@ -449,7 +449,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3(lv_16sc_t* out b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic - //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg + // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr @@ -471,7 +471,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3(lv_16sc_t* out two_phase_acc_reg = _mm_div_ps(two_phase_acc_reg, tmp2); } - //next two samples + // next two samples _in += 2; _out += 4; } @@ -525,8 +525,8 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3_reload(lv_16sc { for (j = 0; j < ROTATOR_RELOAD; j++) { - a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg - //complex 32fc multiplication b=a*two_phase_acc_reg + a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg + // complex 32fc multiplication b=a*two_phase_acc_reg yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr @@ -535,7 +535,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3_reload(lv_16sc b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic - //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg + // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di tmp1 = 
_mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr @@ -543,11 +543,11 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3_reload(lv_16sc tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - //next two samples + // next two samples _in += 2; - a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg __VOLK_GNSSSDR_PREFETCH(_in + 8); - //complex 32fc multiplication b=a*two_phase_acc_reg + // complex 32fc multiplication b=a*two_phase_acc_reg yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr @@ -556,7 +556,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3_reload(lv_16sc b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic - //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg + // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr @@ -568,7 +568,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3_reload(lv_16sc result = _mm_packs_epi32(c1, c2); // convert from 32ic to 16ic _mm_storeu_si128((__m128i*)_out, result); - //next two samples + // next two samples _in += 2; _out += 4; } @@ -582,8 +582,8 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3_reload(lv_16sc for (j = 0; j < sse_iters % ROTATOR_RELOAD; j++) { - a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg - //complex 32fc multiplication b=a*two_phase_acc_reg + a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg + // complex 32fc multiplication b=a*two_phase_acc_reg yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr @@ -592,7 +592,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3_reload(lv_16sc b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic - //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg + // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr @@ -600,11 +600,11 @@ static inline void 
volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3_reload(lv_16sc tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - //next two samples + // next two samples _in += 2; - a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg __VOLK_GNSSSDR_PREFETCH(_in + 8); - //complex 32fc multiplication b=a*two_phase_acc_reg + // complex 32fc multiplication b=a*two_phase_acc_reg yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr @@ -613,7 +613,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3_reload(lv_16sc b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic - //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg + // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr @@ -625,7 +625,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3_reload(lv_16sc result = _mm_packs_epi32(c1, c2); // convert from 32ic to 16ic _mm_storeu_si128((__m128i*)_out, result); - //next two samples + // next two samples _in += 2; _out += 4; } @@ -746,9 +746,9 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_neon(lv_16sc_t* outVe // Regenerate phase if ((i % 512) == 0) { - //printf("Computed phase: %f\n", cos(cargf(lv_cmake(_phase_real[0],_phase_imag[0])))); + // printf("Computed phase: %f\n", cos(cargf(lv_cmake(_phase_real[0],_phase_imag[0])))); phase_est = arg_phase0 + (i + 1) * 4 * arg_phase_inc; - //printf("Estimated phase: %f\n\n", cos(phase_est)); + // printf("Estimated phase: %f\n\n", cos(phase_est)); *phase = lv_cmake(cos(phase_est), sin(phase_est)); phase2 = (lv_32fc_t)(*phase) * phase_inc; @@ -887,9 +887,9 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_neon_reload(lv_16sc_t _out += 4; } // Regenerate phase - //printf("Computed phase: %f\n", cos(cargf(lv_cmake(_phase_real[0],_phase_imag[0])))); + // printf("Computed phase: %f\n", cos(cargf(lv_cmake(_phase_real[0],_phase_imag[0])))); phase_est = arg_phase0 + (n + 1) * ROTATOR_RELOAD * 4 * arg_phase_inc; - //printf("Estimated phase: %f\n\n", cos(phase_est)); + // printf("Estimated phase: %f\n\n", cos(phase_est)); *phase = lv_cmake(cos(phase_est), sin(phase_est)); phase2 = (lv_32fc_t)(*phase) * phase_inc; phase3 = phase2 * phase_inc; diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_dot_prod_16ic.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_dot_prod_16ic.h index e236e51b7..e6afe3cc2 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_dot_prod_16ic.h +++ 
b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_dot_prod_16ic.h @@ -28,7 +28,7 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with GNSS-SDR. If not, see . + * along with GNSS-SDR. If not, see . * * ------------------------------------------------------------------------- */ @@ -77,7 +77,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_generic(lv_16sc_t* result, } } -#endif /*LV_HAVE_GENERIC*/ +#endif /* LV_HAVE_GENERIC */ #ifdef LV_HAVE_SSE2 @@ -108,7 +108,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_a_sse2(lv_16sc_t* out, con for (number = 0; number < sse_iters; number++) { // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r] - a = _mm_load_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg + a = _mm_load_si128((__m128i*)_in_a); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg __VOLK_GNSSSDR_PREFETCH(_in_a + 8); b = _mm_load_si128((__m128i*)_in_b); __VOLK_GNSSSDR_PREFETCH(_in_b + 8); @@ -123,7 +123,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_a_sse2(lv_16sc_t* out, con imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... - imag = _mm_adds_epi16(imag1, imag2); //with saturation arithmetic! + imag = _mm_adds_epi16(imag1, imag2); // with saturation arithmetic! realcacc = _mm_adds_epi16(realcacc, real); imagcacc = _mm_adds_epi16(imagcacc, imag); @@ -186,10 +186,10 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_u_sse2(lv_16sc_t* out, con for (number = 0; number < sse_iters; number++) { - //std::complex memory structure: real part -> reinterpret_cast(a)[2*i] - //imaginary part -> reinterpret_cast(a)[2*i + 1] + // std::complex memory structure: real part -> reinterpret_cast(a)[2*i] + // imaginary part -> reinterpret_cast(a)[2*i + 1] // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r] - a = _mm_loadu_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg + a = _mm_loadu_si128((__m128i*)_in_a); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg __VOLK_GNSSSDR_PREFETCH(_in_a + 8); b = _mm_loadu_si128((__m128i*)_in_b); __VOLK_GNSSSDR_PREFETCH(_in_b + 8); @@ -204,7 +204,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_u_sse2(lv_16sc_t* out, con imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... - imag = _mm_adds_epi16(imag1, imag2); //with saturation arithmetic! + imag = _mm_adds_epi16(imag1, imag2); // with saturation arithmetic! realcacc = _mm_adds_epi16(realcacc, real); imagcacc = _mm_adds_epi16(imagcacc, imag); @@ -281,7 +281,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_u_axv2(lv_16sc_t* out, con imag1 = _mm256_mullo_epi16(a, b_sl); imag2 = _mm256_mullo_epi16(b, a_sl); - imag = _mm256_adds_epi16(imag1, imag2); //with saturation arithmetic! + imag = _mm256_adds_epi16(imag1, imag2); // with saturation arithmetic! realcacc = _mm256_adds_epi16(realcacc, real); imagcacc = _mm256_adds_epi16(imagcacc, imag); @@ -359,7 +359,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_a_axv2(lv_16sc_t* out, con imag1 = _mm256_mullo_epi16(a, b_sl); imag2 = _mm256_mullo_epi16(b, a_sl); - imag = _mm256_adds_epi16(imag1, imag2); //with saturation arithmetic! + imag = _mm256_adds_epi16(imag1, imag2); // with saturation arithmetic! 
realcacc = _mm256_adds_epi16(realcacc, real); imagcacc = _mm256_adds_epi16(imagcacc, imag); @@ -571,4 +571,4 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_neon_optvma(lv_16sc_t* out #endif /* LV_HAVE_NEONV7 */ -#endif /*INCLUDED_volk_gnsssdr_16ic_x2_dot_prod_16ic_H*/ +#endif /* INCLUDED_volk_gnsssdr_16ic_x2_dot_prod_16ic_H */ diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_dot_prod_16ic_xn.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_dot_prod_16ic_xn.h index bdd62a958..00dfe9143 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_dot_prod_16ic_xn.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_dot_prod_16ic_xn.h @@ -77,15 +77,15 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_generic(lv_16sc_t* resu result[n_vec] = lv_cmake(0, 0); for (n = 0; n < num_points; n++) { - //r*a.r - i*a.i, i*a.r + r*a.i - //result[n_vec]+=in_common[n]*in_a[n_vec][n]; + // r*a.r - i*a.i, i*a.r + r*a.i + // result[n_vec]+=in_common[n]*in_a[n_vec][n]; lv_16sc_t tmp = in_common[n] * in_a[n_vec][n]; result[n_vec] = lv_cmake(sat_adds16i(lv_creal(result[n_vec]), lv_creal(tmp)), sat_adds16i(lv_cimag(result[n_vec]), lv_cimag(tmp))); } } } -#endif /*LV_HAVE_GENERIC*/ +#endif /* LV_HAVE_GENERIC */ #ifdef LV_HAVE_GENERIC @@ -106,7 +106,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_generic_sat(lv_16sc_t* } } -#endif /*LV_HAVE_GENERIC*/ +#endif /* LV_HAVE_GENERIC */ #ifdef LV_HAVE_SSE2 @@ -145,11 +145,11 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_sse2(lv_16sc_t* resul for (index = 0; index < sse_iters; index++) { // b[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r] - b = _mm_load_si128((__m128i*)_in_common); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg + b = _mm_load_si128((__m128i*)_in_common); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg __VOLK_GNSSSDR_PREFETCH(_in_common + 8); for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - a = _mm_load_si128((__m128i*)&(_in_a[n_vec][index * 4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg + a = _mm_load_si128((__m128i*)&(_in_a[n_vec][index * 4])); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... @@ -240,11 +240,11 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_sse2(lv_16sc_t* resul for (index = 0; index < sse_iters; index++) { // b[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r] - b = _mm_loadu_si128((__m128i*)_in_common); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg + b = _mm_loadu_si128((__m128i*)_in_common); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg __VOLK_GNSSSDR_PREFETCH(_in_common + 8); for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - a = _mm_loadu_si128((__m128i*)&(_in_a[n_vec][index * 4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg + a = _mm_loadu_si128((__m128i*)&(_in_a[n_vec][index * 4])); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... 
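
A note on the "with saturation arithmetic!" comments: _mm_adds_epi16 and _mm256_adds_epi16 clamp each signed 16-bit lane to [-32768, 32767] instead of letting it wrap, which keeps a long correlation from flipping sign on overflow. A minimal scalar sketch of that clamp-on-accumulate policy follows; cplx16, sat_add16 and dot_prod_16ic_sat are illustrative names, not the library's own, and the real kernels additionally wrap the 16-bit products themselves (_mm_mullo_epi16), which this sketch omits for clarity.

#include <stdint.h>

typedef struct
{
    int16_t r;
    int16_t i;
} cplx16;

static int16_t sat_add16(int32_t a, int32_t b)
{
    const int32_t s = a + b;
    if (s > INT16_MAX) return INT16_MAX; /* clamp high instead of wrapping */
    if (s < INT16_MIN) return INT16_MIN; /* clamp low */
    return (int16_t)s;
}

static cplx16 dot_prod_16ic_sat(const cplx16* a, const cplx16* b, unsigned int n)
{
    cplx16 acc = {0, 0};
    unsigned int k;
    for (k = 0; k < n; k++)
        {
            /* (ar + j*ai)*(br + j*bi) = (ar*br - ai*bi) + j*(ar*bi + ai*br) */
            const int32_t re = (int32_t)a[k].r * b[k].r - (int32_t)a[k].i * b[k].i;
            const int32_t im = (int32_t)a[k].r * b[k].i + (int32_t)a[k].i * b[k].r;
            acc.r = sat_add16(acc.r, re);
            acc.i = sat_add16(acc.i, im);
        }
    return acc;
}
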
@@ -522,12 +522,12 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon(lv_16sc_t* result, for (index = 0; index < neon_iters; index++) { - b_val = vld2_s16((int16_t*)_in_common); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg + b_val = vld2_s16((int16_t*)_in_common); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg __VOLK_GNSSSDR_PREFETCH(_in_common + 8); for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - a_val = vld2_s16((int16_t*)&(_in_a[n_vec][index * 4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg - //__VOLK_GNSSSDR_PREFETCH(&_in_a[n_vec][index*4] + 8); + a_val = vld2_s16((int16_t*)&(_in_a[n_vec][index * 4])); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg + // __VOLK_GNSSSDR_PREFETCH(&_in_a[n_vec][index*4] + 8); // multiply the real*real and imag*imag to get real result // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r @@ -609,7 +609,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_vma(lv_16sc_t* res for (index = 0; index < neon_iters; index++) { - b_val = vld2_s16((int16_t*)_in_common); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg + b_val = vld2_s16((int16_t*)_in_common); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg __VOLK_GNSSSDR_PREFETCH(_in_common + 8); for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { @@ -690,7 +690,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_optvma(lv_16sc_t* for (index = 0; index < neon_iters; index++) { - b_val = vld2_s16((int16_t*)_in_common); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg + b_val = vld2_s16((int16_t*)_in_common); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg __VOLK_GNSSSDR_PREFETCH(_in_common + 8); for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { @@ -738,4 +738,4 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_optvma(lv_16sc_t* } #endif /* LV_HAVE_NEONV7 */ -#endif /*INCLUDED_volk_gnsssdr_16ic_xn_dot_prod_16ic_xn_H*/ +#endif /* INCLUDED_volk_gnsssdr_16ic_xn_dot_prod_16ic_xn_H */ diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_multiply_16ic.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_multiply_16ic.h index 73dacf270..9f875ae0a 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_multiply_16ic.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_multiply_16ic.h @@ -68,12 +68,12 @@ static inline void volk_gnsssdr_16ic_x2_multiply_16ic_generic(lv_16sc_t* result, unsigned int n; for (n = 0; n < num_points; n++) { - //r*a.r - i*a.i, i*a.r + r*a.i + // r*a.r - i*a.i, i*a.r + r*a.i result[n] = in_a[n] * in_b[n]; } } -#endif /*LV_HAVE_GENERIC*/ +#endif /* LV_HAVE_GENERIC */ #ifdef LV_HAVE_SSE2 @@ -93,10 +93,10 @@ static inline void volk_gnsssdr_16ic_x2_multiply_16ic_a_sse2(lv_16sc_t* out, con lv_16sc_t* _out = out; for (number = 0; number < sse_iters; number++) { - //std::complex memory structure: real part -> reinterpret_cast(a)[2*i] - //imaginary part -> reinterpret_cast(a)[2*i + 1] + // std::complex memory structure: real part -> reinterpret_cast(a)[2*i] + // imaginary part -> reinterpret_cast(a)[2*i + 1] // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r] - a = _mm_load_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg + a = _mm_load_si128((__m128i*)_in_a); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg b = _mm_load_si128((__m128i*)_in_b); c = 
_mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... @@ -147,10 +147,10 @@ static inline void volk_gnsssdr_16ic_x2_multiply_16ic_u_sse2(lv_16sc_t* out, con lv_16sc_t* _out = out; for (number = 0; number < sse_iters; number++) { - //std::complex memory structure: real part -> reinterpret_cast(a)[2*i] - //imaginary part -> reinterpret_cast(a)[2*i + 1] + // std::complex memory structure: real part -> reinterpret_cast(a)[2*i] + // imaginary part -> reinterpret_cast(a)[2*i + 1] // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r] - a = _mm_loadu_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg + a = _mm_loadu_si128((__m128i*)_in_a); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg b = _mm_loadu_si128((__m128i*)_in_b); c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... @@ -340,4 +340,4 @@ static inline void volk_gnsssdr_16ic_x2_multiply_16ic_neon(lv_16sc_t* out, const } #endif /* LV_HAVE_NEONV7*/ -#endif /*INCLUDED_volk_gnsssdr_16ic_x2_multiply_16ic_H*/ +#endif /* INCLUDED_volk_gnsssdr_16ic_x2_multiply_16ic_H */ diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn.h index 05c9a019a..9143dcf0a 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn.h @@ -89,33 +89,33 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_generic(lv_16sc } for (n = 0; n < num_points; n++) { - tmp16 = *in_common++; //if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); + tmp16 = *in_common++; // if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase); tmp16 = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32))); // Regenerate phase if (n % 256 == 0) { - //printf("Phase before regeneration %i: %f,%f Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase)); + // printf("Phase before regeneration %i: %f,%f Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase)); #ifdef __cplusplus (*phase) /= std::abs((*phase)); #else (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); #endif - //printf("Phase after regeneration %i: %f,%f Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase)); + // printf("Phase after regeneration %i: %f,%f Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase)); } (*phase) *= phase_inc; for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { lv_16sc_t tmp = tmp16 * in_a[n_vec][n]; - //lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(tmp16), lv_cimag(in_a[n_vec][n]))) , sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(tmp16), lv_creal(in_a[n_vec][n])))); + // lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(tmp16), lv_cimag(in_a[n_vec][n]))) , sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(tmp16), lv_creal(in_a[n_vec][n])))); result[n_vec] = lv_cmake(sat_adds16i(lv_creal(result[n_vec]), lv_creal(tmp)), 
sat_adds16i(lv_cimag(result[n_vec]), lv_cimag(tmp))); } } } -#endif /*LV_HAVE_GENERIC*/ +#endif /* LV_HAVE_GENERIC */ #ifdef LV_HAVE_GENERIC @@ -137,14 +137,14 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_generic_reload( { for (j = 0; j < ROTATOR_RELOAD; j++) { - tmp16 = *in_common++; //if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); + tmp16 = *in_common++; // if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase); tmp16 = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32))); (*phase) *= phase_inc; for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { lv_16sc_t tmp = tmp16 * in_a[n_vec][n * ROTATOR_RELOAD + j]; - //lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(tmp16), lv_cimag(in_a[n_vec][n]))) , sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(tmp16), lv_creal(in_a[n_vec][n])))); + // lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(tmp16), lv_cimag(in_a[n_vec][n]))) , sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(tmp16), lv_creal(in_a[n_vec][n])))); result[n_vec] = lv_cmake(sat_adds16i(lv_creal(result[n_vec]), lv_creal(tmp)), sat_adds16i(lv_cimag(result[n_vec]), lv_cimag(tmp))); } } @@ -152,27 +152,27 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_generic_reload( #ifdef __cplusplus (*phase) /= std::abs((*phase)); #else - //(*phase) /= cabsf((*phase)); + // (*phase) /= cabsf((*phase)); (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); #endif } for (j = 0; j < num_points % ROTATOR_RELOAD; j++) { - tmp16 = *in_common++; //if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); + tmp16 = *in_common++; // if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase); tmp16 = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32))); (*phase) *= phase_inc; for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { lv_16sc_t tmp = tmp16 * in_a[n_vec][(num_points / ROTATOR_RELOAD) * ROTATOR_RELOAD + j]; - //lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(tmp16), lv_cimag(in_a[n_vec][n]))) , sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(tmp16), lv_creal(in_a[n_vec][n])))); + // lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(tmp16), lv_cimag(in_a[n_vec][n]))) , sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(tmp16), lv_creal(in_a[n_vec][n])))); result[n_vec] = lv_cmake(sat_adds16i(lv_creal(result[n_vec]), lv_creal(tmp)), sat_adds16i(lv_cimag(result[n_vec]), lv_cimag(tmp))); } } } -#endif /*LV_HAVE_GENERIC*/ +#endif /* LV_HAVE_GENERIC */ #ifdef LV_HAVE_SSE3 @@ -228,9 +228,9 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc_ for (number = 0; number < sse_iters; number++) { // Phase rotation on operand in_common starts here: - //printf("generic phase %i: %f,%f\n", n*4,lv_creal(*phase),lv_cimag(*phase)); - pa = 
_mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg - //complex 32fc multiplication b=a*two_phase_acc_reg + // printf("generic phase %i: %f,%f\n", n*4,lv_creal(*phase),lv_cimag(*phase)); + pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg + // complex 32fc multiplication b=a*two_phase_acc_reg yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr @@ -239,7 +239,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc_ pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di pc1 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic - //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg + // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr @@ -247,11 +247,11 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc_ tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - //next two samples + // next two samples _in_common += 2; - pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg __VOLK_GNSSSDR_PREFETCH(_in_common + 8); - //complex 32fc multiplication b=a*two_phase_acc_reg + // complex 32fc multiplication b=a*two_phase_acc_reg yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr @@ -260,7 +260,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc_ pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di pc2 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic - //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg + // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr @@ -271,12 +271,12 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc_ // store four rotated in_common samples in the register b b = _mm_packs_epi32(pc1, pc2); // convert from 32ic to 16ic - //next two samples + // next two samples _in_common += 2; for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - a = 
_mm_load_si128((__m128i*)&(_in_a[n_vec][number * 4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg + a = _mm_load_si128((__m128i*)&(_in_a[n_vec][number * 4])); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... @@ -331,12 +331,12 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc_ two_phase_acc_reg = _mm_div_ps(two_phase_acc_reg, tmp2); _mm_store_ps((float*)two_phase_acc, two_phase_acc_reg); - //(*phase) = lv_cmake((float*)two_phase_acc[0], (float*)two_phase_acc[1]); + // (*phase) = lv_cmake((float*)two_phase_acc[0], (float*)two_phase_acc[1]); (*phase) = two_phase_acc[0]; for (n = sse_iters * 4; n < num_points; n++) { - tmp16 = in_common[n]; //printf("a_sse phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); + tmp16 = in_common[n]; // printf("a_sse phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase); tmp16 = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32))); (*phase) *= phase_inc; @@ -344,7 +344,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc_ for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { lv_16sc_t tmp = tmp16 * in_a[n_vec][n]; - //lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(tmp16), lv_cimag(in_a[n_vec][n]))) , sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(tmp16), lv_creal(in_a[n_vec][n])))); + // lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(tmp16), lv_cimag(in_a[n_vec][n]))) , sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(tmp16), lv_creal(in_a[n_vec][n])))); _out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); } @@ -411,9 +411,9 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(l for (j = 0; j < ROTATOR_RELOAD; j++) { // Phase rotation on operand in_common starts here: - //printf("generic phase %i: %f,%f\n", n*4,lv_creal(*phase),lv_cimag(*phase)); - pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg - //complex 32fc multiplication b=a*two_phase_acc_reg + // printf("generic phase %i: %f,%f\n", n*4,lv_creal(*phase),lv_cimag(*phase)); + pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg + // complex 32fc multiplication b=a*two_phase_acc_reg yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr @@ -422,7 +422,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(l pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di pc1 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic - //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg + // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg yl = _mm_moveldup_ps(two_phase_acc_reg); // Load 
yl with cr,cr,dr,dr yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr @@ -430,11 +430,11 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(l tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - //next two samples + // next two samples _in_common += 2; - pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg __VOLK_GNSSSDR_PREFETCH(_in_common + 8); - //complex 32fc multiplication b=a*two_phase_acc_reg + // complex 32fc multiplication b=a*two_phase_acc_reg yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr @@ -443,7 +443,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(l pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di pc2 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic - //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg + // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr @@ -454,12 +454,12 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(l // store four rotated in_common samples in the register b b = _mm_packs_epi32(pc1, pc2); // convert from 32ic to 16ic - //next two samples + // next two samples _in_common += 2; for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - a = _mm_load_si128((__m128i*)&(_in_a[n_vec][(number * ROTATOR_RELOAD + j) * 4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg + a = _mm_load_si128((__m128i*)&(_in_a[n_vec][(number * ROTATOR_RELOAD + j) * 4])); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... 
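
Each rotator hunk above repeats the same four-instruction SSE3 idiom for multiplying two pairs of packed complex floats, and the comments annotate its steps one register at a time. Collected in one place, and assuming the re/im swap that the surrounding (unshown) context assigns to tmp3 is _mm_shuffle_ps(x, x, 0xB1), the idiom looks like this (complex_mul_sse3 is a hypothetical helper, not a function in the library):

#include <pmmintrin.h> /* SSE3: _mm_moveldup_ps, _mm_movehdup_ps, _mm_addsub_ps */

/* x = (ar, ai, br, bi), y = (cr, ci, dr, di), both re/im interleaved */
static inline __m128 complex_mul_sse3(__m128 x, __m128 y)
{
    const __m128 yl = _mm_moveldup_ps(y);         /* cr,cr,dr,dr */
    const __m128 yh = _mm_movehdup_ps(y);         /* ci,ci,di,di */
    const __m128 t1 = _mm_mul_ps(x, yl);          /* ar*cr, ai*cr, br*dr, bi*dr */
    const __m128 t3 = _mm_shuffle_ps(x, x, 0xB1); /* ai,ar,bi,br */
    const __m128 t2 = _mm_mul_ps(t3, yh);         /* ai*ci, ar*ci, bi*di, br*di */
    return _mm_addsub_ps(t1, t2);                 /* ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di */
}

The kernels apply this idiom twice per step: once with y = two_phase_acc_reg to rotate two input samples, and once against two_phase_inc_reg to advance the accumulator itself by two sample periods.
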
@@ -488,8 +488,8 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(l for (j = 0; j < sse_iters % ROTATOR_RELOAD; j++) { - pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg - //complex 32fc multiplication b=a*two_phase_acc_reg + pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg + // complex 32fc multiplication b=a*two_phase_acc_reg yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr @@ -498,7 +498,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(l pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di pc1 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic - //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg + // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr @@ -506,11 +506,11 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(l tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - //next two samples + // next two samples _in_common += 2; - pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg __VOLK_GNSSSDR_PREFETCH(_in_common + 8); - //complex 32fc multiplication b=a*two_phase_acc_reg + // complex 32fc multiplication b=a*two_phase_acc_reg yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr @@ -519,7 +519,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(l pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di pc2 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic - //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg + // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr @@ -530,12 +530,12 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(l // store four rotated in_common samples in the register b b = _mm_packs_epi32(pc1, pc2); // convert from 32ic to 16ic - //next two samples + // next two samples _in_common += 2; for (n_vec = 0; 
n_vec < num_a_vectors; n_vec++) { - a = _mm_load_si128((__m128i*)&(_in_a[n_vec][((sse_iters / ROTATOR_RELOAD) * ROTATOR_RELOAD + j) * 4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg + a = _mm_load_si128((__m128i*)&(_in_a[n_vec][((sse_iters / ROTATOR_RELOAD) * ROTATOR_RELOAD + j) * 4])); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... @@ -582,12 +582,12 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(l two_phase_acc_reg = _mm_div_ps(two_phase_acc_reg, tmp2); _mm_store_ps((float*)two_phase_acc, two_phase_acc_reg); - //(*phase) = lv_cmake((float*)two_phase_acc[0], (float*)two_phase_acc[1]); + // (*phase) = lv_cmake((float*)two_phase_acc[0], (float*)two_phase_acc[1]); (*phase) = two_phase_acc[0]; for (n = sse_iters * 4; n < num_points; n++) { - tmp16 = in_common[n]; //printf("a_sse phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); + tmp16 = in_common[n]; // printf("a_sse phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase); tmp16 = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32))); (*phase) *= phase_inc; @@ -595,7 +595,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(l for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { lv_16sc_t tmp = tmp16 * in_a[n_vec][n]; - //lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(tmp16), lv_cimag(in_a[n_vec][n]))) , sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(tmp16), lv_creal(in_a[n_vec][n])))); + // lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(tmp16), lv_cimag(in_a[n_vec][n]))) , sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(tmp16), lv_creal(in_a[n_vec][n])))); _out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); } @@ -657,9 +657,9 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc_ for (number = 0; number < sse_iters; number++) { // Phase rotation on operand in_common starts here: - pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg __VOLK_GNSSSDR_PREFETCH(_in_common + 8); - //complex 32fc multiplication b=a*two_phase_acc_reg + // complex 32fc multiplication b=a*two_phase_acc_reg yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr @@ -668,7 +668,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc_ pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di pc1 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic - //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg + // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg yl = 
_mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr @@ -676,11 +676,11 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc_ tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - //next two samples + // next two samples _in_common += 2; - pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg __VOLK_GNSSSDR_PREFETCH(_in_common + 8); - //complex 32fc multiplication b=a*two_phase_acc_reg + // complex 32fc multiplication b=a*two_phase_acc_reg yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr @@ -689,7 +689,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc_ pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di pc2 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic - //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg + // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr @@ -700,12 +700,12 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc_ // store four rotated in_common samples in the register b b = _mm_packs_epi32(pc1, pc2); // convert from 32ic to 16ic - //next two samples + // next two samples _in_common += 2; for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - a = _mm_loadu_si128((__m128i*)&(_in_a[n_vec][number * 4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg + a = _mm_loadu_si128((__m128i*)&(_in_a[n_vec][number * 4])); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... 
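
The phase regeneration sprinkled through these kernels, (*phase) /= hypotf(...) in the generic paths and the _mm_div_ps(two_phase_acc_reg, tmp2) normalization after the SSE loops (where tmp2 is presumed to hold the accumulator magnitudes), compensates for a phasor that is only ever multiplied by phase_inc: rounding error makes its modulus drift away from 1.0, which would scale every correlator output. A scalar sketch of the rotate-and-renormalize pattern, under that assumption and with an illustrative function name:

#include <complex.h>
#include <math.h>

static void rotate_block(float complex* out, const float complex* in,
    float complex* phase, float complex phase_inc, unsigned int num_points)
{
    unsigned int n;
    for (n = 0; n < num_points; n++)
        {
            out[n] = in[n] * (*phase); /* rotate the sample */
            (*phase) *= phase_inc;     /* advance the phasor */
            if ((n % 256) == 0)
                {
                    /* pull the phasor back onto the unit circle */
                    (*phase) /= hypotf(crealf(*phase), cimagf(*phase));
                }
        }
}
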
@@ -824,8 +824,8 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc_ for (number = 0; number < avx2_iters; number++) { - a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg - //complex 32fc multiplication b=a*two_phase_acc_reg + a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg + // complex 32fc multiplication b=a*two_phase_acc_reg yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr @@ -834,7 +834,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc_ b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic - //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg + // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr @@ -842,11 +842,11 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc_ tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - //next two samples + // next two samples _in_common += 2; - a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg - //complex 32fc multiplication b=a*two_phase_acc_reg + // complex 32fc multiplication b=a*two_phase_acc_reg yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr @@ -855,7 +855,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc_ b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic - //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg + // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr @@ -866,8 +866,8 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc_ // store four output samples result1 = _mm_packs_epi32(c1, c2); // convert from 32ic to 16ic _in_common += 2; - a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), 
(float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg - //complex 32fc multiplication b=a*two_phase_acc_reg + a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg + // complex 32fc multiplication b=a*two_phase_acc_reg yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr @@ -876,7 +876,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc_ b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic - //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg + // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr @@ -884,11 +884,11 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc_ tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - //next two samples + // next two samples _in_common += 2; - a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg __VOLK_GNSSSDR_PREFETCH(_in_common + 16); - //complex 32fc multiplication b=a*two_phase_acc_reg + // complex 32fc multiplication b=a*two_phase_acc_reg yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr @@ -897,7 +897,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc_ b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic - //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg + // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr @@ -1036,8 +1036,8 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l { for (j = 0; j < ROTATOR_RELOAD; j++) { - a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg - //complex 32fc multiplication b=a*two_phase_acc_reg + a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // // load (2 
byte imag, 2 byte real) x 2 into 128 bits reg + // complex 32fc multiplication b=a*two_phase_acc_reg yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr @@ -1046,7 +1046,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic - //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg + // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr @@ -1054,11 +1054,11 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - //next two samples + // next two samples _in_common += 2; - a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg - //complex 32fc multiplication b=a*two_phase_acc_reg + // complex 32fc multiplication b=a*two_phase_acc_reg yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr @@ -1067,7 +1067,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic - //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg + // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr @@ -1078,8 +1078,8 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l // store four output samples result1 = _mm_packs_epi32(c1, c2); // convert from 32ic to 16ic _in_common += 2; - a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg - //complex 32fc multiplication b=a*two_phase_acc_reg + a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg + // complex 32fc multiplication b=a*two_phase_acc_reg yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di tmp1 = _mm_mul_ps(a, yl); // tmp1 = 
ar*cr,ai*cr,br*dr,bi*dr @@ -1088,7 +1088,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic - //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg + // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr @@ -1096,11 +1096,11 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - //next two samples + // next two samples _in_common += 2; - a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg __VOLK_GNSSSDR_PREFETCH(_in_common + 16); - //complex 32fc multiplication b=a*two_phase_acc_reg + // complex 32fc multiplication b=a*two_phase_acc_reg yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr @@ -1109,7 +1109,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic - //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg + // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr @@ -1152,8 +1152,8 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l for (j = 0; j < avx2_iters % ROTATOR_RELOAD; j++) { - a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg - //complex 32fc multiplication b=a*two_phase_acc_reg + a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg + // complex 32fc multiplication b=a*two_phase_acc_reg yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr @@ -1162,7 +1162,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic - //complex 32fc multiplication 
two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg + // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr @@ -1170,11 +1170,11 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - //next two samples + // next two samples _in_common += 2; - a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg - //complex 32fc multiplication b=a*two_phase_acc_reg + // complex 32fc multiplication b=a*two_phase_acc_reg yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr @@ -1183,7 +1183,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic - //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg + // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr @@ -1194,8 +1194,8 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l // store four output samples result1 = _mm_packs_epi32(c1, c2); // convert from 32ic to 16ic _in_common += 2; - a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg - //complex 32fc multiplication b=a*two_phase_acc_reg + a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg + // complex 32fc multiplication b=a*two_phase_acc_reg yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr @@ -1204,7 +1204,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic - //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg + // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di tmp1 = 
_mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr @@ -1212,11 +1212,11 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - //next two samples + // next two samples _in_common += 2; - a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // // load (2 byte imag, 2 byte real) x 2 into 128 bits reg __VOLK_GNSSSDR_PREFETCH(_in_common + 16); - //complex 32fc multiplication b=a*two_phase_acc_reg + // complex 32fc multiplication b=a*two_phase_acc_reg yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr @@ -1225,7 +1225,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic - //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg + // complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr @@ -1414,8 +1414,8 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon(lv_16sc_t* for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - a_val = vld2_s16((int16_t*)&(_in_a[n_vec][number * 4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg - //__VOLK_GNSSSDR_PREFETCH(&_in_a[n_vec][number*4] + 8); + a_val = vld2_s16((int16_t*)&(_in_a[n_vec][number * 4])); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg + // __VOLK_GNSSSDR_PREFETCH(&_in_a[n_vec][number*4] + 8); // multiply the real*real and imag*imag to get real result // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r @@ -1474,7 +1474,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon(lv_16sc_t* for (n = neon_iters * 4; n < num_points; n++) { - tmp16_ = in_common[n]; //printf("neon phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); + tmp16_ = in_common[n]; // printf("neon phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); tmp32_ = lv_cmake((float32_t)lv_creal(tmp16_), (float32_t)lv_cimag(tmp16_)) * (*phase); tmp16_ = lv_cmake((int16_t)rintf(lv_creal(tmp32_)), (int16_t)rintf(lv_cimag(tmp32_))); (*phase) *= phase_inc; @@ -1513,7 +1513,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_vma(lv_16s float arg_phase0 = cargf(*phase); float arg_phase_inc = cargf(phase_inc); float phase_est; - //printf("arg phase0: %f", arg_phase0); + // printf("arg phase0: %f", arg_phase0); lv_32fc_t ___phase4 = phase_inc * phase_inc * phase_inc * phase_inc; __VOLK_ATTR_ALIGNED(16) float32_t __phase4_real[4] = {lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4)}; @@ -1605,9 +1605,9 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_vma(lv_16s // Regenerate phase 
if ((number % 256) == 0) { - //printf("computed phase: %f\n", cos(cargf(lv_cmake(_phase_real[0],_phase_imag[0])))); + // printf("computed phase: %f\n", cos(cargf(lv_cmake(_phase_real[0],_phase_imag[0])))); phase_est = arg_phase0 + (number + 1) * 4 * arg_phase_inc; - //printf("Estimated phase: %f\n\n", cos(phase_est)); + // printf("Estimated phase: %f\n\n", cos(phase_est)); *phase = lv_cmake(cos(phase_est), sin(phase_est)); phase2 = (lv_32fc_t)(*phase) * phase_inc; @@ -1624,14 +1624,14 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_vma(lv_16s // Round = vmulq_f32(_phase_real, _phase_real); // Round = vmlaq_f32(Round, _phase_imag, _phase_imag); - // Round = vsqrtq_f32(Round);//printf("sqrt: %f \n", Round[0]); - //Round = vrsqrteq_f32(Round);printf("1/sqtr: %f \n",Round[0]); - //Round = vrecpeq_f32((Round); + // Round = vsqrtq_f32(Round);// printf("sqrt: %f \n", Round[0]); + // Round = vrsqrteq_f32(Round);printf("1/sqtr: %f \n",Round[0]); + // Round = vrecpeq_f32((Round); // _phase_real = vdivq_f32(_phase_real, Round); // _phase_imag = vdivq_f32(_phase_imag, Round); - //_phase_real = vmulq_f32(_phase_real, Round); - //_phase_imag = vmulq_f32(_phase_imag, Round); - //printf("After %i: %f,%f, %f\n\n", number, _phase_real[0], _phase_imag[0], sqrt(_phase_real[0]*_phase_real[0]+_phase_imag[0]*_phase_imag[0])); + // _phase_real = vmulq_f32(_phase_real, Round); + // _phase_imag = vmulq_f32(_phase_imag, Round); + // printf("After %i: %f,%f, %f\n\n", number, _phase_real[0], _phase_imag[0], sqrt(_phase_real[0]*_phase_real[0]+_phase_imag[0]*_phase_imag[0])); } for (n_vec = 0; n_vec < num_a_vectors; n_vec++) @@ -1671,7 +1671,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_vma(lv_16s for (n = neon_iters * 4; n < num_points; n++) { - tmp16_ = in_common[n]; //printf("neon phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); + tmp16_ = in_common[n]; // printf("neon phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); tmp32_ = lv_cmake((float32_t)lv_creal(tmp16_), (float32_t)lv_cimag(tmp16_)) * (*phase); tmp16_ = lv_cmake((int16_t)rintf(lv_creal(tmp32_)), (int16_t)rintf(lv_cimag(tmp32_))); (*phase) *= phase_inc; @@ -1804,9 +1804,9 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_optvma(lv_ // Regenerate phase if ((number % 256) == 0) { - //printf("computed phase: %f\n", cos(cargf(lv_cmake(_phase_real[0],_phase_imag[0])))); + // printf("computed phase: %f\n", cos(cargf(lv_cmake(_phase_real[0],_phase_imag[0])))); phase_est = arg_phase0 + (number + 1) * 4 * arg_phase_inc; - //printf("Estimated phase: %f\n\n", cos(phase_est)); + // printf("Estimated phase: %f\n\n", cos(phase_est)); *phase = lv_cmake(cos(phase_est), sin(phase_est)); phase2 = (lv_32fc_t)(*phase) * phase_inc; @@ -1860,7 +1860,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_optvma(lv_ for (n = neon_iters * 4; n < num_points; n++) { - tmp16_ = in_common[n]; //printf("neon phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); + tmp16_ = in_common[n]; // printf("neon phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); tmp32_ = lv_cmake((float32_t)lv_creal(tmp16_), (float32_t)lv_cimag(tmp16_)) * (*phase); tmp16_ = lv_cmake((int16_t)rintf(lv_creal(tmp32_)), (int16_t)rintf(lv_cimag(tmp32_))); (*phase) *= phase_inc; @@ -1874,4 +1874,4 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_optvma(lv_ #endif /* LV_HAVE_NEONV7 */ -#endif /*INCLUDED_volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_H*/ +#endif /* INCLUDED_volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_H */
diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_xn_resampler_16ic_xn.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_xn_resampler_16ic_xn.h index 4509d096d..36f997cc8 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_xn_resampler_16ic_xn.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_xn_resampler_16ic_xn.h @@ -82,7 +82,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_generic(lv_16sc_t** re { // resample code for current tap local_code_chip_index = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); - //Take into account that in multitap correlators, the shifts can be negative! + // Take into account that in multitap correlators, the shifts can be negative! if (local_code_chip_index < 0) local_code_chip_index += (int)code_length_chips * (abs(local_code_chip_index) / code_length_chips + 1); local_code_chip_index = local_code_chip_index % code_length_chips; result[current_correlator_tap][n] = local_code[local_code_chip_index]; @@ -90,7 +90,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_generic(lv_16sc_t** re } } -#endif /*LV_HAVE_GENERIC*/ +#endif /* LV_HAVE_GENERIC */ #ifdef LV_HAVE_SSE4_1 @@ -149,7 +149,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_sse4_1(lv_16sc_t** r { // resample code for current tap local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); - //Take into account that in multitap correlators, the shifts can be negative! + // Take into account that in multitap correlators, the shifts can be negative! if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); local_code_chip_index_ = local_code_chip_index_ % code_length_chips; _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; @@ -216,7 +216,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_sse4_1(lv_16sc_t** r { // resample code for current tap local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); - //Take into account that in multitap correlators, the shifts can be negative! + // Take into account that in multitap correlators, the shifts can be negative! if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); local_code_chip_index_ = local_code_chip_index_ % code_length_chips; _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; @@ -287,7 +287,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_sse3(lv_16sc_t** res { // resample code for current tap local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); - //Take into account that in multitap correlators, the shifts can be negative! + // Take into account that in multitap correlators, the shifts can be negative!
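
The wraparound line that this patch touches in every resampler variant exists because the C % operator keeps the sign of the dividend, so a negative tap shift (an early correlator) would otherwise produce a negative, out-of-bounds chip index. A minimal standalone check of the idiom (the values here are illustrative):

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    const int code_length_chips = 1023; /* e.g. a GPS L1 C/A code */
    int local_code_chip_index = -3;     /* negative due to a negative tap shift */

    /* In C, -3 % 1023 == -3, which is not a valid array index.  Adding a
     * large enough multiple of the code length first guarantees that the
     * final result of % lies in [0, code_length_chips). */
    if (local_code_chip_index < 0)
        {
            local_code_chip_index += code_length_chips * (abs(local_code_chip_index) / code_length_chips + 1);
        }
    local_code_chip_index = local_code_chip_index % code_length_chips;

    printf("%d\n", local_code_chip_index); /* prints 1020 */
    return 0;
}
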
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); local_code_chip_index_ = local_code_chip_index_ % code_length_chips; _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; @@ -358,7 +358,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_sse3(lv_16sc_t** res { // resample code for current tap local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); - //Take into account that in multitap correlators, the shifts can be negative! + // Take into account that in multitap correlators, the shifts can be negative! if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); local_code_chip_index_ = local_code_chip_index_ % code_length_chips; _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; @@ -436,7 +436,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_avx(lv_16sc_t** resu { // resample code for current tap local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); - //Take into account that in multitap correlators, the shifts can be negative! + // Take into account that in multitap correlators, the shifts can be negative! if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); local_code_chip_index_ = local_code_chip_index_ % code_length_chips; _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; @@ -514,7 +514,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_avx(lv_16sc_t** resu { // resample code for current tap local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); - //Take into account that in multitap correlators, the shifts can be negative! + // Take into account that in multitap correlators, the shifts can be negative! if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); local_code_chip_index_ = local_code_chip_index_ % code_length_chips; _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; @@ -567,7 +567,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_neon(lv_16sc_t** resul aux = vmulq_f32(code_phase_step_chips_reg, indexn); aux = vaddq_f32(aux, aux2); - //floor + // floor i = vcvtq_s32_f32(aux); fi = vcvtq_f32_s32(i); igx = vcgtq_f32(fi, aux); @@ -599,7 +599,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_neon(lv_16sc_t** resul __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][n], 1, 0); // resample code for current tap local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); - //Take into account that in multitap correlators, the shifts can be negative! + // Take into account that in multitap correlators, the shifts can be negative! 
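
The NEON hunk above builds a floor() out of conversions because vcvtq_s32_f32 truncates toward zero, which rounds the wrong way for negative code phases. The convert/convert-back/compare triple detects exactly those lanes; the masked fix-up itself sits outside this excerpt, so what follows is a scalar sketch of the intent, not a quote of the kernel:

/* floor built from truncation, lane-wise as in the NEON kernels:
 * truncate, convert back, and step down by one wherever truncation
 * overshot the input (only negative non-integer inputs do this). */
static int floor_via_truncate(float aux)
{
    int i = (int)aux;    /* vcvtq_s32_f32: truncates toward zero */
    float fi = (float)i; /* vcvtq_f32_s32 */
    if (fi > aux)        /* vcgtq_f32 sets the lane mask */
        {
            i -= 1; /* correct down to floor(aux) */
        }
    return i;
}

For example, floor_via_truncate(-0.2f) first truncates to 0, sees 0.0f > -0.2f, and returns -1.
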
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); local_code_chip_index_ = local_code_chip_index_ % code_length_chips; _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; @@ -611,4 +611,4 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_neon(lv_16sc_t** resul #endif -#endif /*INCLUDED_volk_gnsssdr_16ic_xn_resampler_16ic_xn_H*/ +#endif /* INCLUDED_volk_gnsssdr_16ic_xn_resampler_16ic_xn_H */ diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn.h index e79af11b2..187cca54e 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn.h @@ -73,7 +73,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_generic(lv_16sc_t** result, const lv_16sc_t* local_code, float* rem_code_phase_chips, float code_phase_step_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_output_samples) { int local_code_chip_index; - //fesetround(FE_TONEAREST); + // fesetround(FE_TONEAREST); int current_vector; unsigned int n; for (current_vector = 0; current_vector < num_out_vectors; current_vector++) @@ -89,7 +89,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_generic(lv_16sc_t } } -#endif /*LV_HAVE_GENERIC*/ +#endif /* LV_HAVE_GENERIC */ #ifdef LV_HAVE_SSE2 @@ -97,7 +97,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_generic(lv_16sc_t static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_a_sse2(lv_16sc_t** result, const lv_16sc_t* local_code, float* rem_code_phase_chips, float code_phase_step_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_output_samples) { - _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); //_MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO + _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); // _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO unsigned int number; const unsigned int quarterPoints = num_output_samples / 4; @@ -109,7 +109,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_a_sse2(lv_16sc_t* __m128i _code_length_chips, _code_length_chips_minus1; __m128 _code_phase_out, _code_phase_out_with_offset; - _code_phase_step_chips = _mm_load1_ps(&code_phase_step_chips); //load float to all four float values in m128 register + _code_phase_step_chips = _mm_load1_ps(&code_phase_step_chips); // load float to all four float values in m128 register __VOLK_ATTR_ALIGNED(16) int four_times_code_length_chips_minus1[4]; four_times_code_length_chips_minus1[0] = code_length_chips - 1; @@ -124,8 +124,8 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_a_sse2(lv_16sc_t* four_times_code_length_chips[2] = code_length_chips; four_times_code_length_chips[3] = code_length_chips; - _code_length_chips = _mm_load_si128((__m128i*)&four_times_code_length_chips); //load float to all four float values in m128 register - _code_length_chips_minus1 = _mm_load_si128((__m128i*)&four_times_code_length_chips_minus1); //load float to all four float values in m128 register + _code_length_chips = _mm_load_si128((__m128i*)&four_times_code_length_chips); 
// load float to all four float values in m128 register + _code_length_chips_minus1 = _mm_load_si128((__m128i*)&four_times_code_length_chips_minus1); // load float to all four float values in m128 register __m128i negative_indexes, overflow_indexes, _code_phase_out_int, _code_phase_out_int_neg, _code_phase_out_int_over; @@ -142,29 +142,29 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_a_sse2(lv_16sc_t* int sample_idx = 0; for (number = 0; number < quarterPoints; number++) { - //common to all outputs - _code_phase_out = _mm_mul_ps(_code_phase_step_chips, _4output_index); //compute the code phase point with the phase step + // common to all outputs + _code_phase_out = _mm_mul_ps(_code_phase_step_chips, _4output_index); // compute the code phase point with the phase step - //output vector dependent (different code phase offset) + // output vector dependent (different code phase offset) for (current_vector = 0; current_vector < num_out_vectors; current_vector++) { tmp_rem_code_phase_chips = rem_code_phase_chips[current_vector] - 0.5f; // adjust offset to perform correct rounding (chip transition at 0) - _rem_code_phase = _mm_load1_ps(&tmp_rem_code_phase_chips); //load float to all four float values in m128 register + _rem_code_phase = _mm_load1_ps(&tmp_rem_code_phase_chips); // load float to all four float values in m128 register - _code_phase_out_with_offset = _mm_add_ps(_code_phase_out, _rem_code_phase); //add the phase offset - _code_phase_out_int = _mm_cvtps_epi32(_code_phase_out_with_offset); //convert to integer + _code_phase_out_with_offset = _mm_add_ps(_code_phase_out, _rem_code_phase); // add the phase offset + _code_phase_out_int = _mm_cvtps_epi32(_code_phase_out_with_offset); // convert to integer - negative_indexes = _mm_cmplt_epi32(_code_phase_out_int, zero); //test for negative values - _code_phase_out_int_neg = _mm_add_epi32(_code_phase_out_int, _code_length_chips); //the negative values branch + negative_indexes = _mm_cmplt_epi32(_code_phase_out_int, zero); // test for negative values + _code_phase_out_int_neg = _mm_add_epi32(_code_phase_out_int, _code_length_chips); // the negative values branch _code_phase_out_int_neg = _mm_xor_si128(_code_phase_out_int, _mm_and_si128(negative_indexes, _mm_xor_si128(_code_phase_out_int_neg, _code_phase_out_int))); - overflow_indexes = _mm_cmpgt_epi32(_code_phase_out_int_neg, _code_length_chips_minus1); //test for overflow values - _code_phase_out_int_over = _mm_sub_epi32(_code_phase_out_int_neg, _code_length_chips); //the negative values branch + overflow_indexes = _mm_cmpgt_epi32(_code_phase_out_int_neg, _code_length_chips_minus1); // test for overflow values + _code_phase_out_int_over = _mm_sub_epi32(_code_phase_out_int_neg, _code_length_chips); // the negative values branch _code_phase_out_int_over = _mm_xor_si128(_code_phase_out_int_neg, _mm_and_si128(overflow_indexes, _mm_xor_si128(_code_phase_out_int_over, _code_phase_out_int_neg))); _mm_store_si128((__m128i*)local_code_chip_index, _code_phase_out_int_over); // Store the results back - //todo: optimize the local code lookup table with intrinsics, if possible + // todo: optimize the local code lookup table with intrinsics, if possible _result[current_vector][sample_idx] = local_code[local_code_chip_index[0]]; _result[current_vector][sample_idx + 1] = local_code[local_code_chip_index[1]]; _result[current_vector][sample_idx + 2] = local_code[local_code_chip_index[2]]; @@ -193,7 +193,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_a_sse2(lv_16sc_t* 
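
These fast resamplers depend on the rounding mode set at the top of each function: with _MM_ROUND_NEAREST active, subtracting 0.5 from the remainder phase before _mm_cvtps_epi32 makes the conversion behave like a floor away from exact chip boundaries, and the cmplt/cmpgt plus xor/and pairs then wrap out-of-range indices without branching. A scalar model of one index computation under those assumptions (an illustrative helper; rintf stands in for the SSE2 conversion):

#include <math.h>

static int fast_resampler_index(float code_phase_step_chips, int n,
    float rem_code_phase_chips, int code_length_chips)
{
    const float offset = rem_code_phase_chips - 0.5f; /* chip transition at 0 */
    int idx = (int)rintf(code_phase_step_chips * (float)n + offset);

    /* Branchless select r = a ^ (mask & (b ^ a)): yields b where the mask
     * is all-ones and a where it is zero.  This is the scalar form of the
     * _mm_xor_si128/_mm_and_si128 pattern used above. */
    const int mask_neg = -(idx < 0); /* like _mm_cmplt_epi32 */
    idx ^= (mask_neg & ((idx + code_length_chips) ^ idx)); /* wrap up */

    const int mask_over = -(idx > code_length_chips - 1); /* like _mm_cmpgt_epi32 */
    idx ^= (mask_over & ((idx - code_length_chips) ^ idx)); /* wrap down */

    return idx;
}
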
static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_u_sse2(lv_16sc_t** result, const lv_16sc_t* local_code, float* rem_code_phase_chips, float code_phase_step_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_output_samples) { - _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); //_MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO + _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); // _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO unsigned int number; const unsigned int quarterPoints = num_output_samples / 4; @@ -205,7 +205,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_u_sse2(lv_16sc_t* __m128i _code_length_chips, _code_length_chips_minus1; __m128 _code_phase_out, _code_phase_out_with_offset; - _code_phase_step_chips = _mm_load1_ps(&code_phase_step_chips); //load float to all four float values in m128 register + _code_phase_step_chips = _mm_load1_ps(&code_phase_step_chips); // load float to all four float values in m128 register __VOLK_ATTR_ALIGNED(16) int four_times_code_length_chips_minus1[4]; four_times_code_length_chips_minus1[0] = code_length_chips - 1; @@ -220,8 +220,8 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_u_sse2(lv_16sc_t* four_times_code_length_chips[2] = code_length_chips; four_times_code_length_chips[3] = code_length_chips; - _code_length_chips = _mm_loadu_si128((__m128i*)&four_times_code_length_chips); //load float to all four float values in m128 register - _code_length_chips_minus1 = _mm_loadu_si128((__m128i*)&four_times_code_length_chips_minus1); //load float to all four float values in m128 register + _code_length_chips = _mm_loadu_si128((__m128i*)&four_times_code_length_chips); // load float to all four float values in m128 register + _code_length_chips_minus1 = _mm_loadu_si128((__m128i*)&four_times_code_length_chips_minus1); // load float to all four float values in m128 register __m128i negative_indexes, overflow_indexes, _code_phase_out_int, _code_phase_out_int_neg, _code_phase_out_int_over; @@ -238,29 +238,29 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_u_sse2(lv_16sc_t* int sample_idx = 0; for (number = 0; number < quarterPoints; number++) { - //common to all outputs - _code_phase_out = _mm_mul_ps(_code_phase_step_chips, _4output_index); //compute the code phase point with the phase step + // common to all outputs + _code_phase_out = _mm_mul_ps(_code_phase_step_chips, _4output_index); // compute the code phase point with the phase step - //output vector dependent (different code phase offset) + // output vector dependent (different code phase offset) for (current_vector = 0; current_vector < num_out_vectors; current_vector++) { tmp_rem_code_phase_chips = rem_code_phase_chips[current_vector] - 0.5f; // adjust offset to perform correct rounding (chip transition at 0) - _rem_code_phase = _mm_load1_ps(&tmp_rem_code_phase_chips); //load float to all four float values in m128 register + _rem_code_phase = _mm_load1_ps(&tmp_rem_code_phase_chips); // load float to all four float values in m128 register - _code_phase_out_with_offset = _mm_add_ps(_code_phase_out, _rem_code_phase); //add the phase offset - _code_phase_out_int = _mm_cvtps_epi32(_code_phase_out_with_offset); //convert to integer + _code_phase_out_with_offset = _mm_add_ps(_code_phase_out, _rem_code_phase); // add the phase offset + _code_phase_out_int = _mm_cvtps_epi32(_code_phase_out_with_offset); // convert to integer - negative_indexes = _mm_cmplt_epi32(_code_phase_out_int, zero); 
//test for negative values - _code_phase_out_int_neg = _mm_add_epi32(_code_phase_out_int, _code_length_chips); //the negative values branch + negative_indexes = _mm_cmplt_epi32(_code_phase_out_int, zero); // test for negative values + _code_phase_out_int_neg = _mm_add_epi32(_code_phase_out_int, _code_length_chips); // the negative values branch _code_phase_out_int_neg = _mm_xor_si128(_code_phase_out_int, _mm_and_si128(negative_indexes, _mm_xor_si128(_code_phase_out_int_neg, _code_phase_out_int))); - overflow_indexes = _mm_cmpgt_epi32(_code_phase_out_int_neg, _code_length_chips_minus1); //test for overflow values - _code_phase_out_int_over = _mm_sub_epi32(_code_phase_out_int_neg, _code_length_chips); //the negative values branch + overflow_indexes = _mm_cmpgt_epi32(_code_phase_out_int_neg, _code_length_chips_minus1); // test for overflow values + _code_phase_out_int_over = _mm_sub_epi32(_code_phase_out_int_neg, _code_length_chips); // the negative values branch _code_phase_out_int_over = _mm_xor_si128(_code_phase_out_int_neg, _mm_and_si128(overflow_indexes, _mm_xor_si128(_code_phase_out_int_over, _code_phase_out_int_neg))); _mm_storeu_si128((__m128i*)local_code_chip_index, _code_phase_out_int_over); // Store the results back - //todo: optimize the local code lookup table with intrinsics, if possible + // todo: optimize the local code lookup table with intrinsics, if possible _result[current_vector][sample_idx] = local_code[local_code_chip_index[0]]; _result[current_vector][sample_idx + 1] = local_code[local_code_chip_index[1]]; _result[current_vector][sample_idx + 2] = local_code[local_code_chip_index[2]]; @@ -303,7 +303,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_neon(lv_16sc_t** float32x4_t _code_phase_out, _code_phase_out_with_offset; float32x4_t sign, PlusHalf, Round; - _code_phase_step_chips = vld1q_dup_f32(&code_phase_step_chips); //load float to all four float values in float32x4_t register + _code_phase_step_chips = vld1q_dup_f32(&code_phase_step_chips); // load float to all four float values in float32x4_t register __VOLK_ATTR_ALIGNED(16) int four_times_code_length_chips_minus1[4]; four_times_code_length_chips_minus1[0] = code_length_chips - 1; @@ -318,8 +318,8 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_neon(lv_16sc_t** four_times_code_length_chips[2] = code_length_chips; four_times_code_length_chips[3] = code_length_chips; - _code_length_chips = vld1q_s32((int32_t*)&four_times_code_length_chips); //load float to all four float values in float32x4_t register - _code_length_chips_minus1 = vld1q_s32((int32_t*)&four_times_code_length_chips_minus1); //load float to all four float values in float32x4_t register + _code_length_chips = vld1q_s32((int32_t*)&four_times_code_length_chips); // load float to all four float values in float32x4_t register + _code_length_chips_minus1 = vld1q_s32((int32_t*)&four_times_code_length_chips_minus1); // load float to all four float values in float32x4_t register int32x4_t _code_phase_out_int, _code_phase_out_int_neg, _code_phase_out_int_over; uint32x4_t negative_indexes, overflow_indexes; @@ -336,33 +336,33 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_neon(lv_16sc_t** int sample_idx = 0; for (number = 0; number < quarterPoints; number++) { - //common to all outputs - _code_phase_out = vmulq_f32(_code_phase_step_chips, _4output_index); //compute the code phase point with the phase step + // common to all outputs + _code_phase_out = vmulq_f32(_code_phase_step_chips, _4output_index); // 
compute the code phase point with the phase step - //output vector dependent (different code phase offset) + // output vector dependent (different code phase offset) for (current_vector = 0; current_vector < num_out_vectors; current_vector++) { tmp_rem_code_phase_chips = rem_code_phase_chips[current_vector] - 0.5f; // adjust offset to perform correct rounding (chip transition at 0) - _rem_code_phase = vld1q_dup_f32(&tmp_rem_code_phase_chips); //load float to all four float values in float32x4_t register + _rem_code_phase = vld1q_dup_f32(&tmp_rem_code_phase_chips); // load float to all four float values in float32x4_t register - _code_phase_out_with_offset = vaddq_f32(_code_phase_out, _rem_code_phase); //add the phase offset - //_code_phase_out_int = _mm_cvtps_epi32(_code_phase_out_with_offset); //convert to integer + _code_phase_out_with_offset = vaddq_f32(_code_phase_out, _rem_code_phase); // add the phase offset + // _code_phase_out_int = _mm_cvtps_epi32(_code_phase_out_with_offset); // convert to integer sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(_code_phase_out_with_offset), 31))); PlusHalf = vaddq_f32(_code_phase_out_with_offset, half); Round = vsubq_f32(PlusHalf, sign); _code_phase_out_int = vcvtq_s32_f32(Round); - negative_indexes = vcltq_s32(_code_phase_out_int, zero); //test for negative values - _code_phase_out_int_neg = vaddq_s32(_code_phase_out_int, _code_length_chips); //the negative values branch + negative_indexes = vcltq_s32(_code_phase_out_int, zero); // test for negative values + _code_phase_out_int_neg = vaddq_s32(_code_phase_out_int, _code_length_chips); // the negative values branch _code_phase_out_int_neg = veorq_s32(_code_phase_out_int, vandq_s32((int32x4_t)negative_indexes, veorq_s32(_code_phase_out_int_neg, _code_phase_out_int))); - overflow_indexes = vcgtq_s32(_code_phase_out_int_neg, _code_length_chips_minus1); //test for overflow values - _code_phase_out_int_over = vsubq_s32(_code_phase_out_int_neg, _code_length_chips); //the negative values branch + overflow_indexes = vcgtq_s32(_code_phase_out_int_neg, _code_length_chips_minus1); // test for overflow values + _code_phase_out_int_over = vsubq_s32(_code_phase_out_int_neg, _code_length_chips); // the negative values branch _code_phase_out_int_over = veorq_s32(_code_phase_out_int_neg, vandq_s32((int32x4_t)overflow_indexes, veorq_s32(_code_phase_out_int_over, _code_phase_out_int_neg))); vst1q_s32((int32_t*)local_code_chip_index, _code_phase_out_int_over); // Store the results back - //todo: optimize the local code lookup table with intrinsics, if possible + // todo: optimize the local code lookup table with intrinsics, if possible _result[current_vector][sample_idx] = local_code[local_code_chip_index[0]]; _result[current_vector][sample_idx + 1] = local_code[local_code_chip_index[1]]; _result[current_vector][sample_idx + 2] = local_code[local_code_chip_index[2]]; @@ -386,4 +386,4 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_neon(lv_16sc_t** #endif /* LV_HAVE_NEONV7 */ -#endif /*INCLUDED_volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_H*/ +#endif /* INCLUDED_volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_H */ diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_high_dynamics_resamplerxnpuppet_32f.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_high_dynamics_resamplerxnpuppet_32f.h index 3735cb47c..601665084 100644 --- 
a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_high_dynamics_resamplerxnpuppet_32f.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_high_dynamics_resamplerxnpuppet_32f.h @@ -256,33 +256,6 @@ static inline void volk_gnsssdr_32f_high_dynamics_resamplerxnpuppet_32f_u_avx(fl volk_gnsssdr_free(result_aux); } #endif -// -//#ifdef LV_HAVE_NEONV7 -//static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_neon(float* result, const float* local_code, unsigned int num_points) -//{ -// int code_length_chips = 2046; -// float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points); -// int num_out_vectors = 3; -// float rem_code_phase_chips = -0.234; -// int n; -// float shifts_chips[3] = {-0.1, 0.0, 0.1}; -// -// float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment()); -// for (n = 0; n < num_out_vectors; n++) -// { -// result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment()); -// } -// -// volk_gnsssdr_32f_xn_resampler_32f_xn_neon(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); -// -// memcpy((float*)result, (float*)result_aux[0], sizeof(float) * num_points); -// -// for (n = 0; n < num_out_vectors; n++) -// { -// volk_gnsssdr_free(result_aux[n]); -// } -// volk_gnsssdr_free(result_aux); -//} -//#endif + #endif // INCLUDED_volk_gnsssdr_32f_high_dynamics_resamplerpuppet_32f_H diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_sincos_32fc.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_sincos_32fc.h index 9c7f01227..4bd54c1be 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_sincos_32fc.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_sincos_32fc.h @@ -52,6 +52,8 @@ * \b Outputs * \li out: Vector of the form lv_32fc_t out[n] = lv_cmake(cos(in[n]), sin(in[n])) * + * Adapted from http://gruntthepeon.free.fr/ssemath/sse_mathfun.h, original code from Julien Pommier + * Based on algorithms from the cephes library http://www.netlib.org/cephes/ */ #ifndef INCLUDED_volk_gnsssdr_32f_sincos_32fc_H @@ -251,8 +253,7 @@ static inline void volk_gnsssdr_32f_sincos_32fc_a_sse4_1(lv_32fc_t* out, const f #ifdef LV_HAVE_SSE2 #include -/* Adapted from http://gruntthepeon.free.fr/ssemath/sse_mathfun.h, original code from Julien Pommier */ -/* Based on algorithms from the cephes library http://www.netlib.org/cephes/ */ + static inline void volk_gnsssdr_32f_sincos_32fc_a_sse2(lv_32fc_t* out, const float* in, unsigned int num_points) { lv_32fc_t* bPtr = out; @@ -421,8 +422,7 @@ static inline void volk_gnsssdr_32f_sincos_32fc_a_sse2(lv_32fc_t* out, const flo #ifdef LV_HAVE_SSE2 #include -/* Adapted from http://gruntthepeon.free.fr/ssemath/sse_mathfun.h, original code from Julien Pommier */ -/* Based on algorithms from the cephes library http://www.netlib.org/cephes/ */ + static inline void volk_gnsssdr_32f_sincos_32fc_u_sse2(lv_32fc_t* out, const float* in, unsigned int num_points) { lv_32fc_t* bPtr = out; @@ -644,8 +644,7 @@ static inline void volk_gnsssdr_32f_sincos_32fc_generic_fxpt(lv_32fc_t* out, con #ifdef LV_HAVE_NEONV7 #include -/* Adapted from http://gruntthepeon.free.fr/ssemath/neon_mathfun.h, original code from 
Julien Pommier */ -/* Based on algorithms from the cephes library http://www.netlib.org/cephes/ */ + static inline void volk_gnsssdr_32f_sincos_32fc_neon(lv_32fc_t* out, const float* in, unsigned int num_points) { lv_32fc_t* bPtr = out; diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_xn_high_dynamics_resampler_32f_xn.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_xn_high_dynamics_resampler_32f_xn.h index 7c9d6fd7c..284ef14e2 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_xn_high_dynamics_resampler_32f_xn.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_xn_high_dynamics_resampler_32f_xn.h @@ -83,7 +83,7 @@ static inline void volk_gnsssdr_32f_xn_high_dynamics_resampler_32f_xn_generic(fl int local_code_chip_index; int current_correlator_tap; unsigned int n; - //first correlator + // first correlator for (n = 0; n < num_points; n++) { // resample code for first tap @@ -94,7 +94,7 @@ static inline void volk_gnsssdr_32f_xn_high_dynamics_resampler_32f_xn_generic(fl result[0][n] = local_code[local_code_chip_index]; } - //adjacent correlators + // adjacent correlators unsigned int shift_samples = 0; for (current_correlator_tap = 1; current_correlator_tap < num_out_vectors; current_correlator_tap++) { @@ -618,11 +618,11 @@ static inline void volk_gnsssdr_32f_xn_high_dynamics_resampler_32f_xn_u_avx(floa #endif // // -//#ifdef LV_HAVE_NEONV7 -//#include +// #ifdef LV_HAVE_NEONV7 +// #include // -//static inline void volk_gnsssdr_32f_xn_high_dynamics_resampler_32f_xn_neon(float** result, const float* local_code, float rem_code_phase_chips, float code_phase_step_chips, float* shifts_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_points) -//{ +// static inline void volk_gnsssdr_32f_xn_high_dynamics_resampler_32f_xn_neon(float** result, const float* local_code, float rem_code_phase_chips, float code_phase_step_chips, float* shifts_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_points) +// { // float** _result = result; // const unsigned int neon_iters = num_points / 4; // int current_correlator_tap; @@ -662,7 +662,7 @@ static inline void volk_gnsssdr_32f_xn_high_dynamics_resampler_32f_xn_u_avx(floa // aux = vmulq_f32(code_phase_step_chips_reg, indexn); // aux = vaddq_f32(aux, aux2); // -// //floor +// // floor // i = vcvtq_s32_f32(aux); // fi = vcvtq_f32_s32(i); // igx = vcgtq_f32(fi, aux); @@ -700,8 +700,8 @@ static inline void volk_gnsssdr_32f_xn_high_dynamics_resampler_32f_xn_u_avx(floa // _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; // } // } -//} +// } // -//#endif +// #endif -#endif /*INCLUDED_volk_gnsssdr_32f_xn_high_dynamics_resampler_32f_xn_H*/ +#endif /* INCLUDED_volk_gnsssdr_32f_xn_high_dynamics_resampler_32f_xn_H */ diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_xn_resampler_32f_xn.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_xn_resampler_32f_xn.h index d6a1eb094..77f7bd2cb 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_xn_resampler_32f_xn.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_xn_resampler_32f_xn.h @@ -85,7 +85,7 @@ static inline void 
volk_gnsssdr_32f_xn_resampler_32f_xn_generic(float** result, { // resample code for current tap local_code_chip_index = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); - //Take into account that in multitap correlators, the shifts can be negative! + // Take into account that in multitap correlators, the shifts can be negative! if (local_code_chip_index < 0) local_code_chip_index += (int)code_length_chips * (abs(local_code_chip_index) / code_length_chips + 1); local_code_chip_index = local_code_chip_index % code_length_chips; result[current_correlator_tap][n] = local_code[local_code_chip_index]; @@ -93,7 +93,7 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_generic(float** result, } } -#endif /*LV_HAVE_GENERIC*/ +#endif /* LV_HAVE_GENERIC */ #ifdef LV_HAVE_SSE3 @@ -156,7 +156,7 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_a_sse3(float** result, c { // resample code for current tap local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); - //Take into account that in multitap correlators, the shifts can be negative! + // Take into account that in multitap correlators, the shifts can be negative! if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); local_code_chip_index_ = local_code_chip_index_ % code_length_chips; _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; @@ -227,7 +227,7 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_u_sse3(float** result, c { // resample code for current tap local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); - //Take into account that in multitap correlators, the shifts can be negative! + // Take into account that in multitap correlators, the shifts can be negative! if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); local_code_chip_index_ = local_code_chip_index_ % code_length_chips; _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; @@ -293,7 +293,7 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_a_sse4_1(float** result, { // resample code for current tap local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); - //Take into account that in multitap correlators, the shifts can be negative! + // Take into account that in multitap correlators, the shifts can be negative! if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); local_code_chip_index_ = local_code_chip_index_ % code_length_chips; _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; @@ -360,7 +360,7 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_u_sse4_1(float** result, { // resample code for current tap local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); - //Take into account that in multitap correlators, the shifts can be negative! + // Take into account that in multitap correlators, the shifts can be negative! 
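
Every SSE3/SSE4.1/AVX/NEON body in this file unrolls the same generic loop shown in the context lines: compute each output sample's code phase, floor it, wrap it into the code period, and look the chip up. For reference, a compact standalone rendering of that logic (parameter names mirror the kernels; this is a sketch, not a drop-in replacement):

#include <math.h>
#include <stdlib.h>

static void resampler_xn_scalar(float **result, const float *local_code,
    float rem_code_phase_chips, float code_phase_step_chips,
    const float *shifts_chips, unsigned int code_length_chips,
    int num_out_vectors, unsigned int num_points)
{
    for (int tap = 0; tap < num_out_vectors; tap++)
        {
            for (unsigned int n = 0; n < num_points; n++)
                {
                    int idx = (int)floor(code_phase_step_chips * (float)n + shifts_chips[tap] - rem_code_phase_chips);
                    /* shifts can be negative in multitap correlators */
                    if (idx < 0)
                        {
                            idx += (int)code_length_chips * (abs(idx) / (int)code_length_chips + 1);
                        }
                    idx = idx % (int)code_length_chips;
                    result[tap][n] = local_code[idx];
                }
        }
}
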
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); local_code_chip_index_ = local_code_chip_index_ % code_length_chips; _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; @@ -438,7 +438,7 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_a_avx(float** result, co { // resample code for current tap local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); - //Take into account that in multitap correlators, the shifts can be negative! + // Take into account that in multitap correlators, the shifts can be negative! if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); local_code_chip_index_ = local_code_chip_index_ % code_length_chips; _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; @@ -516,7 +516,7 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_u_avx(float** result, co { // resample code for current tap local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); - //Take into account that in multitap correlators, the shifts can be negative! + // Take into account that in multitap correlators, the shifts can be negative! if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); local_code_chip_index_ = local_code_chip_index_ % code_length_chips; _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; @@ -571,7 +571,7 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_neon(float** result, con aux = vmulq_f32(code_phase_step_chips_reg, indexn); aux = vaddq_f32(aux, aux2); - //floor + // floor i = vcvtq_s32_f32(aux); fi = vcvtq_f32_s32(i); igx = vcgtq_f32(fi, aux); @@ -603,7 +603,7 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_neon(float** result, con __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][n], 1, 0); // resample code for current tap local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); - //Take into account that in multitap correlators, the shifts can be negative! + // Take into account that in multitap correlators, the shifts can be negative! 
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); local_code_chip_index_ = local_code_chip_index_ % code_length_chips; _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn.h index f9587d06e..9503792c0 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn.h @@ -74,7 +74,7 @@ #include #include #include -//#include +// #include #ifdef LV_HAVE_GENERIC @@ -89,18 +89,18 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic(lv_32f } for (n = 0; n < num_points; n++) { - tmp32_1 = *in_common++ * (*phase); //if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); + tmp32_1 = *in_common++ * (*phase); // if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); // Regenerate phase if (n % 256 == 0) { - //printf("Phase before regeneration %i: %f,%f Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase)); + // printf("Phase before regeneration %i: %f,%f Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase)); #ifdef __cplusplus (*phase) /= std::abs((*phase)); #else (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); #endif - //printf("Phase after regeneration %i: %f,%f Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase)); + // printf("Phase after regeneration %i: %f,%f Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase)); } (*phase) *= phase_inc; @@ -112,7 +112,7 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic(lv_32f } } -#endif /*LV_HAVE_GENERIC*/ +#endif /* LV_HAVE_GENERIC */ #ifdef LV_HAVE_GENERIC @@ -145,7 +145,7 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic_reload #ifdef __cplusplus (*phase) /= std::abs((*phase)); #else - //(*phase) /= cabsf((*phase)); + // (*phase) /= cabsf((*phase)); (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); #endif } @@ -162,7 +162,7 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic_reload } } -#endif /*LV_HAVE_GENERIC*/ +#endif /* LV_HAVE_GENERIC */ #ifdef LV_HAVE_AVX #include diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_16ic.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_16ic.h index c834e032e..047ea6a92 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_16ic.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_16ic.h @@ -179,7 +179,7 @@ static inline void volk_gnsssdr_32fc_convert_16ic_u_avx2(lv_16sc_t* outputVector int16_t* outputVectorPtr = (int16_t*)outputVector; float aux; unsigned int i; - const float min_val = (float)SHRT_MIN; ///todo Something off here, compiler does not perform right cast + const float min_val = (float)SHRT_MIN; /// todo Something off here, compiler does not perform right cast const float 
max_val = (float)SHRT_MAX; __m256 inputVal1, inputVal2; @@ -342,7 +342,7 @@ static inline void volk_gnsssdr_32fc_convert_16ic_a_avx2(lv_16sc_t* outputVector int16_t* outputVectorPtr = (int16_t*)outputVector; float aux; unsigned int i; - const float min_val = (float)SHRT_MIN; ///todo Something off here, compiler does not perform right cast + const float min_val = (float)SHRT_MIN; /// todo Something off here, compiler does not perform right cast const float max_val = (float)SHRT_MAX; __m256 inputVal1, inputVal2; diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn.h index 0d1331bb5..b3f29415f 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn.h @@ -89,18 +89,18 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_generic(lv_32fc } for (n = 0; n < num_points; n++) { - tmp32_1 = *in_common++ * (*phase); //if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); + tmp32_1 = *in_common++ * (*phase); // if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); // Regenerate phase if (n % 256 == 0) { - //printf("Phase before regeneration %i: %f,%f Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase)); + // printf("Phase before regeneration %i: %f,%f Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase)); #ifdef __cplusplus (*phase) /= std::abs((*phase)); #else (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); #endif - //printf("Phase after regeneration %i: %f,%f Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase)); + // printf("Phase after regeneration %i: %f,%f Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase)); } (*phase) *= phase_inc; @@ -145,7 +145,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_generic_reload( #ifdef __cplusplus (*phase) /= std::abs((*phase)); #else - //(*phase) /= cabsf((*phase)); + // (*phase) /= cabsf((*phase)); (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); #endif } @@ -225,7 +225,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_sse3(lv_32fc_ yl = _mm_moveldup_ps(z1); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(z1); - //next two samples + // next two samples _in_common += 2; for (n_vec = 0; n_vec < num_a_vectors; n_vec++) @@ -337,7 +337,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_sse3(lv_32fc_ yl = _mm_moveldup_ps(z1); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(z1); - //next two samples + // next two samples _in_common += 2; for (n_vec = 0; n_vec < num_a_vectors; n_vec++) @@ -457,7 +457,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_t yl = _mm256_moveldup_ps(z); // Load yl with cr,cr,dr,dr yh = _mm256_movehdup_ps(z); - //next two samples + // next two samples _in_common += 4; for (n_vec = 0; n_vec < num_a_vectors; n_vec++) @@ -587,7 +587,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_t yl = _mm256_moveldup_ps(z); // Load yl with cr,cr,dr,dr yh = _mm256_movehdup_ps(z); - //next two samples + // next two samples _in_common += 4; for (n_vec = 0; n_vec 
< num_a_vectors; n_vec++) diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_xn_resampler_32fc_xn.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_xn_resampler_32fc_xn.h index 133bdbb80..9b8f3a27d 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_xn_resampler_32fc_xn.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_xn_resampler_32fc_xn.h @@ -82,7 +82,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_generic(lv_32fc_t** re { // resample code for current tap local_code_chip_index = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); - //Take into account that in multitap correlators, the shifts can be negative! + // Take into account that in multitap correlators, the shifts can be negative! if (local_code_chip_index < 0) local_code_chip_index += (int)code_length_chips * (abs(local_code_chip_index) / code_length_chips + 1); local_code_chip_index = local_code_chip_index % code_length_chips; result[current_correlator_tap][n] = local_code[local_code_chip_index]; @@ -90,7 +90,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_generic(lv_32fc_t** re } } -#endif /*LV_HAVE_GENERIC*/ +#endif /* LV_HAVE_GENERIC */ #ifdef LV_HAVE_SSE3 @@ -153,7 +153,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_sse3(lv_32fc_t** res { // resample code for current tap local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); - //Take into account that in multitap correlators, the shifts can be negative! + // Take into account that in multitap correlators, the shifts can be negative! if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); local_code_chip_index_ = local_code_chip_index_ % code_length_chips; _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; @@ -224,7 +224,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_sse3(lv_32fc_t** res { // resample code for current tap local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); - //Take into account that in multitap correlators, the shifts can be negative! + // Take into account that in multitap correlators, the shifts can be negative! if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); local_code_chip_index_ = local_code_chip_index_ % code_length_chips; _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; @@ -290,7 +290,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_sse4_1(lv_32fc_t** r { // resample code for current tap local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); - //Take into account that in multitap correlators, the shifts can be negative! + // Take into account that in multitap correlators, the shifts can be negative! 
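
The regeneration step in the rotator kernels a few hunks back exists because repeatedly multiplying the phasor by phase_inc in single precision lets its modulus drift away from 1, which would slowly scale every correlator output. Dividing by the modulus every 256 samples (hypotf in C, std::abs in C++) bounds that error. A scalar sketch of the loop's shape (an illustrative function; the real kernels also accumulate the dot products):

#include <math.h>

static void rotate_block_scalar(float *re, float *im, unsigned int num_points,
    float inc_r, float inc_i, float *phase_r, float *phase_i)
{
    for (unsigned int n = 0; n < num_points; n++)
        {
            const float out_r = re[n] * (*phase_r) - im[n] * (*phase_i); /* rotate sample */
            const float out_i = im[n] * (*phase_r) + re[n] * (*phase_i);
            re[n] = out_r;
            im[n] = out_i;

            if (n % 256 == 0) /* renormalize the phasor to unit modulus */
                {
                    const float mod = hypotf(*phase_r, *phase_i);
                    *phase_r /= mod;
                    *phase_i /= mod;
                }

            const float next_r = (*phase_r) * inc_r - (*phase_i) * inc_i; /* advance */
            const float next_i = (*phase_i) * inc_r + (*phase_r) * inc_i;
            *phase_r = next_r;
            *phase_i = next_i;
        }
}
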
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); local_code_chip_index_ = local_code_chip_index_ % code_length_chips; _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; @@ -357,7 +357,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_sse4_1(lv_32fc_t** r { // resample code for current tap local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); - //Take into account that in multitap correlators, the shifts can be negative! + // Take into account that in multitap correlators, the shifts can be negative! if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); local_code_chip_index_ = local_code_chip_index_ % code_length_chips; _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; @@ -435,7 +435,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx(lv_32fc_t** resu { // resample code for current tap local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); - //Take into account that in multitap correlators, the shifts can be negative! + // Take into account that in multitap correlators, the shifts can be negative! if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); local_code_chip_index_ = local_code_chip_index_ % code_length_chips; _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; @@ -513,7 +513,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx(lv_32fc_t** resu { // resample code for current tap local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); - //Take into account that in multitap correlators, the shifts can be negative! + // Take into account that in multitap correlators, the shifts can be negative! if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); local_code_chip_index_ = local_code_chip_index_ % code_length_chips; _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; @@ -558,13 +558,13 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx2(lv_32fc_t** res __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0); __VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3); aux = _mm256_fmadd_ps(code_phase_step_chips_reg, indexn, aux2); - //aux = _mm256_add_ps(aux, aux2); + // aux = _mm256_add_ps(aux, aux2); // floor aux = _mm256_floor_ps(aux); // fmod c = _mm256_div_ps(aux, code_length_chips_reg_f); - //_mm_fmsub_ps(c, code_length_chips_reg_f, aux) + // _mm_fmsub_ps(c, code_length_chips_reg_f, aux) i = _mm256_cvttps_epi32(c); cTrunc = _mm256_cvtepi32_ps(i); base = _mm256_fnmadd_ps(cTrunc, code_length_chips_reg_f, aux); @@ -592,7 +592,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx2(lv_32fc_t** res { // resample code for current tap local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); - //Take into account that in multitap correlators, the shifts can be negative! + // Take into account that in multitap correlators, the shifts can be negative! 
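
The AVX2 hunk above wraps the code phase without an integer modulo: divide by the code length, truncate, and subtract back with one fused negative multiply-add. In scalar form (equivalent to fmodf for the non-negative phases the kernel produces after its floor step):

/* base = aux - trunc(aux / len) * len, i.e. the div/cvtt/fnmadd triple. */
static float wrap_code_phase(float aux, float code_length_chips)
{
    const float c = aux / code_length_chips;  /* _mm256_div_ps */
    const float c_trunc = (float)(int)c;      /* _mm256_cvttps_epi32 + _mm256_cvtepi32_ps */
    return aux - c_trunc * code_length_chips; /* _mm256_fnmadd_ps(c_trunc, len, aux) */
}
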
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); local_code_chip_index_ = local_code_chip_index_ % code_length_chips; _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; @@ -637,8 +637,8 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx2(lv_32fc_t** res __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0); __VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3); aux = _mm256_fmadd_ps(code_phase_step_chips_reg, indexn, aux2); - //aux = _mm256_mul_ps(code_phase_step_chips_reg, indexn); - //aux = _mm256_add_ps(aux, aux2); + // aux = _mm256_mul_ps(code_phase_step_chips_reg, indexn); + // aux = _mm256_add_ps(aux, aux2); // floor aux = _mm256_floor_ps(aux); @@ -671,7 +671,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx2(lv_32fc_t** res { // resample code for current tap local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); - //Take into account that in multitap correlators, the shifts can be negative! + // Take into account that in multitap correlators, the shifts can be negative! if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); local_code_chip_index_ = local_code_chip_index_ % code_length_chips; _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; @@ -726,7 +726,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_neon(lv_32fc_t** resul aux = vmulq_f32(code_phase_step_chips_reg, indexn); aux = vaddq_f32(aux, aux2); - //floor + // floor i = vcvtq_s32_f32(aux); fi = vcvtq_f32_s32(i); igx = vcgtq_f32(fi, aux); @@ -758,7 +758,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_neon(lv_32fc_t** resul __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][n], 1, 0); // resample code for current tap local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); - //Take into account that in multitap correlators, the shifts can be negative! + // Take into account that in multitap correlators, the shifts can be negative! 
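
The prefetch hints in these loops follow GCC's __builtin_prefetch(addr, rw, locality) convention: rw = 1 marks the upcoming access as a write, and locality ranges from 0 (no temporal reuse, do not keep in cache) to 3 (high reuse, keep in all cache levels). A hedged illustration of the pattern (the macro name here is made up; on non-GCC builds it would expand to nothing):

#if defined __GNUC__
#define EXAMPLE_PREFETCH_W(addr, locality) __builtin_prefetch((addr), 1, (locality))
#else
#define EXAMPLE_PREFETCH_W(addr, locality)
#endif

/* Inside a hot loop, mirroring the kernels above: the next result slot is
 * written once and not revisited (locality 0), while the small index
 * scratch buffer is reused on every iteration (locality 3). */
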
                     if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
                     local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
                     _result[current_correlator_tap][n] = local_code[local_code_chip_index_];
@@ -769,4 +769,4 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_neon(lv_32fc_t** resul
 
 #endif
 
-#endif /*INCLUDED_volk_gnsssdr_32fc_xn_resampler_32fc_xn_H*/
+#endif /* INCLUDED_volk_gnsssdr_32fc_xn_resampler_32fc_xn_H */
diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_index_max_16u.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_index_max_16u.h
index aab393eb8..dd2179ea1 100644
--- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_index_max_16u.h
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_index_max_16u.h
@@ -121,7 +121,7 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_avx2(unsigned int* target, co
     }
 }
 
-#endif /*LV_HAVE_AVX2*/
+#endif /* LV_HAVE_AVX2 */
 
 #ifdef LV_HAVE_AVX
 
@@ -156,7 +156,7 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_avx(unsigned int* target, con
             compareResultslo = _mm_cmpgt_epi8(maxValues, lo);
             compareResultshi = _mm_cmpgt_epi8(maxValues, hi);
-            //compareResults = _mm256_set_m128i(compareResultshi , compareResultslo); //not defined in some versions of immintrin.h
+            // compareResults = _mm256_set_m128i(compareResultshi , compareResultslo); //not defined in some versions of immintrin.h
             compareResults = _mm256_insertf128_si256(_mm256_castsi128_si256(compareResultslo), (compareResultshi), 1);
 
             if (!_mm256_testc_si256(compareResults, ones))
@@ -437,7 +437,7 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_avx(unsigned int* target, con
             compareResultslo = _mm_cmpgt_epi8(maxValues, lo);
             compareResultshi = _mm_cmpgt_epi8(maxValues, hi);
-            //compareResults = _mm256_set_m128i(compareResultshi , compareResultslo); //not defined in some versions of immintrin.h
+            // compareResults = _mm256_set_m128i(compareResultshi , compareResultslo); //not defined in some versions of immintrin.h
             compareResults = _mm256_insertf128_si256(_mm256_castsi128_si256(compareResultslo), (compareResultshi), 1);
 
             if (!_mm256_testc_si256(compareResults, ones))
diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_max_s8i.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_max_s8i.h
index dac863e3b..3422c65b8 100644
--- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_max_s8i.h
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_max_s8i.h
@@ -313,7 +313,7 @@ static inline void volk_gnsssdr_8i_max_s8i_a_avx2(char* target, const char* src0
         {
             currentValues = _mm256_load_si256((__m256i*)inputPtr);
             compareResults = _mm256_max_epi8(maxValues, currentValues);
-            maxValues = compareResults; //_mm256_blendv_epi8(currentValues, maxValues, compareResults);
+            maxValues = compareResults; // _mm256_blendv_epi8(currentValues, maxValues, compareResults);
             inputPtr += 32;
         }
 
diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_conjugate_8ic.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_conjugate_8ic.h
index 90471514a..72fb144ed 100644
--- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_conjugate_8ic.h
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_conjugate_8ic.h
@@ -113,7 +113,7 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_u_avx(lv_8sc_t* cVector, const
             tmp128lo = _mm_add_epi8(tmp128lo, conjugator2);
             tmp128hi = _mm256_extractf128_si256(_mm256_castps_si256(tmp), 1);
             tmp128hi = _mm_add_epi8(tmp128hi, conjugator2);
-            //tmp = _mm256_set_m128i(tmp128hi , tmp128lo); //not defined in some versions of immintrin.h
+            // tmp = _mm256_set_m128i(tmp128hi , tmp128lo); //not defined in some versions of immintrin.h
             tmp = _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(tmp128lo), (tmp128hi), 1));
             _mm256_storeu_ps((float*)c, tmp);
 
@@ -230,7 +230,7 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_a_avx(lv_8sc_t* cVector, const
             tmp128lo = _mm_add_epi8(tmp128lo, conjugator2);
             tmp128hi = _mm256_extractf128_si256(_mm256_castps_si256(tmp), 1);
             tmp128hi = _mm_add_epi8(tmp128hi, conjugator2);
-            //tmp = _mm256_set_m128i(tmp128hi , tmp128lo); //not defined in some versions of immintrin.h
+            // tmp = _mm256_set_m128i(tmp128hi , tmp128lo); //not defined in some versions of immintrin.h
             tmp = _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(tmp128lo), (tmp128hi), 1));
             _mm256_store_ps((float*)c, tmp);
 
diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_magnitude_squared_8i.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_magnitude_squared_8i.h
index 6a3e928d7..2c4cfa503 100644
--- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_magnitude_squared_8i.h
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_magnitude_squared_8i.h
@@ -114,46 +114,6 @@ static inline void volk_gnsssdr_8ic_magnitude_squared_8i_u_sse3(char* magnitudeV
 }
 #endif /* LV_HAVE_SSSE3 */
 
-//#ifdef LV_HAVE_SSE
-//#include <xmmintrin.h>
-//
-//static inline void volk_gnsssdr_8ic_magnitude_squared_8i_u_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
-//    unsigned int number = 0;
-//    const unsigned int quarterPoints = num_points / 4;
-//
-//    const float* complexVectorPtr = (float*)complexVector;
-//    float* magnitudeVectorPtr = magnitudeVector;
-//
-//    __m128 cplxValue1, cplxValue2, iValue, qValue, result;
-//    for(;number < quarterPoints; number++){
-//        cplxValue1 = _mm_loadu_ps(complexVectorPtr);
-//        complexVectorPtr += 4;
-//
-//        cplxValue2 = _mm_loadu_ps(complexVectorPtr);
-//        complexVectorPtr += 4;
-//
-//        // Arrange in i1i2i3i4 format
-//        iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
-//        // Arrange in q1q2q3q4 format
-//        qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
-//
-//        iValue = _mm_mul_ps(iValue, iValue); // Square the I values
-//        qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values
-//
-//        result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values
-//
-//        _mm_storeu_ps(magnitudeVectorPtr, result);
-//        magnitudeVectorPtr += 4;
-//    }
-//
-//    number = quarterPoints * 4;
-//    for(; number < num_points; number++){
-//        float val1Real = *complexVectorPtr++;
-//        float val1Imag = *complexVectorPtr++;
-//        *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
-//    }
-//}
-//#endif /* LV_HAVE_SSE */
 
 #ifdef LV_HAVE_GENERIC
 
@@ -228,46 +188,6 @@ static inline void volk_gnsssdr_8ic_magnitude_squared_8i_a_sse3(char* magnitudeV
 }
 #endif /* LV_HAVE_SSSE3 */
 
-//#ifdef LV_HAVE_SSE
-//#include <xmmintrin.h>
-//
-//static inline void volk_gnsssdr_8ic_magnitude_squared_8i_a_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
-//    unsigned int number = 0;
-//    const unsigned int quarterPoints = num_points / 4;
-//
-//    const float* complexVectorPtr = (float*)complexVector;
-//    float* magnitudeVectorPtr = magnitudeVector;
-//
-//    __m128 cplxValue1, cplxValue2, iValue, qValue, result;
-//    for(;number < quarterPoints; number++){
-//        cplxValue1 = _mm_load_ps(complexVectorPtr);
-//        complexVectorPtr += 4;
-//
-//        cplxValue2 = _mm_load_ps(complexVectorPtr);
-//        complexVectorPtr += 4;
-//
-//        // Arrange in i1i2i3i4 format
-//        iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
-//        // Arrange in q1q2q3q4 format
-//        qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
-//
-//        iValue = _mm_mul_ps(iValue, iValue); // Square the I values
-//        qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values
-//
-//        result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values
-//
-//        _mm_store_ps(magnitudeVectorPtr, result);
-//        magnitudeVectorPtr += 4;
-//    }
-//
-//    number = quarterPoints * 4;
-//    for(; number < num_points; number++){
-//        float val1Real = *complexVectorPtr++;
-//        float val1Imag = *complexVectorPtr++;
-//        *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
-//    }
-//}
-//#endif /* LV_HAVE_SSE */
 
 #ifdef LV_HAVE_ORC
 
diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_s32f_sincos_32fc.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_s32f_sincos_32fc.h
index 41e59e525..740119b8c 100644
--- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_s32f_sincos_32fc.h
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_s32f_sincos_32fc.h
@@ -56,6 +56,8 @@
  * \li out: Vector of the form lv_32fc_t out[n] = lv_cmake(cos(in[n]), sin(in[n]))
  * \li phase: Pointer to a float containing the final phase, in radians.
  *
+ * Adapted from http://gruntthepeon.free.fr/ssemath/sse_mathfun.h, original code from Julien Pommier
+ * Based on algorithms from the cephes library http://www.netlib.org/cephes/
  */
 
@@ -69,8 +71,7 @@
 #ifdef LV_HAVE_SSE2
 #include <emmintrin.h>
 
-/* Adapted from http://gruntthepeon.free.fr/ssemath/sse_mathfun.h, original code from Julien Pommier */
-/* Based on algorithms from the cephes library http://www.netlib.org/cephes/ */
+
 static inline void volk_gnsssdr_s32f_sincos_32fc_a_sse2(lv_32fc_t *out, const float phase_inc, float *phase, unsigned int num_points)
 {
     lv_32fc_t *bPtr = out;
@@ -225,8 +226,7 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_a_sse2(lv_32fc_t *out, const fl
 #ifdef LV_HAVE_SSE2
 #include <emmintrin.h>
 
-/* Adapted from http://gruntthepeon.free.fr/ssemath/sse_mathfun.h, original code from Julien Pommier */
-/* Based on algorithms from the cephes library http://www.netlib.org/cephes/ */
+
 static inline void volk_gnsssdr_s32f_sincos_32fc_u_sse2(lv_32fc_t *out, const float phase_inc, float *phase, unsigned int num_points)
 {
     lv_32fc_t *bPtr = out;
@@ -459,8 +459,7 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_generic_fxpt(lv_32fc_t *out, co
 #ifdef LV_HAVE_AVX2
 #include <immintrin.h>
 
-/* Based on algorithms from the cephes library http://www.netlib.org/cephes/
- * Adapted to AVX2 by Carles Fernandez, based on original SSE2 code by Julien Pommier*/
+
 static inline void volk_gnsssdr_s32f_sincos_32fc_a_avx2(lv_32fc_t *out, const float phase_inc, float *phase, unsigned int num_points)
 {
     lv_32fc_t *bPtr = out;
@@ -647,8 +646,7 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_a_avx2(lv_32fc_t *out, const fl
 #ifdef LV_HAVE_AVX2
 #include <immintrin.h>
 
-/* Based on algorithms from the cephes library http://www.netlib.org/cephes/
- * Adapted to AVX2 by Carles Fernandez, based on original SSE2 code by Julien Pommier*/
+
 static inline void volk_gnsssdr_s32f_sincos_32fc_u_avx2(lv_32fc_t *out, const float phase_inc, float *phase, unsigned int num_points)
 {
     lv_32fc_t *bPtr = out;
@@ -835,8 +833,7 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_u_avx2(lv_32fc_t *out, const fl
 #ifdef LV_HAVE_NEONV7
 #include <arm_neon.h>
 
-/* Adapted from http://gruntthepeon.free.fr/ssemath/neon_mathfun.h, original code from Julien Pommier */
-/* Based on algorithms from the cephes library http://www.netlib.org/cephes/ */
+
 static inline void volk_gnsssdr_s32f_sincos_32fc_neon(lv_32fc_t *out, const float phase_inc, float *phase, unsigned int num_points)
 {
     lv_32fc_t *bPtr = out;

From c579aed27c12bc53f5b87e9108bd53a49b27aea5 Mon Sep 17 00:00:00 2001
From: Carles Fernandez
Date: Mon, 19 Aug 2019 12:21:43 +0200
Subject: [PATCH 2/3] Fix portability warnings

---
 src/core/libs/supl/asn-rrlp/INTEGER.c | 5 +++--
 src/core/libs/supl/asn-supl/INTEGER.c | 5 +++--
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/core/libs/supl/asn-rrlp/INTEGER.c b/src/core/libs/supl/asn-rrlp/INTEGER.c
index 9d6ff9d28..a3ede3905 100644
--- a/src/core/libs/supl/asn-rrlp/INTEGER.c
+++ b/src/core/libs/supl/asn-rrlp/INTEGER.c
@@ -7,6 +7,7 @@
 #include <INTEGER.h>
 #include <asn_codecs_prim.h> /* Encoder and decoder of a primitive type */
 #include <errno.h>
+#include <inttypes.h>
 
 /*
  * INTEGER basic type description.
@@ -146,7 +147,7 @@ INTEGER__dump(asn_TYPE_descriptor_t *td, const INTEGER_t *st, asn_app_consume_by
 			scr = (char *)alloca(scrsize);
 			if(plainOrXER == 0)
 				ret = snprintf(scr, scrsize,
-					"%lld (%s)", accum, el->enum_name);
+					"%+"PRId64"(%s)", accum, el->enum_name);
 			else
 				ret = snprintf(scr, scrsize,
 					"<%s/>", el->enum_name);
@@ -160,7 +161,7 @@ INTEGER__dump(asn_TYPE_descriptor_t *td, const INTEGER_t *st, asn_app_consume_by
 		scr = scratch;
 		ret = snprintf(scr, scrsize,
 			(specs && specs->field_unsigned)
-			?"%llu":"%lld", accum);
+			?"%"PRIu64:"%+"PRId64, accum);
 	}
 	assert(ret > 0 && (size_t)ret < scrsize);
 	return (cb(scr, ret, app_key) < 0) ? -1 : ret;
diff --git a/src/core/libs/supl/asn-supl/INTEGER.c b/src/core/libs/supl/asn-supl/INTEGER.c
index 956b2bc4b..30fb5e8bc 100644
--- a/src/core/libs/supl/asn-supl/INTEGER.c
+++ b/src/core/libs/supl/asn-supl/INTEGER.c
@@ -7,6 +7,7 @@
 #include <INTEGER.h>
 #include <asn_codecs_prim.h> /* Encoder and decoder of a primitive type */
 #include <errno.h>
+#include <inttypes.h>
 
 /*
  * INTEGER basic type description.
@@ -146,7 +147,7 @@ INTEGER__dump(asn_TYPE_descriptor_t *td, const INTEGER_t *st, asn_app_consume_by
 			scr = (char *)alloca(scrsize);
 			if(plainOrXER == 0)
 				ret = snprintf(scr, scrsize,
-					"%lld (%s)", accum, el->enum_name);
+					"%+"PRId64"(%s)", accum, el->enum_name);
 			else
 				ret = snprintf(scr, scrsize,
 					"<%s/>", el->enum_name);
@@ -160,7 +161,7 @@ INTEGER__dump(asn_TYPE_descriptor_t *td, const INTEGER_t *st, asn_app_consume_by
 		scr = scratch;
 		ret = snprintf(scr, scrsize,
 			(specs && specs->field_unsigned)
-			?"%llu":"%lld", accum);
+			?"%"PRIu64:"%+"PRId64, accum);
 	}
 	assert(ret > 0 && (size_t)ret < scrsize);
 	return (cb(scr, ret, app_key) < 0) ? -1 : ret;

From 730769d5f3f909e20b65093963fd10a53481bd52 Mon Sep 17 00:00:00 2001
From: Carles Fernandez
Date: Mon, 19 Aug 2019 13:11:13 +0200
Subject: [PATCH 3/3] Fix API usage errors

---
 src/algorithms/libs/rtklib/rtklib_rtkpos.cc |  2 +-
 src/algorithms/libs/rtklib/rtklib_stream.cc | 19 ++++++++++---------
 2 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/src/algorithms/libs/rtklib/rtklib_rtkpos.cc b/src/algorithms/libs/rtklib/rtklib_rtkpos.cc
index 95a3dd72c..992faf97c 100644
--- a/src/algorithms/libs/rtklib/rtklib_rtkpos.cc
+++ b/src/algorithms/libs/rtklib/rtklib_rtkpos.cc
@@ -2340,7 +2340,7 @@ int valpos(rtk_t *rtk, const double *v, const double *R, const int *vflg,
             type = (vflg[i] >> 4) & 0xF;
             freq = vflg[i] & 0xF;
             stype = type == 0 ? 'L' : (type == 1 ? 'L' : 'C');
-            errmsg(rtk, "large residual (sat=%2d-%2d %s%d v=%6.3f sig=%.3f)\n",
+            errmsg(rtk, "large residual (sat=%2d-%2d %c%d v=%6.3f sig=%.3f)\n",
                 sat1, sat2, stype, freq + 1, v[i], std::sqrt(R[i + i * nv]));
         }
 #if 0 /* omitted v.2.4.0 */
diff --git a/src/algorithms/libs/rtklib/rtklib_stream.cc b/src/algorithms/libs/rtklib/rtklib_stream.cc
index c0fc632c2..cf52eaae0 100644
--- a/src/algorithms/libs/rtklib/rtklib_stream.cc
+++ b/src/algorithms/libs/rtklib/rtklib_stream.cc
@@ -58,6 +58,7 @@
 #include
 #include
 #include
+#include <cinttypes>
 #include
 #include
 #include
@@ -461,7 +462,7 @@ void closefile(file_t *file)
     {
         return;
     }
-    tracet(3, "closefile: fp=%d\n", file->fp);
+    tracet(3, "closefile: fp=%" PRIdPTR "\n", file->fp);
     closefile_(file);
     free(file);
 }
@@ -472,7 +473,7 @@ void swapfile(file_t *file, gtime_t time, char *msg)
 {
     char openpath[MAXSTRPATH];
 
-    tracet(3, "swapfile: fp=%d time=%s\n", file->fp, time_str(time, 0));
+    tracet(3, "swapfile: fp=%" PRIdPTR " time=%s\n", file->fp, time_str(time, 0));
 
     /* return if old swap file open */
     if (file->fp_tmp || file->fp_tag_tmp)
@@ -500,7 +501,7 @@
 /* close old swap file -------------------------------------------------------*/
 void swapclose(file_t *file)
 {
-    tracet(3, "swapclose: fp_tmp=%d\n", file->fp_tmp);
+    tracet(3, "swapclose: fp_tmp=%" PRIdPTR "\n", file->fp_tmp);
     if (file->fp_tmp)
         {
             fclose(file->fp_tmp);
@@ -534,7 +535,7 @@ int readfile(file_t *file, unsigned char *buff, int nmax, char *msg)
     {
         return 0;
     }
-    tracet(4, "readfile: fp=%d nmax=%d\n", file->fp, nmax);
+    tracet(4, "readfile: fp=%" PRIdPTR " nmax=%d\n", file->fp, nmax);
 
     if (file->fp == stdin)
     {
@@ -617,7 +618,7 @@
             sprintf(msg, "end");
         }
     }
-    tracet(5, "readfile: fp=%d nr=%d fpos=%d\n", file->fp, nr, file->fpos);
+    tracet(5, "readfile: fp=%" PRIdPTR " nr=%d fpos=%zd\n", file->fp, nr, file->fpos);
     return nr;
 }
@@ -640,7 +641,7 @@ int writefile(file_t *file, unsigned char *buff, int n, char *msg)
     {
         return 0;
    }
-    tracet(3, "writefile: fp=%d n=%d\n", file->fp, n);
+    tracet(3, "writefile: fp=%" PRIdPTR " n=%d\n", file->fp, n);
 
     wtime = utc2gpst(timeget()); /* write time in gpst */
@@ -693,7 +694,7 @@
             fflush(file->fp_tag_tmp);
         }
     }
-    tracet(5, "writefile: fp=%d ns=%d tick=%5d fpos=%d\n", file->fp, ns, tick, fpos);
+    tracet(5, "writefile: fp=%" PRIdPTR " ns=%d tick=%5d fpos=%zd\n", file->fp, ns, tick, fpos);
 
     return static_cast<int>(ns);
 }
@@ -1497,7 +1498,7 @@ int reqntrip_s(ntrip_t *ntrip, char *msg)
         return 0;
     }
 
-    tracet(2, "reqntrip_s: send request state=%d ns=%d\n", ntrip->state, p - buff);
+    tracet(2, "reqntrip_s: send request state=%d ns=%" PRIdPTR "\n", ntrip->state, p - buff);
     tracet(5, "reqntrip_s: n=%d buff=\n%s\n", p - buff, buff);
     ntrip->state = 1;
     return 1;
@@ -1535,7 +1536,7 @@ int reqntrip_c(ntrip_t *ntrip, char *msg)
         return 0;
     }
 
-    tracet(2, "reqntrip_c: send request state=%d ns=%d\n", ntrip->state, p - buff);
+    tracet(2, "reqntrip_c: send request state=%d ns=%" PRIdPTR "\n", ntrip->state, p - buff);
     tracet(5, "reqntrip_c: n=%d buff=\n%s\n", p - buff, buff);
    ntrip->state = 1;
     return 1;
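
The same two-step wrap appears in every resampler tail loop touched by the first patch: push a possibly negative chip index up by whole code periods, then reduce it modulo the code length. A scalar sketch of that arithmetic (wrap_code_index is a hypothetical name, not a kernel in the library):

#include <stdlib.h>

/* Hypothetical helper mirroring the tail loops above: bring a possibly
   negative chip index into the range [0, code_length_chips). */
static int wrap_code_index(int index, int code_length_chips)
{
    if (index < 0)
        {
            /* add enough whole code periods to make the index non-negative */
            index += code_length_chips * (abs(index) / code_length_chips + 1);
        }
    return index % code_length_chips;
}

For example, index = -5 with code_length_chips = 1023 yields 1018, which the plain % operator cannot deliver on its own because C's % keeps the sign of the dividend.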
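
The AVX2 hunks compute an equivalent modulo without integer division: divide by the code length, truncate the quotient through the float-to-int-to-float round trip, and subtract the product back off with a fused negative multiply-add. A scalar model of that "fmod" sequence, with illustrative names:

#include <math.h>

/* Scalar model of the vector sequence above:
   c      = aux / code_length          -> _mm256_div_ps
   cTrunc = trunc(c)                   -> _mm256_cvttps_epi32 + _mm256_cvtepi32_ps
   base   = aux - cTrunc * code_length -> _mm256_fnmadd_ps(cTrunc, len, aux) */
static float chip_index_mod(float aux, float code_length)
{
    const float c_trunc = truncf(aux / code_length);
    return aux - c_trunc * code_length;
}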
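
Several kernels keep a commented-out _mm256_set_m128i call ("not defined in some versions of immintrin.h") and inline the portable replacement instead. The replacement pattern, lifted into a macro for clarity (the macro name is illustrative, not part of VOLK):

#include <immintrin.h>

/* Equivalent of _mm256_set_m128i(hi, lo) for toolchains whose
   immintrin.h does not provide it, as inlined in the kernels above. */
#define VOLK_GNSSSDR_SET_M128I(hi, lo) \
    _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1)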
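
Patch 2 replaces the %lld/%llu conversions with the <inttypes.h> PRI macros, which expand to the correct conversion specifier for a fixed-width 64-bit argument on every platform. A minimal standalone illustration, assuming 64-bit accumulators as the PRId64 choice implies:

#include <inttypes.h>
#include <stdio.h>

int main(void)
{
    int64_t accum = -1234567890123LL;
    uint64_t uaccum = 1234567890123ULL;
    /* adjacent string literals concatenate around the macros at compile time */
    printf("%+" PRId64 " (%" PRIu64 ")\n", accum, uaccum);
    return 0;
}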
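
Patch 3 prints FILE* handles and pointer differences through PRIdPTR. Strictly speaking, PRIdPTR expects an intptr_t argument, so a fully conforming call converts the pointer first; a sketch of that detail (trace_fp is a hypothetical helper, not rtklib API):

#include <inttypes.h>
#include <stdio.h>

/* Hypothetical helper: print a FILE* the way the traces above intend,
   converting to intptr_t so the argument matches the PRIdPTR format. */
static void trace_fp(FILE *fp)
{
    printf("fp=%" PRIdPTR "\n", (intptr_t)(void *)fp);
}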