diff --git a/CMakeLists.txt b/CMakeLists.txt index aaabf7aa8..116d62866 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -735,8 +735,8 @@ if(NOT ARMADILLO_FOUND) message(STATUS " Armadillo will be downloaded and built automatically ") message(STATUS " when doing 'make'. ") - set(armadillo_RELEASE 6.500.4) - set(armadillo_MD5 "a39f27197d24b3d25437fab6bb1d118f") + set(armadillo_RELEASE 6.600.5) + set(armadillo_MD5 "649a6d0ed528c2e39c1102b43710103f") ExternalProject_Add( armadillo-${armadillo_RELEASE} diff --git a/README.md b/README.md index 9a9687691..dc419957d 100644 --- a/README.md +++ b/README.md @@ -73,9 +73,9 @@ In case you do not want to use PyBOMBS and prefer to build and install GNU Radio $ sudo apt-get install libopenblas-dev liblapack-dev # For Debian/Ubuntu/LinuxMint $ sudo yum install lapack-devel blas-devel # For Fedora/CentOS/RHEL $ sudo zypper install lapack-devel blas-devel # For OpenSUSE -$ wget http://sourceforge.net/projects/arma/files/armadillo-6.500.4.tar.gz -$ tar xvfz armadillo-6.500.4.tar.gz -$ cd armadillo-6.500.4 +$ wget http://sourceforge.net/projects/arma/files/armadillo-6.600.5.tar.gz +$ tar xvfz armadillo-6.600.5.tar.gz +$ cd armadillo-6.600.5 $ cmake . $ make $ sudo make install diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_s32fc_x2_rotator_16ic.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_s32fc_x2_rotator_16ic.h index 832b34a08..2bc8d1aba 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_s32fc_x2_rotator_16ic.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_s32fc_x2_rotator_16ic.h @@ -218,7 +218,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3(lv_16sc_t* out _out += 4; } - _mm_storeu_ps((float*)two_phase_acc, two_phase_acc_reg); + _mm_store_ps((float*)two_phase_acc, two_phase_acc_reg); (*phase) = two_phase_acc[0]; for (unsigned int i = sse_iters * 4; i < num_points; ++i) @@ -369,7 +369,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3_reload(lv_16sc _out += 4; } - _mm_storeu_ps((float*)two_phase_acc, two_phase_acc_reg); + _mm_store_ps((float*)two_phase_acc, two_phase_acc_reg); (*phase) = two_phase_acc[0]; for (unsigned int i = sse_iters * 4; i < num_points; ++i) @@ -470,7 +470,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3(lv_16sc_t* out _out += 4; } - _mm_storeu_ps((float*)two_phase_acc, two_phase_acc_reg); + _mm_store_ps((float*)two_phase_acc, two_phase_acc_reg); (*phase) = two_phase_acc[0]; for (unsigned int i = sse_iters * 4; i < num_points; ++i) @@ -620,7 +620,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3_reload(lv_16sc _out += 4; } - _mm_storeu_ps((float*)two_phase_acc, two_phase_acc_reg); + _mm_store_ps((float*)two_phase_acc, two_phase_acc_reg); (*phase) = two_phase_acc[0]; for (unsigned int i = sse_iters * 4; i < num_points; ++i) diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_dot_prod_16ic.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_dot_prod_16ic.h index 1dc13b49a..1d1c0beea 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_dot_prod_16ic.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_dot_prod_16ic.h @@ -290,6 +290,7 @@ 
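The four `_mm_storeu_ps` to `_mm_store_ps` replacements above are safe because the scratch buffer that receives the packed phase accumulator is declared with a 16-byte alignment attribute in these kernels, so the aligned store cannot fault and avoids the unaligned-store path. A minimal standalone sketch of the pattern (illustrative only, not part of the patch):

    #include <pmmintrin.h>   /* SSE3, as used by the _sse3 kernels */

    /* Spill a packed phase accumulator through an explicitly aligned buffer.
     * The real kernels use an lv_32fc_t[2] buffer; plain floats show the same point. */
    static float spill_first_component(__m128 two_phase_acc_reg)
    {
        __attribute__((aligned(16))) float two_phase_acc[4];
        _mm_store_ps(two_phase_acc, two_phase_acc_reg);   /* aligned store is valid here */
        return two_phase_acc[0];
    }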
static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_u_axv2(lv_16sc_t* out, con result = _mm256_or_si256(realcacc, imagcacc); _mm256_storeu_si256((__m256i*)dotProductVector, result); // Store the results back into the dot product vector + _mm256_zeroupper(); for (i = 0; i < 8; ++i) { @@ -365,6 +366,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_a_axv2(lv_16sc_t* out, con result = _mm256_or_si256(realcacc, imagcacc); _mm256_store_si256((__m256i*)dotProductVector, result); // Store the results back into the dot product vector + _mm256_zeroupper(); for (i = 0; i < 8; ++i) { diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_dot_prod_16ic_xn.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_dot_prod_16ic_xn.h index 0480a8532..dfdf13d3a 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_dot_prod_16ic_xn.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_dot_prod_16ic_xn.h @@ -369,6 +369,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_avx2(lv_16sc_t* resul volk_gnsssdr_free(realcacc); volk_gnsssdr_free(imagcacc); } + _mm256_zeroupper(); for (int n_vec = 0; n_vec < num_a_vectors; n_vec++) { @@ -461,6 +462,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_avx2(lv_16sc_t* resul volk_gnsssdr_free(realcacc); volk_gnsssdr_free(imagcacc); } + _mm256_zeroupper(); for (int n_vec = 0; n_vec < num_a_vectors; n_vec++) { diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_multiply_16ic.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_multiply_16ic.h index 50df6d4cf..aead40de8 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_multiply_16ic.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_multiply_16ic.h @@ -225,7 +225,7 @@ static inline void volk_gnsssdr_16ic_x2_multiply_16ic_u_avx2(lv_16sc_t* out, con _in_b += 8; _out += 8; } - + _mm256_zeroupper(); number = avx2_points * 8; for(;number < num_points; number++) { @@ -279,7 +279,7 @@ static inline void volk_gnsssdr_16ic_x2_multiply_16ic_a_avx2(lv_16sc_t* out, con _in_b += 8; _out += 8; } - + _mm256_zeroupper(); number = avx2_points * 8; for(;number < num_points; number++) { diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn.h index 48129502a..91b3da8d8 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn.h @@ -71,11 +71,10 @@ #include #include - #include #include #include -#include +//#include #ifdef LV_HAVE_GENERIC @@ -719,7 +718,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc_ a = _mm_or_si128(realcacc[n_vec], imagcacc[n_vec]); - _mm_storeu_si128((__m128i*)dotProductVector, a); // Store the results back into the dot product vector + _mm_store_si128((__m128i*)dotProductVector, a); // Store the results back into the dot product vector dotProduct = 
lv_cmake(0,0); for (int i = 0; i < 4; ++i) { @@ -731,7 +730,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc_ volk_gnsssdr_free(realcacc); volk_gnsssdr_free(imagcacc); - _mm_storeu_ps((float*)two_phase_acc, two_phase_acc_reg); + _mm_store_ps((float*)two_phase_acc, two_phase_acc_reg); (*phase) = two_phase_acc[0]; for(unsigned int n = sse_iters * 4; n < num_points; n++) @@ -751,6 +750,522 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc_ #endif /* LV_HAVE_SSE3 */ +#ifdef LV_HAVE_AVX2 +#include + +static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points) +{ + const unsigned int avx2_iters = num_points / 8; + const lv_16sc_t** _in_a = in_a; + const lv_16sc_t* _in_common = in_common; + lv_16sc_t* _out = result; + + lv_16sc_t tmp16; + lv_32fc_t tmp32; + + __VOLK_ATTR_ALIGNED(32) lv_16sc_t dotProductVector[8]; + lv_16sc_t dotProduct = lv_cmake(0,0); + + __m256i* realcacc = (__m256i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256i), volk_gnsssdr_get_alignment()); + __m256i* imagcacc = (__m256i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256i), volk_gnsssdr_get_alignment()); + + for (int n_vec = 0; n_vec < num_a_vectors; n_vec++) + { + realcacc[n_vec] = _mm256_setzero_si256(); + imagcacc[n_vec] = _mm256_setzero_si256(); + } + + const __m256i mask_imag = _mm256_set_epi8(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0); + const __m256i mask_real = _mm256_set_epi8(0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255); + + __m128 a, b, two_phase_acc_reg, two_phase_inc_reg; + __m128i c1, c2, result1, result2; + __attribute__((aligned(16))) lv_32fc_t two_phase_inc[2]; + two_phase_inc[0] = phase_inc * phase_inc; + two_phase_inc[1] = phase_inc * phase_inc; + two_phase_inc_reg = _mm_load_ps((float*) two_phase_inc); + __attribute__((aligned(16))) lv_32fc_t two_phase_acc[2]; + two_phase_acc[0] = (*phase); + two_phase_acc[1] = (*phase) * phase_inc; + two_phase_acc_reg = _mm_load_ps((float*) two_phase_acc); + + __m256i a2, b2, c, c_sr, real, imag; + + __m128 yl, yh, tmp1, tmp2, tmp3; + + for(unsigned int number = 0; number < avx2_iters; number++) + { + a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + //complex 32fc multiplication b=a*two_phase_acc_reg + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic + + //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = 
_mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + //next two samples + _in_common += 2; + a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + + //complex 32fc multiplication b=a*two_phase_acc_reg + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic + + //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + // store four output samples + result1 = _mm_packs_epi32(c1, c2);// convert from 32ic to 16ic + _in_common += 2; + a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + //complex 32fc multiplication b=a*two_phase_acc_reg + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic + + //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + //next two samples + _in_common += 2; + a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + __builtin_prefetch(_in_common + 16); + //complex 32fc multiplication b=a*two_phase_acc_reg + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with 
ci,ci,di,di + tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic + + //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + // store four output samples + result2 = _mm_packs_epi32(c1, c2);// convert from 32ic to 16ic + _in_common += 2; + b2 = _mm256_insertf128_si256(_mm256_castsi128_si256(result1), (result2), 1); + for (int n_vec = 0; n_vec < num_a_vectors; n_vec++) + { + a2 = _mm256_load_si256((__m256i*)&(_in_a[n_vec][number * 8])); + + c = _mm256_mullo_epi16(a2, b2); + + c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. + real = _mm256_subs_epi16(c, c_sr); + + c_sr = _mm256_slli_si256(b2, 2); + c = _mm256_mullo_epi16(a2, c_sr); + + c_sr = _mm256_slli_si256(a2, 2); + imag = _mm256_mullo_epi16(b2, c_sr); + + imag = _mm256_adds_epi16(c, imag); + + realcacc[n_vec] = _mm256_adds_epi16(realcacc[n_vec], real); + imagcacc[n_vec] = _mm256_adds_epi16(imagcacc[n_vec], imag); + } + // Regenerate phase + if ((number % 128) == 0) + { + tmp1 = _mm_mul_ps(two_phase_acc_reg, two_phase_acc_reg); + tmp2 = _mm_hadd_ps(tmp1, tmp1); + tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8); + tmp2 = _mm_sqrt_ps(tmp1); + two_phase_acc_reg = _mm_div_ps(two_phase_acc_reg, tmp2); + } + } + + for (int n_vec = 0; n_vec < num_a_vectors; n_vec++) + { + realcacc[n_vec] = _mm256_and_si256(realcacc[n_vec], mask_real); + imagcacc[n_vec] = _mm256_and_si256(imagcacc[n_vec], mask_imag); + + a2 = _mm256_or_si256(realcacc[n_vec], imagcacc[n_vec]); + + _mm256_store_si256((__m256i*)dotProductVector, a2); // Store the results back into the dot product vector + dotProduct = lv_cmake(0,0); + for (int i = 0; i < 8; ++i) + { + dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[i])), + sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i]))); + } + _out[n_vec] = dotProduct; + } + + volk_gnsssdr_free(realcacc); + volk_gnsssdr_free(imagcacc); + _mm256_zeroupper(); + + _mm_store_ps((float*)two_phase_acc, two_phase_acc_reg); + (*phase) = two_phase_acc[0]; + + for(unsigned int n = avx2_iters * 8; n < num_points; n++) + { + tmp16 = in_common[n]; + tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase); + tmp16 = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32))); + (*phase) *= phase_inc; + for (int n_vec = 0; n_vec < num_a_vectors; n_vec++) + { + lv_16sc_t tmp = tmp16 * in_a[n_vec][n]; + _out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), + sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); + } + } + +} +#endif /* LV_HAVE_AVX2 */ + + +#ifdef LV_HAVE_AVX2 +#include + +static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(lv_16sc_t* result, const lv_16sc_t* in_common, 
const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points) +{ + const unsigned int avx2_iters = num_points / 8; + const unsigned int ROTATOR_RELOAD = 128; + + const lv_16sc_t** _in_a = in_a; + const lv_16sc_t* _in_common = in_common; + lv_16sc_t* _out = result; + + lv_16sc_t tmp16; + lv_32fc_t tmp32; + + __VOLK_ATTR_ALIGNED(32) lv_16sc_t dotProductVector[8]; + lv_16sc_t dotProduct = lv_cmake(0,0); + + __m256i* realcacc = (__m256i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256i), volk_gnsssdr_get_alignment()); + __m256i* imagcacc = (__m256i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256i), volk_gnsssdr_get_alignment()); + + for (int n_vec = 0; n_vec < num_a_vectors; n_vec++) + { + realcacc[n_vec] = _mm256_setzero_si256(); + imagcacc[n_vec] = _mm256_setzero_si256(); + } + + const __m256i mask_imag = _mm256_set_epi8(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0); + const __m256i mask_real = _mm256_set_epi8(0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255); + + __m128 a, b, two_phase_acc_reg, two_phase_inc_reg; + __m128i c1, c2, result1, result2; + __attribute__((aligned(16))) lv_32fc_t two_phase_inc[2]; + two_phase_inc[0] = phase_inc * phase_inc; + two_phase_inc[1] = phase_inc * phase_inc; + two_phase_inc_reg = _mm_load_ps((float*) two_phase_inc); + __attribute__((aligned(16))) lv_32fc_t two_phase_acc[2]; + two_phase_acc[0] = (*phase); + two_phase_acc[1] = (*phase) * phase_inc; + two_phase_acc_reg = _mm_load_ps((float*) two_phase_acc); + + __m256i a2, b2, c, c_sr, real, imag; + + __m128 yl, yh, tmp1, tmp2, tmp3; + + for (unsigned int number = 0; number < avx2_iters / ROTATOR_RELOAD; ++number) + { + for (unsigned int j = 0; j < ROTATOR_RELOAD; j++) + { + a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + //complex 32fc multiplication b=a*two_phase_acc_reg + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic + + //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + //next two samples + _in_common += 2; + a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + + //complex 32fc multiplication b=a*two_phase_acc_reg + yl = 
_mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic + + //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + // store four output samples + result1 = _mm_packs_epi32(c1, c2);// convert from 32ic to 16ic + _in_common += 2; + a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + //complex 32fc multiplication b=a*two_phase_acc_reg + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic + + //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + //next two samples + _in_common += 2; + a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + __builtin_prefetch(_in_common + 16); + //complex 32fc multiplication b=a*two_phase_acc_reg + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic + + //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = 
_mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + // store four output samples + result2 = _mm_packs_epi32(c1, c2);// convert from 32ic to 16ic + _in_common += 2; + b2 = _mm256_insertf128_si256(_mm256_castsi128_si256(result1), (result2), 1); + for (int n_vec = 0; n_vec < num_a_vectors; n_vec++) + { + a2 = _mm256_load_si256((__m256i*)&(_in_a[n_vec][(number * ROTATOR_RELOAD + j) * 8])); + + c = _mm256_mullo_epi16(a2, b2); + + c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. + real = _mm256_subs_epi16(c, c_sr); + + c_sr = _mm256_slli_si256(b2, 2); + c = _mm256_mullo_epi16(a2, c_sr); + + c_sr = _mm256_slli_si256(a2, 2); + imag = _mm256_mullo_epi16(b2, c_sr); + + imag = _mm256_adds_epi16(c, imag); + + realcacc[n_vec] = _mm256_adds_epi16(realcacc[n_vec], real); + imagcacc[n_vec] = _mm256_adds_epi16(imagcacc[n_vec], imag); + } + } + // regenerate phase + tmp1 = _mm_mul_ps(two_phase_acc_reg, two_phase_acc_reg); + tmp2 = _mm_hadd_ps(tmp1, tmp1); + tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8); + tmp2 = _mm_sqrt_ps(tmp1); + two_phase_acc_reg = _mm_div_ps(two_phase_acc_reg, tmp2); + } + + for (unsigned int j = 0; j < avx2_iters % ROTATOR_RELOAD; j++) + { + a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + //complex 32fc multiplication b=a*two_phase_acc_reg + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic + + //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + //next two samples + _in_common += 2; + a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + + //complex 32fc multiplication b=a*two_phase_acc_reg + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, 
br*dr-bi*di, bi*dr+br*di + c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic + + //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + // store four output samples + result1 = _mm_packs_epi32(c1, c2);// convert from 32ic to 16ic + _in_common += 2; + a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + //complex 32fc multiplication b=a*two_phase_acc_reg + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic + + //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + //next two samples + _in_common += 2; + a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + __builtin_prefetch(_in_common + 16); + //complex 32fc multiplication b=a*two_phase_acc_reg + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic + + //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + // store four output samples + result2 = 
_mm_packs_epi32(c1, c2);// convert from 32ic to 16ic + _in_common += 2; + b2 = _mm256_insertf128_si256(_mm256_castsi128_si256(result1), (result2), 1); + for (int n_vec = 0; n_vec < num_a_vectors; n_vec++) + { + a2 = _mm256_load_si256((__m256i*)&(_in_a[n_vec][((avx2_iters / ROTATOR_RELOAD) * ROTATOR_RELOAD + j) * 8])); + + c = _mm256_mullo_epi16(a2, b2); + + c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. + real = _mm256_subs_epi16(c, c_sr); + + c_sr = _mm256_slli_si256(b2, 2); + c = _mm256_mullo_epi16(a2, c_sr); + + c_sr = _mm256_slli_si256(a2, 2); + imag = _mm256_mullo_epi16(b2, c_sr); + + imag = _mm256_adds_epi16(c, imag); + + realcacc[n_vec] = _mm256_adds_epi16(realcacc[n_vec], real); + imagcacc[n_vec] = _mm256_adds_epi16(imagcacc[n_vec], imag); + } + } + + for (int n_vec = 0; n_vec < num_a_vectors; n_vec++) + { + realcacc[n_vec] = _mm256_and_si256(realcacc[n_vec], mask_real); + imagcacc[n_vec] = _mm256_and_si256(imagcacc[n_vec], mask_imag); + + a2 = _mm256_or_si256(realcacc[n_vec], imagcacc[n_vec]); + + _mm256_store_si256((__m256i*)dotProductVector, a2); // Store the results back into the dot product vector + dotProduct = lv_cmake(0,0); + for (int i = 0; i < 8; ++i) + { + dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[i])), + sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i]))); + } + _out[n_vec] = dotProduct; + } + _mm256_zeroupper(); + volk_gnsssdr_free(realcacc); + volk_gnsssdr_free(imagcacc); + + _mm_store_ps((float*)two_phase_acc, two_phase_acc_reg); + (*phase) = two_phase_acc[0]; + + for(unsigned int n = avx2_iters * 8; n < num_points; n++) + { + tmp16 = in_common[n]; + tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase); + tmp16 = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32))); + (*phase) *= phase_inc; + for (int n_vec = 0; n_vec < num_a_vectors; n_vec++) + { + lv_16sc_t tmp = tmp16 * in_a[n_vec][n]; + _out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), + sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); + } + } +} + +#endif /* LV_HAVE_AVX2 */ + + #ifdef LV_HAVE_NEON #include @@ -855,9 +1370,6 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon(lv_16sc_t* _phase_real = vsubq_f32(tmp32_real.val[0], tmp32_real.val[1]); _phase_imag = vaddq_f32(tmp32_imag.val[0], tmp32_imag.val[1]); - vst1q_f32((float32_t*)__phase_real, _phase_real); - vst1q_f32((float32_t*)__phase_imag, _phase_imag); - for (int n_vec = 0; n_vec < num_a_vectors; n_vec++) { a_val = vld2_s16((int16_t*)&(_in_a[n_vec][number*4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg @@ -1069,9 +1581,6 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_vma(lv_16s } - vst1q_f32((float32_t*)__phase_real, _phase_real); - vst1q_f32((float32_t*)__phase_imag, _phase_imag); - for (int n_vec = 0; n_vec < num_a_vectors; n_vec++) { a_val = vld2_s16((int16_t*)&(_in_a[n_vec][number*4])); @@ -1250,9 +1759,6 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_optvma(lv_ _phase_imag = vld1q_f32(____phase_imag); } - vst1q_f32((float32_t*)__phase_real, _phase_real); - vst1q_f32((float32_t*)__phase_imag, _phase_imag); - for (int n_vec = 0; n_vec < num_a_vectors; n_vec++) { a_val = vld2_s16((int16_t*)&(_in_a[n_vec][number*4])); diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic.h 
b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic.h index d5e8c1e0c..7327812c2 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic.h @@ -193,6 +193,130 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_u_sse3(lv_1 #endif // SSE3 +#ifdef LV_HAVE_AVX2 +static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_a_avx2(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points) +{ + // phases must be normalized. Phase rotator expects a complex exponential input! + float rem_carrier_phase_in_rad = 0.345; + float phase_step_rad = 0.1; + lv_32fc_t phase[1]; + phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad)); + lv_32fc_t phase_inc[1]; + phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad)); + + int num_a_vectors = 3; + lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); + for(unsigned int n = 0; n < num_a_vectors; n++) + { + in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); + memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points); + } + + volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2(result, local_code, phase_inc[0], phase, (const lv_16sc_t**) in_a, num_a_vectors, num_points); + + for(unsigned int n = 0; n < num_a_vectors; n++) + { + volk_gnsssdr_free(in_a[n]); + } + volk_gnsssdr_free(in_a); +} + +#endif // AVX2 + + +#ifdef LV_HAVE_AVX2 +static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_a_avx2_reload(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points) +{ + // phases must be normalized. Phase rotator expects a complex exponential input! + float rem_carrier_phase_in_rad = 0.345; + float phase_step_rad = 0.1; + lv_32fc_t phase[1]; + phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad)); + lv_32fc_t phase_inc[1]; + phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad)); + + int num_a_vectors = 3; + lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); + for(unsigned int n = 0; n < num_a_vectors; n++) + { + in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); + memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points); + } + + volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(result, local_code, phase_inc[0], phase, (const lv_16sc_t**) in_a, num_a_vectors, num_points); + + for(unsigned int n = 0; n < num_a_vectors; n++) + { + volk_gnsssdr_free(in_a[n]); + } + volk_gnsssdr_free(in_a); +} + +#endif // AVX2 + + +#ifdef LV_HAVE_AVX2 +static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_u_avx2(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points) +{ + // phases must be normalized. Phase rotator expects a complex exponential input! 
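As the comment notes, these rotator kernels take the phase and its per-sample increment as unit-modulus complex exponentials, not as raw angles. A caller would typically build them as below (an illustrative sketch; `lv_32fc_t` and `lv_cmake` come from the module's complex-type header):

    #include <math.h>
    #include <volk_gnsssdr/volk_gnsssdr_complex.h>   /* lv_32fc_t, lv_cmake */

    /* Map an angle in radians to the complex exponential exp(j*theta) expected
     * by the rotator: cos(theta) + j*sin(theta), which has modulus 1. */
    static lv_32fc_t phase_from_rad(float theta_rad)
    {
        return lv_cmake(cosf(theta_rad), sinf(theta_rad));
    }

The puppet below does exactly this with fixed test angles (a 0.345 rad offset and a 0.1 rad/sample step) before handing three copies of the input to the AVX2 kernel.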
+ float rem_carrier_phase_in_rad = 0.345; + float phase_step_rad = 0.1; + lv_32fc_t phase[1]; + phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad)); + lv_32fc_t phase_inc[1]; + phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad)); + + int num_a_vectors = 3; + lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); + for(unsigned int n = 0; n < num_a_vectors; n++) + { + in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); + memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points); + } + + volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2(result, local_code, phase_inc[0], phase, (const lv_16sc_t**) in_a, num_a_vectors, num_points); + + for(unsigned int n = 0; n < num_a_vectors; n++) + { + volk_gnsssdr_free(in_a[n]); + } + volk_gnsssdr_free(in_a); +} + +#endif // AVX2 + + +#ifdef LV_HAVE_AVX2 +static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_u_avx2_reload(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points) +{ + // phases must be normalized. Phase rotator expects a complex exponential input! + float rem_carrier_phase_in_rad = 0.345; + float phase_step_rad = 0.1; + lv_32fc_t phase[1]; + phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad)); + lv_32fc_t phase_inc[1]; + phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad)); + + int num_a_vectors = 3; + lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); + for(unsigned int n = 0; n < num_a_vectors; n++) + { + in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); + memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points); + } + + volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(result, local_code, phase_inc[0], phase, (const lv_16sc_t**) in_a, num_a_vectors, num_points); + + for(unsigned int n = 0; n < num_a_vectors; n++) + { + volk_gnsssdr_free(in_a[n]); + } + volk_gnsssdr_free(in_a); +} + +#endif // AVX2 + + #ifdef LV_HAVE_NEON static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_neon(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points) { diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_sincos_32fc.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_sincos_32fc.h index 32532f4ab..5a007d455 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_sincos_32fc.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_sincos_32fc.h @@ -37,7 +37,8 @@ * * \b Overview * - * Computes the sine and cosine of a vector of floats, providing the output in a complex vector (cosine, sine) + * VOLK_GNSSSDR kernel that computes the sine and cosine of a vector + * of floats, providing the output in a complex vector (cosine, sine) * * Dispatcher Prototype * \code @@ -133,12 +134,13 @@ static inline void volk_gnsssdr_32f_sincos_32fc_u_sse4_1(lv_32fc_t* out, const f cosine = _mm_sub_ps(cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3)); cplxValue = _mm_unpacklo_ps(cosine, sine); - _mm_storeu_ps((float*)bPtr, cplxValue); bPtr += 2; + cplxValue = _mm_unpackhi_ps(cosine, sine); 
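This fix (applied to both the unaligned and aligned SSE4.1 paths) inserts the missing `_mm_unpackhi_ps(cosine, sine)` between the two stores, so every iteration now writes all four (cosine, sine) pairs instead of storing the low half twice. A standalone sketch of the interleave (not part of the kernel):

    #include <xmmintrin.h>   /* _mm_unpacklo_ps / _mm_unpackhi_ps are plain SSE */

    /* Interleave four cosines and four sines into four (cosine, sine) pairs,
     * i.e. the memory layout c0 s0 c1 s1 c2 s2 c3 s3 used for complex output. */
    static void interleave_cos_sin(const float cos4[4], const float sin4[4], float out[8])
    {
        const __m128 c = _mm_loadu_ps(cos4);            /* c0 c1 c2 c3 */
        const __m128 s = _mm_loadu_ps(sin4);            /* s0 s1 s2 s3 */
        _mm_storeu_ps(out,     _mm_unpacklo_ps(c, s));  /* c0 s0 c1 s1 */
        _mm_storeu_ps(out + 4, _mm_unpackhi_ps(c, s));  /* c2 s2 c3 s3 */
    }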
_mm_storeu_ps((float*)bPtr, cplxValue); bPtr += 2; + aPtr += 4; } @@ -226,12 +228,13 @@ static inline void volk_gnsssdr_32f_sincos_32fc_a_sse4_1(lv_32fc_t* out, const f cosine = _mm_sub_ps(cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3)); cplxValue = _mm_unpacklo_ps(cosine, sine); - _mm_store_ps((float*)bPtr, cplxValue); bPtr += 2; + cplxValue = _mm_unpackhi_ps(cosine, sine); _mm_store_ps((float*)bPtr, cplxValue); bPtr += 2; + aPtr += 4; } @@ -587,7 +590,7 @@ static inline void volk_gnsssdr_32f_sincos_32fc_generic_fxpt(lv_32fc_t* out, con _in = *in++; d = (int32_t)floor(_in / TWO_PI + 0.5); _in -= d * TWO_PI; - x = (int32_t) ((float) _in * TWO_TO_THE_31_DIV_PI); + x = (int32_t) ((float)_in * TWO_TO_THE_31_DIV_PI); ux = x; sin_index = ux >> diffbits; diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn.h new file mode 100644 index 000000000..ef00e5db2 --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn.h @@ -0,0 +1,769 @@ +/*! + * \file volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn.h + * \brief VOLK_GNSSSDR kernel: multiplies N complex (32-bit float per component) vectors + * by a common vector, phase rotated and accumulates the results in N float complex outputs. + * \authors
<ul>
+ *          <li> Carles Fernandez-Prades, 2016. cfernandez(at)cttc.es
+ *          </ul>
+ * + * VOLK_GNSSSDR kernel that multiplies N 32 bits complex vectors by a common vector, which is + * phase-rotated by phase offset and phase increment, and accumulates the results + * in N 32 bits float complex outputs. + * It is optimized to perform the N tap correlation process in GNSS receivers. + * + * ------------------------------------------------------------------------- + * + * Copyright (C) 2010-2015 (see AUTHORS file for a list of contributors) + * + * GNSS-SDR is a software defined Global Navigation + * Satellite Systems receiver + * + * This file is part of GNSS-SDR. + * + * GNSS-SDR is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * GNSS-SDR is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNSS-SDR. If not, see . + * + * ------------------------------------------------------------------------- + */ + +/*! + * \page volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn + * + * \b Overview + * + * Rotates and multiplies the reference complex vector with an arbitrary number of other complex vectors, + * accumulates the results and stores them in the output vector. + * The rotation is done at a fixed rate per sample, from an initial \p phase offset. + * This function can be used for Doppler wipe-off and multiple correlator. + * + * Dispatcher Prototype + * \code + * void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn(lv_32fc_t* result, const lv_32fc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_32fc_t** in_a, int num_a_vectors, unsigned int num_points); + * \endcode + * + * \b Inputs + * \li in_common: Pointer to one of the vectors to be rotated, multiplied and accumulated (reference vector). + * \li phase_inc: Phase increment = lv_cmake(cos(phase_step_rad), sin(phase_step_rad)) + * \li phase: Initial phase = lv_cmake(cos(initial_phase_rad), sin(initial_phase_rad)) + * \li in_a: Pointer to an array of pointers to multiple vectors to be multiplied and accumulated. + * \li num_a_vectors: Number of vectors to be multiplied by the reference vector and accumulated. + * \li num_points: Number of complex values to be multiplied together, accumulated and stored into \p result. + * + * \b Outputs + * \li phase: Final phase. + * \li result: Vector of \p num_a_vectors components with the multiple vectors of \p in_a rotated, multiplied by \p in_common and accumulated. 
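The documented dispatcher maps directly onto a multi-correlator: one common baseband vector is rotated once and correlated against several local replicas. A usage sketch under assumed names (the correlator buffers, the angles, and the umbrella include are placeholders, not taken from this patch):

    #include <math.h>
    #include <volk_gnsssdr/volk_gnsssdr.h>   /* assumed umbrella header exposing the dispatcher */

    /* Correlate one phase-rotated input against three local replicas (E, P, L). */
    static void correlate_epl(lv_32fc_t out[3], const lv_32fc_t* baseband,
                              const lv_32fc_t* early_code, const lv_32fc_t* prompt_code,
                              const lv_32fc_t* late_code,
                              float initial_phase_rad, float phase_step_rad,
                              unsigned int num_points)
    {
        lv_32fc_t phase = lv_cmake(cosf(initial_phase_rad), sinf(initial_phase_rad));
        const lv_32fc_t phase_inc = lv_cmake(cosf(phase_step_rad), sinf(phase_step_rad));
        const lv_32fc_t* codes[3] = { early_code, prompt_code, late_code };
        volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn(out, baseband, phase_inc, &phase,
                                                      codes, 3, num_points);
        /* On return, phase holds the final rotator phase; the kernel renormalizes it
         * periodically (every 256 samples in the generic path) so its modulus stays 1. */
    }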
+ * + */ + +#ifndef INCLUDED_volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_H +#define INCLUDED_volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_H + + +#include +#include +#include +#include +#include +#include + +#ifdef LV_HAVE_GENERIC + +static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_generic(lv_32fc_t* result, const lv_32fc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_32fc_t** in_a, int num_a_vectors, unsigned int num_points) +{ + lv_32fc_t tmp32_1, tmp32_2; + for (int n_vec = 0; n_vec < num_a_vectors; n_vec++) + { + result[n_vec] = lv_cmake(0,0); + } + for (unsigned int n = 0; n < num_points; n++) + { + tmp32_1 = *in_common++ * (*phase);//if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); + + // Regenerate phase + if (n % 256 == 0) + { + //printf("Phase before regeneration %i: %f,%f Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase)); +#ifdef __cplusplus + (*phase) /= std::abs((*phase)); +#else + (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); +#endif + //printf("Phase after regeneration %i: %f,%f Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase)); + } + + (*phase) *= phase_inc; + for (int n_vec = 0; n_vec < num_a_vectors; n_vec++) + { + tmp32_2 = tmp32_1 * in_a[n_vec][n]; + result[n_vec] += tmp32_2; + } + } +} + +#endif /*LV_HAVE_GENERIC*/ + + +#ifdef LV_HAVE_GENERIC + +static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_generic_reload(lv_32fc_t* result, const lv_32fc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_32fc_t** in_a, int num_a_vectors, unsigned int num_points) +{ + lv_32fc_t tmp32_1, tmp32_2; + const unsigned int ROTATOR_RELOAD = 256; + for (int n_vec = 0; n_vec < num_a_vectors; n_vec++) + { + result[n_vec] = lv_cmake(0,0); + } + + for (unsigned int n = 0; n < num_points / ROTATOR_RELOAD; n++) + { + for (unsigned int j = 0; j < ROTATOR_RELOAD; j++) + { + tmp32_1 = *in_common++ * (*phase); + (*phase) *= phase_inc; + for (int n_vec = 0; n_vec < num_a_vectors; n_vec++) + { + tmp32_2 = tmp32_1 * in_a[n_vec][n * ROTATOR_RELOAD + j]; + result[n_vec] += tmp32_2; + } + } + /* Regenerate phase */ +#ifdef __cplusplus + (*phase) /= std::abs((*phase)); +#else + //(*phase) /= cabsf((*phase)); + (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase)); +#endif + } + + for (unsigned int j = 0; j < num_points % ROTATOR_RELOAD; j++) + { + tmp32_1 = *in_common++ * (*phase); + (*phase) *= phase_inc; + for (int n_vec = 0; n_vec < num_a_vectors; n_vec++) + { + tmp32_2 = tmp32_1 * in_a[n_vec][(num_points / ROTATOR_RELOAD) * ROTATOR_RELOAD + j]; + result[n_vec] += tmp32_2; + } + } +} + +#endif /*LV_HAVE_GENERIC*/ + + +#ifdef LV_HAVE_SSE3 +#include +static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_sse3(lv_32fc_t* result, const lv_32fc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_32fc_t** in_a, int num_a_vectors, unsigned int num_points) +{ + lv_32fc_t dotProduct = lv_cmake(0,0); + lv_32fc_t tmp32_1, tmp32_2; + const unsigned int sse_iters = num_points / 2; + + const lv_32fc_t** _in_a = in_a; + const lv_32fc_t* _in_common = in_common; + + __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2]; + + __m128* acc = (__m128*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128), volk_gnsssdr_get_alignment()); + + for (int n_vec = 0; n_vec < num_a_vectors; n_vec++) + { + acc[n_vec] = _mm_setzero_ps(); + } + + // phase rotation registers + __m128 a, two_phase_acc_reg, two_phase_inc_reg, yl, yh, tmp1, tmp1p, tmp2, 
tmp2p, z1; + + __attribute__((aligned(16))) lv_32fc_t two_phase_inc[2]; + two_phase_inc[0] = phase_inc * phase_inc; + two_phase_inc[1] = phase_inc * phase_inc; + two_phase_inc_reg = _mm_load_ps((float*) two_phase_inc); + __attribute__((aligned(16))) lv_32fc_t two_phase_acc[2]; + two_phase_acc[0] = (*phase); + two_phase_acc[1] = (*phase) * phase_inc; + two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc); + + const __m128 ylp = _mm_moveldup_ps(two_phase_inc_reg); + const __m128 yhp = _mm_movehdup_ps(two_phase_inc_reg); + + for(unsigned int number = 0; number < sse_iters; number++) + { + // Phase rotation on operand in_common starts here: + a = _mm_loadu_ps((float*)_in_common); + // __builtin_prefetch(_in_common + 4); + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); + tmp1 = _mm_mul_ps(a, yl); + tmp1p = _mm_mul_ps(two_phase_acc_reg, ylp); + a = _mm_shuffle_ps(a, a, 0xB1); + two_phase_acc_reg = _mm_shuffle_ps(two_phase_acc_reg, two_phase_acc_reg, 0xB1); + tmp2 = _mm_mul_ps(a, yh); + tmp2p = _mm_mul_ps(two_phase_acc_reg, yhp); + z1 = _mm_addsub_ps(tmp1, tmp2); + two_phase_acc_reg = _mm_addsub_ps(tmp1p, tmp2p); + + yl = _mm_moveldup_ps(z1); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(z1); + + //next two samples + _in_common += 2; + + for (int n_vec = 0; n_vec < num_a_vectors; n_vec++) + { + a = _mm_loadu_ps((float*)&(_in_a[n_vec][number*2])); + tmp1 = _mm_mul_ps(a, yl); + a = _mm_shuffle_ps(a, a, 0xB1); + tmp2 = _mm_mul_ps(a, yh); + z1 = _mm_addsub_ps(tmp1, tmp2); + acc[n_vec] = _mm_add_ps(acc[n_vec], z1); + } + // Regenerate phase + if ((number % 128) == 0) + { + tmp1 = _mm_mul_ps(two_phase_acc_reg, two_phase_acc_reg); + tmp2 = _mm_hadd_ps(tmp1, tmp1); + tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8); + tmp2 = _mm_sqrt_ps(tmp1); + two_phase_acc_reg = _mm_div_ps(two_phase_acc_reg, tmp2); + } + } + + for (int n_vec = 0; n_vec < num_a_vectors; n_vec++) + { + _mm_store_ps((float*)dotProductVector, acc[n_vec]); // Store the results back into the dot product vector + dotProduct = lv_cmake(0,0); + for (int i = 0; i < 2; ++i) + { + dotProduct = dotProduct + dotProductVector[i]; + } + result[n_vec] = dotProduct; + } + volk_gnsssdr_free(acc); + + _mm_store_ps((float*)two_phase_acc, two_phase_acc_reg); + (*phase) = two_phase_acc[0]; + + for(unsigned int n = sse_iters * 2; n < num_points; n++) + { + tmp32_1 = in_common[n] * (*phase); + (*phase) *= phase_inc; + for (int n_vec = 0; n_vec < num_a_vectors; n_vec++) + { + tmp32_2 = tmp32_1 * in_a[n_vec][n]; + result[n_vec] += tmp32_2; + } + } +} +#endif /* LV_HAVE_SSE3 */ + + +#ifdef LV_HAVE_SSE3 +#include +static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_sse3(lv_32fc_t* result, const lv_32fc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_32fc_t** in_a, int num_a_vectors, unsigned int num_points) +{ + lv_32fc_t dotProduct = lv_cmake(0,0); + lv_32fc_t tmp32_1, tmp32_2; + const unsigned int sse_iters = num_points / 2; + + const lv_32fc_t** _in_a = in_a; + const lv_32fc_t* _in_common = in_common; + + __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2]; + + __m128* acc = (__m128*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128), volk_gnsssdr_get_alignment()); + + for (int n_vec = 0; n_vec < num_a_vectors; n_vec++) + { + acc[n_vec] = _mm_setzero_ps(); + } + + // phase rotation registers + __m128 a, two_phase_acc_reg, two_phase_inc_reg, yl, yh, tmp1, tmp1p, tmp2, tmp2p, z1; + + __attribute__((aligned(16))) lv_32fc_t two_phase_inc[2]; + two_phase_inc[0] = 
phase_inc * phase_inc; + two_phase_inc[1] = phase_inc * phase_inc; + two_phase_inc_reg = _mm_load_ps((float*) two_phase_inc); + __attribute__((aligned(16))) lv_32fc_t two_phase_acc[2]; + two_phase_acc[0] = (*phase); + two_phase_acc[1] = (*phase) * phase_inc; + two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc); + + const __m128 ylp = _mm_moveldup_ps(two_phase_inc_reg); + const __m128 yhp = _mm_movehdup_ps(two_phase_inc_reg); + + for(unsigned int number = 0; number < sse_iters; number++) + { + // Phase rotation on operand in_common starts here: + a = _mm_load_ps((float*)_in_common); + // __builtin_prefetch(_in_common + 4); + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); + tmp1 = _mm_mul_ps(a, yl); + tmp1p = _mm_mul_ps(two_phase_acc_reg, ylp); + a = _mm_shuffle_ps(a, a, 0xB1); + two_phase_acc_reg = _mm_shuffle_ps(two_phase_acc_reg, two_phase_acc_reg, 0xB1); + tmp2 = _mm_mul_ps(a, yh); + tmp2p = _mm_mul_ps(two_phase_acc_reg, yhp); + z1 = _mm_addsub_ps(tmp1, tmp2); + two_phase_acc_reg = _mm_addsub_ps(tmp1p, tmp2p); + + yl = _mm_moveldup_ps(z1); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(z1); + + //next two samples + _in_common += 2; + + for (int n_vec = 0; n_vec < num_a_vectors; n_vec++) + { + a = _mm_load_ps((float*)&(_in_a[n_vec][number*2])); + tmp1 = _mm_mul_ps(a, yl); + a = _mm_shuffle_ps(a, a, 0xB1); + tmp2 = _mm_mul_ps(a, yh); + z1 = _mm_addsub_ps(tmp1, tmp2); + acc[n_vec] = _mm_add_ps(acc[n_vec], z1); + } + // Regenerate phase + if ((number % 128) == 0) + { + tmp1 = _mm_mul_ps(two_phase_acc_reg, two_phase_acc_reg); + tmp2 = _mm_hadd_ps(tmp1, tmp1); + tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8); + tmp2 = _mm_sqrt_ps(tmp1); + two_phase_acc_reg = _mm_div_ps(two_phase_acc_reg, tmp2); + } + } + + for (int n_vec = 0; n_vec < num_a_vectors; n_vec++) + { + _mm_store_ps((float*)dotProductVector, acc[n_vec]); // Store the results back into the dot product vector + dotProduct = lv_cmake(0,0); + for (int i = 0; i < 2; ++i) + { + dotProduct = dotProduct + dotProductVector[i]; + } + result[n_vec] = dotProduct; + } + volk_gnsssdr_free(acc); + + _mm_store_ps((float*)two_phase_acc, two_phase_acc_reg); + (*phase) = two_phase_acc[0]; + + for(unsigned int n = sse_iters * 2; n < num_points; n++) + { + tmp32_1 = in_common[n] * (*phase); + (*phase) *= phase_inc; + for (int n_vec = 0; n_vec < num_a_vectors; n_vec++) + { + tmp32_2 = tmp32_1 * in_a[n_vec][n]; + result[n_vec] += tmp32_2; + } + } +} +#endif /* LV_HAVE_SSE3 */ + + +#ifdef LV_HAVE_AVX +#include +static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_t* result, const lv_32fc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_32fc_t** in_a, int num_a_vectors, unsigned int num_points) +{ + lv_32fc_t dotProduct = lv_cmake(0,0); + lv_32fc_t tmp32_1, tmp32_2; + const unsigned int avx_iters = num_points / 4; + + const lv_32fc_t** _in_a = in_a; + const lv_32fc_t* _in_common = in_common; + lv_32fc_t _phase = (*phase); + + __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4]; + + __m256* acc = (__m256*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256), volk_gnsssdr_get_alignment()); + + for (int n_vec = 0; n_vec < num_a_vectors; n_vec++) + { + acc[n_vec] = _mm256_setzero_ps(); + result[n_vec] = lv_cmake(0, 0); + } + + // phase rotation registers + __m256 a, four_phase_acc_reg, yl, yh, tmp1, tmp1p, tmp2, tmp2p, z; + + __attribute__((aligned(32))) lv_32fc_t four_phase_inc[4]; + const lv_32fc_t phase_inc2 = phase_inc * phase_inc; + const lv_32fc_t 
phase_inc3 = phase_inc2 * phase_inc; + const lv_32fc_t phase_inc4 = phase_inc3 * phase_inc; + four_phase_inc[0] = phase_inc4; + four_phase_inc[1] = phase_inc4; + four_phase_inc[2] = phase_inc4; + four_phase_inc[3] = phase_inc4; + const __m256 four_phase_inc_reg = _mm256_load_ps((float*)four_phase_inc); + + __attribute__((aligned(32))) lv_32fc_t four_phase_acc[4]; + four_phase_acc[0] = _phase; + four_phase_acc[1] = _phase * phase_inc; + four_phase_acc[2] = _phase * phase_inc2; + four_phase_acc[3] = _phase * phase_inc3; + four_phase_acc_reg = _mm256_load_ps((float*)four_phase_acc); + + const __m256 ylp = _mm256_moveldup_ps(four_phase_inc_reg); + const __m256 yhp = _mm256_movehdup_ps(four_phase_inc_reg); + + for(unsigned int number = 0; number < avx_iters; number++) + { + // Phase rotation on operand in_common starts here: + a = _mm256_loadu_ps((float*)_in_common); + __builtin_prefetch(_in_common + 16); + yl = _mm256_moveldup_ps(four_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm256_movehdup_ps(four_phase_acc_reg); + tmp1 = _mm256_mul_ps(a, yl); + tmp1p = _mm256_mul_ps(four_phase_acc_reg, ylp); + a = _mm256_shuffle_ps(a, a, 0xB1); + four_phase_acc_reg = _mm256_shuffle_ps(four_phase_acc_reg, four_phase_acc_reg, 0xB1); + tmp2 = _mm256_mul_ps(a, yh); + tmp2p = _mm256_mul_ps(four_phase_acc_reg, yhp); + z = _mm256_addsub_ps(tmp1, tmp2); + four_phase_acc_reg = _mm256_addsub_ps(tmp1p, tmp2p); + + yl = _mm256_moveldup_ps(z); // Load yl with cr,cr,dr,dr + yh = _mm256_movehdup_ps(z); + + //next two samples + _in_common += 4; + + for (int n_vec = 0; n_vec < num_a_vectors; n_vec++) + { + a = _mm256_loadu_ps((float*)&(_in_a[n_vec][number * 4])); + tmp1 = _mm256_mul_ps(a, yl); + a = _mm256_shuffle_ps(a, a, 0xB1); + tmp2 = _mm256_mul_ps(a, yh); + z = _mm256_addsub_ps(tmp1, tmp2); + acc[n_vec] = _mm256_add_ps(acc[n_vec], z); + } + // Regenerate phase + if ((number % 128) == 0) + { + tmp1 = _mm256_mul_ps(four_phase_acc_reg, four_phase_acc_reg); + tmp2 = _mm256_hadd_ps(tmp1, tmp1); + tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8); + tmp2 = _mm256_sqrt_ps(tmp1); + four_phase_acc_reg = _mm256_div_ps(four_phase_acc_reg, tmp2); + } + } + + for (int n_vec = 0; n_vec < num_a_vectors; n_vec++) + { + _mm256_store_ps((float*)dotProductVector, acc[n_vec]); // Store the results back into the dot product vector + dotProduct = lv_cmake(0,0); + for (int i = 0; i < 4; ++i) + { + dotProduct = dotProduct + dotProductVector[i]; + } + result[n_vec] = dotProduct; + } + volk_gnsssdr_free(acc); + + tmp1 = _mm256_mul_ps(four_phase_acc_reg, four_phase_acc_reg); + tmp2 = _mm256_hadd_ps(tmp1, tmp1); + tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8); + tmp2 = _mm256_sqrt_ps(tmp1); + four_phase_acc_reg = _mm256_div_ps(four_phase_acc_reg, tmp2); + + _mm256_store_ps((float*)four_phase_acc, four_phase_acc_reg); + _phase = four_phase_acc[0]; + _mm256_zeroupper(); + + for(unsigned int n = avx_iters * 4; n < num_points; n++) + { + tmp32_1 = *_in_common++ * _phase; + _phase *= phase_inc; + for (int n_vec = 0; n_vec < num_a_vectors; n_vec++) + { + tmp32_2 = tmp32_1 * _in_a[n_vec][n]; + result[n_vec] += tmp32_2; + } + } + (*phase) = _phase; +} +#endif /* LV_HAVE_AVX */ + + +#ifdef LV_HAVE_AVX +#include +static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_t* result, const lv_32fc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_32fc_t** in_a, int num_a_vectors, unsigned int num_points) +{ + lv_32fc_t dotProduct = lv_cmake(0,0); + lv_32fc_t tmp32_1, tmp32_2; + const unsigned int avx_iters = num_points / 
4; + + const lv_32fc_t** _in_a = in_a; + const lv_32fc_t* _in_common = in_common; + lv_32fc_t _phase = (*phase); + + __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4]; + + __m256* acc = (__m256*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256), volk_gnsssdr_get_alignment()); + + for (int n_vec = 0; n_vec < num_a_vectors; n_vec++) + { + acc[n_vec] = _mm256_setzero_ps(); + result[n_vec] = lv_cmake(0, 0); + } + + // phase rotation registers + __m256 a, four_phase_acc_reg, yl, yh, tmp1, tmp1p, tmp2, tmp2p, z; + + __attribute__((aligned(32))) lv_32fc_t four_phase_inc[4]; + const lv_32fc_t phase_inc2 = phase_inc * phase_inc; + const lv_32fc_t phase_inc3 = phase_inc2 * phase_inc; + const lv_32fc_t phase_inc4 = phase_inc3 * phase_inc; + four_phase_inc[0] = phase_inc4; + four_phase_inc[1] = phase_inc4; + four_phase_inc[2] = phase_inc4; + four_phase_inc[3] = phase_inc4; + const __m256 four_phase_inc_reg = _mm256_load_ps((float*)four_phase_inc); + + __attribute__((aligned(32))) lv_32fc_t four_phase_acc[4]; + four_phase_acc[0] = _phase; + four_phase_acc[1] = _phase * phase_inc; + four_phase_acc[2] = _phase * phase_inc2; + four_phase_acc[3] = _phase * phase_inc3; + four_phase_acc_reg = _mm256_load_ps((float*)four_phase_acc); + + const __m256 ylp = _mm256_moveldup_ps(four_phase_inc_reg); + const __m256 yhp = _mm256_movehdup_ps(four_phase_inc_reg); + + for(unsigned int number = 0; number < avx_iters; number++) + { + // Phase rotation on operand in_common starts here: + a = _mm256_load_ps((float*)_in_common); + __builtin_prefetch(_in_common + 16); + yl = _mm256_moveldup_ps(four_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm256_movehdup_ps(four_phase_acc_reg); + tmp1 = _mm256_mul_ps(a, yl); + tmp1p = _mm256_mul_ps(four_phase_acc_reg, ylp); + a = _mm256_shuffle_ps(a, a, 0xB1); + four_phase_acc_reg = _mm256_shuffle_ps(four_phase_acc_reg, four_phase_acc_reg, 0xB1); + tmp2 = _mm256_mul_ps(a, yh); + tmp2p = _mm256_mul_ps(four_phase_acc_reg, yhp); + z = _mm256_addsub_ps(tmp1, tmp2); + four_phase_acc_reg = _mm256_addsub_ps(tmp1p, tmp2p); + + yl = _mm256_moveldup_ps(z); // Load yl with cr,cr,dr,dr + yh = _mm256_movehdup_ps(z); + + //next two samples + _in_common += 4; + + for (int n_vec = 0; n_vec < num_a_vectors; n_vec++) + { + a = _mm256_load_ps((float*)&(_in_a[n_vec][number * 4])); + tmp1 = _mm256_mul_ps(a, yl); + a = _mm256_shuffle_ps(a, a, 0xB1); + tmp2 = _mm256_mul_ps(a, yh); + z = _mm256_addsub_ps(tmp1, tmp2); + acc[n_vec] = _mm256_add_ps(acc[n_vec], z); + } + // Regenerate phase + if ((number % 128) == 0) + { + tmp1 = _mm256_mul_ps(four_phase_acc_reg, four_phase_acc_reg); + tmp2 = _mm256_hadd_ps(tmp1, tmp1); + tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8); + tmp2 = _mm256_sqrt_ps(tmp1); + four_phase_acc_reg = _mm256_div_ps(four_phase_acc_reg, tmp2); + } + } + + for (int n_vec = 0; n_vec < num_a_vectors; n_vec++) + { + _mm256_store_ps((float*)dotProductVector, acc[n_vec]); // Store the results back into the dot product vector + dotProduct = lv_cmake(0,0); + for (int i = 0; i < 4; ++i) + { + dotProduct = dotProduct + dotProductVector[i]; + } + result[n_vec] = dotProduct; + } + volk_gnsssdr_free(acc); + + tmp1 = _mm256_mul_ps(four_phase_acc_reg, four_phase_acc_reg); + tmp2 = _mm256_hadd_ps(tmp1, tmp1); + tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8); + tmp2 = _mm256_sqrt_ps(tmp1); + four_phase_acc_reg = _mm256_div_ps(four_phase_acc_reg, tmp2); + + _mm256_store_ps((float*)four_phase_acc, four_phase_acc_reg); + _phase = four_phase_acc[0]; + _mm256_zeroupper(); + + for(unsigned int n = avx_iters * 4; n < 
num_points; n++) + { + tmp32_1 = *_in_common++ * _phase; + _phase *= phase_inc; + for (int n_vec = 0; n_vec < num_a_vectors; n_vec++) + { + tmp32_2 = tmp32_1 * _in_a[n_vec][n]; + result[n_vec] += tmp32_2; + } + } + (*phase) = _phase; +} +#endif /* LV_HAVE_AVX */ + + +#ifdef LV_HAVE_NEON +#include + +static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_neon(lv_32fc_t* result, const lv_32fc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_32fc_t** in_a, int num_a_vectors, unsigned int num_points) +{ + const unsigned int neon_iters = num_points / 4; + + const lv_32fc_t** _in_a = in_a; + const lv_32fc_t* _in_common = in_common; + lv_32fc_t* _out = result; + + lv_32fc_t _phase = (*phase); + lv_32fc_t tmp32_1, tmp32_2; + + if (neon_iters > 0) + { + lv_32fc_t dotProduct = lv_cmake(0,0); + float32_t arg_phase0 = cargf(_phase); + float32_t arg_phase_inc = cargf(phase_inc); + float32_t phase_est; + + lv_32fc_t ___phase4 = phase_inc * phase_inc * phase_inc * phase_inc; + __VOLK_ATTR_ALIGNED(16) float32_t __phase4_real[4] = { lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4) }; + __VOLK_ATTR_ALIGNED(16) float32_t __phase4_imag[4] = { lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4) }; + + float32x4_t _phase4_real = vld1q_f32(__phase4_real); + float32x4_t _phase4_imag = vld1q_f32(__phase4_imag); + + lv_32fc_t phase2 = (lv_32fc_t)(_phase) * phase_inc; + lv_32fc_t phase3 = phase2 * phase_inc; + lv_32fc_t phase4 = phase3 * phase_inc; + + __VOLK_ATTR_ALIGNED(16) float32_t __phase_real[4] = { lv_creal((_phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) }; + __VOLK_ATTR_ALIGNED(16) float32_t __phase_imag[4] = { lv_cimag((_phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) }; + + float32x4_t _phase_real = vld1q_f32(__phase_real); + float32x4_t _phase_imag = vld1q_f32(__phase_imag); + + __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4]; + + float32x4x2_t a_val, b_val, tmp32_real, tmp32_imag; + + float32x4x2_t* accumulator1 = (float32x4x2_t*)volk_gnsssdr_malloc(num_a_vectors * sizeof(float32x4x2_t), volk_gnsssdr_get_alignment()); + float32x4x2_t* accumulator2 = (float32x4x2_t*)volk_gnsssdr_malloc(num_a_vectors * sizeof(float32x4x2_t), volk_gnsssdr_get_alignment()); + + for(int n_vec = 0; n_vec < num_a_vectors; n_vec++) + { + accumulator1[n_vec].val[0] = vdupq_n_f32(0.0f); + accumulator1[n_vec].val[1] = vdupq_n_f32(0.0f); + accumulator2[n_vec].val[0] = vdupq_n_f32(0.0f); + accumulator2[n_vec].val[1] = vdupq_n_f32(0.0f); + } + + for(unsigned int number = 0; number < neon_iters; number++) + { + /* load 4 complex numbers (float 32 bits each component) */ + b_val = vld2q_f32((float32_t*)_in_common); + __builtin_prefetch(_in_common + 8); + _in_common += 4; + + /* complex multiplication of four complex samples (float 32 bits each component) */ + tmp32_real.val[0] = vmulq_f32(b_val.val[0], _phase_real); + tmp32_real.val[1] = vmulq_f32(b_val.val[1], _phase_imag); + tmp32_imag.val[0] = vmulq_f32(b_val.val[0], _phase_imag); + tmp32_imag.val[1] = vmulq_f32(b_val.val[1], _phase_real); + + b_val.val[0] = vsubq_f32(tmp32_real.val[0], tmp32_real.val[1]); + b_val.val[1] = vaddq_f32(tmp32_imag.val[0], tmp32_imag.val[1]); + + /* compute next four phases */ + tmp32_real.val[0] = vmulq_f32(_phase_real, _phase4_real); + tmp32_real.val[1] = vmulq_f32(_phase_imag, _phase4_imag); + tmp32_imag.val[0] = vmulq_f32(_phase_real, _phase4_imag); + tmp32_imag.val[1] = vmulq_f32(_phase_imag, _phase4_real); + + _phase_real 
= vsubq_f32(tmp32_real.val[0], tmp32_real.val[1]); + _phase_imag = vaddq_f32(tmp32_imag.val[0], tmp32_imag.val[1]); + + // Regenerate phase + if ((number % 128) == 0) + { + phase_est = arg_phase0 + (number + 1) * 4 * arg_phase_inc; + + _phase = lv_cmake(cos(phase_est), sin(phase_est)); + phase2 = _phase * phase_inc; + phase3 = phase2 * phase_inc; + phase4 = phase3 * phase_inc; + + __VOLK_ATTR_ALIGNED(16) float32_t ____phase_real[4] = { lv_creal((_phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) }; + __VOLK_ATTR_ALIGNED(16) float32_t ____phase_imag[4] = { lv_cimag((_phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) }; + + _phase_real = vld1q_f32(____phase_real); + _phase_imag = vld1q_f32(____phase_imag); + } + + for (int n_vec = 0; n_vec < num_a_vectors; n_vec++) + { + a_val = vld2q_f32((float32_t*)&(_in_a[n_vec][number * 4])); + + // use 2 accumulators to remove inter-instruction data dependencies + accumulator1[n_vec].val[0] = vmlaq_f32(accumulator1[n_vec].val[0], a_val.val[0], b_val.val[0]); + accumulator2[n_vec].val[0] = vmlsq_f32(accumulator2[n_vec].val[0], a_val.val[1], b_val.val[1]); + accumulator1[n_vec].val[1] = vmlaq_f32(accumulator1[n_vec].val[1], a_val.val[0], b_val.val[1]); + accumulator2[n_vec].val[1] = vmlaq_f32(accumulator2[n_vec].val[1], a_val.val[1], b_val.val[0]); + } + } + for (int n_vec = 0; n_vec < num_a_vectors; n_vec++) + { + accumulator1[n_vec].val[0] = vaddq_f32(accumulator1[n_vec].val[0], accumulator2[n_vec].val[0]); + accumulator1[n_vec].val[1] = vaddq_f32(accumulator1[n_vec].val[1], accumulator2[n_vec].val[1]); + } + for (int n_vec = 0; n_vec < num_a_vectors; n_vec++) + { + vst2q_f32((float32_t*)dotProductVector, accumulator1[n_vec]); // Store the results back into the dot product vector + dotProduct = lv_cmake(0,0); + for (int i = 0; i < 4; ++i) + { + dotProduct = dotProduct + dotProductVector[i]; + } + _out[n_vec] = dotProduct; + } + volk_gnsssdr_free(accumulator1); + volk_gnsssdr_free(accumulator2); + + vst1q_f32((float32_t*)__phase_real, _phase_real); + vst1q_f32((float32_t*)__phase_imag, _phase_imag); + + _phase = lv_cmake((float32_t)__phase_real[0], (float32_t)__phase_imag[0]); + } + + for(unsigned int n = neon_iters * 4; n < num_points; n++) + { + tmp32_1 = in_common[n] * _phase; + _phase *= phase_inc; + for (int n_vec = 0; n_vec < num_a_vectors; n_vec++) + { + tmp32_2 = tmp32_1 * in_a[n_vec][n]; + _out[n_vec] += tmp32_2; + } + } + (*phase) = _phase; +} + +#endif /* LV_HAVE_NEON */ + +#endif /* INCLUDED_volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_H */ + diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc.h new file mode 100644 index 000000000..bbd07d8ec --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc.h @@ -0,0 +1,253 @@ +/*! + * \file volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc.h + * \brief Volk puppet for the multiple 16-bit complex dot product kernel. + * \authors
+ *
+ *          Carles Fernandez Prades, 2016. cfernandez at cttc dot cat
+ *
+ * + * Volk puppet for integrating the resampler into volk's test system + * + * ------------------------------------------------------------------------- + * + * Copyright (C) 2010-2015 (see AUTHORS file for a list of contributors) + * + * GNSS-SDR is a software defined Global Navigation + * Satellite Systems receiver + * + * This file is part of GNSS-SDR. + * + * GNSS-SDR is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * GNSS-SDR is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNSS-SDR. If not, see . + * + * ------------------------------------------------------------------------- + */ + +#ifndef INCLUDED_volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_H +#define INCLUDED_volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_H + +#include "volk_gnsssdr/volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn.h" +#include +#include +#include + +#ifdef LV_HAVE_GENERIC +static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_generic(lv_32fc_t* result, const lv_32fc_t* local_code, const lv_32fc_t* in, unsigned int num_points) +{ + // phases must be normalized. Phase rotator expects a complex exponential input! + float rem_carrier_phase_in_rad = 0.25; + float phase_step_rad = 0.1; + lv_32fc_t phase[1]; + phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad)); + lv_32fc_t phase_inc[1]; + phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad)); + + int num_a_vectors = 3; + lv_32fc_t** in_a = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); + for(unsigned int n = 0; n < num_a_vectors; n++) + { + in_a[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment()); + memcpy((lv_32fc_t*)in_a[n], (lv_32fc_t*)in, sizeof(lv_32fc_t) * num_points); + } + volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_generic_reload(result, local_code, phase_inc[0], phase, (const lv_32fc_t**) in_a, num_a_vectors, num_points); + + for(unsigned int n = 0; n < num_a_vectors; n++) + { + volk_gnsssdr_free(in_a[n]); + } + volk_gnsssdr_free(in_a); +} + +#endif // Generic + + +#ifdef LV_HAVE_GENERIC +static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_generic_reload(lv_32fc_t* result, const lv_32fc_t* local_code, const lv_32fc_t* in, unsigned int num_points) +{ + // phases must be normalized. Phase rotator expects a complex exponential input! 
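+    // Both phase[0] and phase_inc[0] are therefore built below as exp(j*theta) = cos(theta) + j*sin(theta),
+    // so they have unit magnitude and the rotator update phase <- phase * phase_inc only changes the
+    // angle of the carrier replica, never its amplitude.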
+ float rem_carrier_phase_in_rad = 0.25; + float phase_step_rad = 0.1; + lv_32fc_t phase[1]; + phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad)); + lv_32fc_t phase_inc[1]; + phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad)); + + int num_a_vectors = 3; + lv_32fc_t** in_a = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); + for(unsigned int n = 0; n < num_a_vectors; n++) + { + in_a[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment()); + memcpy((lv_32fc_t*)in_a[n], (lv_32fc_t*)in, sizeof(lv_32fc_t) * num_points); + } + volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_generic_reload(result, local_code, phase_inc[0], phase, (const lv_32fc_t**) in_a, num_a_vectors, num_points); + + for(unsigned int n = 0; n < num_a_vectors; n++) + { + volk_gnsssdr_free(in_a[n]); + } + volk_gnsssdr_free(in_a); +} + +#endif // Generic + + +#ifdef LV_HAVE_SSE3 +static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_u_sse3(lv_32fc_t* result, const lv_32fc_t* local_code, const lv_32fc_t* in, unsigned int num_points) +{ + // phases must be normalized. Phase rotator expects a complex exponential input! + float rem_carrier_phase_in_rad = 0.25; + float phase_step_rad = 0.1; + lv_32fc_t phase[1]; + phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad)); + lv_32fc_t phase_inc[1]; + phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad)); + + int num_a_vectors = 3; + lv_32fc_t** in_a = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); + for(unsigned int n = 0; n < num_a_vectors; n++) + { + in_a[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment()); + memcpy((lv_32fc_t*)in_a[n], (lv_32fc_t*)in, sizeof(lv_32fc_t) * num_points); + } + volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_sse3(result, local_code, phase_inc[0], phase, (const lv_32fc_t**) in_a, num_a_vectors, num_points); + + for(unsigned int n = 0; n < num_a_vectors; n++) + { + volk_gnsssdr_free(in_a[n]); + } + volk_gnsssdr_free(in_a); +} + +#endif // SSE3 + + +#ifdef LV_HAVE_SSE3 +static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_a_sse3(lv_32fc_t* result, const lv_32fc_t* local_code, const lv_32fc_t* in, unsigned int num_points) +{ + // phases must be normalized. Phase rotator expects a complex exponential input! 
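+    // The fixed values below (0.25 rad initial carrier phase, 0.1 rad/sample phase step) are test
+    // stimuli only, so every protokernel is driven with identical rotator parameters by the test system.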
+ float rem_carrier_phase_in_rad = 0.25; + float phase_step_rad = 0.1; + lv_32fc_t phase[1]; + phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad)); + lv_32fc_t phase_inc[1]; + phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad)); + + int num_a_vectors = 3; + lv_32fc_t** in_a = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); + for(unsigned int n = 0; n < num_a_vectors; n++) + { + in_a[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment()); + memcpy((lv_32fc_t*)in_a[n], (lv_32fc_t*)in, sizeof(lv_32fc_t) * num_points); + } + volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_sse3(result, local_code, phase_inc[0], phase, (const lv_32fc_t**) in_a, num_a_vectors, num_points); + + for(unsigned int n = 0; n < num_a_vectors; n++) + { + volk_gnsssdr_free(in_a[n]); + } + volk_gnsssdr_free(in_a); +} + +#endif // SSE3 + + +#ifdef LV_HAVE_AVX +static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_u_avx(lv_32fc_t* result, const lv_32fc_t* local_code, const lv_32fc_t* in, unsigned int num_points) +{ + // phases must be normalized. Phase rotator expects a complex exponential input! + float rem_carrier_phase_in_rad = 0.25; + float phase_step_rad = 0.1; + lv_32fc_t phase[1]; + phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad)); + lv_32fc_t phase_inc[1]; + phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad)); + + int num_a_vectors = 3; + lv_32fc_t** in_a = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); + for(unsigned int n = 0; n < num_a_vectors; n++) + { + in_a[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment()); + memcpy((lv_32fc_t*)in_a[n], (lv_32fc_t*)in, sizeof(lv_32fc_t) * num_points); + } + volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(result, local_code, phase_inc[0], phase, (const lv_32fc_t**) in_a, num_a_vectors, num_points); + + for(unsigned int n = 0; n < num_a_vectors; n++) + { + volk_gnsssdr_free(in_a[n]); + } + volk_gnsssdr_free(in_a); +} + +#endif // AVX + + +#ifdef LV_HAVE_AVX +static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_a_avx(lv_32fc_t* result, const lv_32fc_t* local_code, const lv_32fc_t* in, unsigned int num_points) +{ + // phases must be normalized. Phase rotator expects a complex exponential input! 
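+    // num_a_vectors is set to 3 below, mimicking a multicorrelator that correlates the same rotated
+    // input against three local code replicas (e.g. Early, Prompt and Late taps).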
+ float rem_carrier_phase_in_rad = 0.25; + float phase_step_rad = 0.1; + lv_32fc_t phase[1]; + phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad)); + lv_32fc_t phase_inc[1]; + phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad)); + + int num_a_vectors = 3; + lv_32fc_t** in_a = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); + for(unsigned int n = 0; n < num_a_vectors; n++) + { + in_a[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment()); + memcpy((lv_32fc_t*)in_a[n], (lv_32fc_t*)in, sizeof(lv_32fc_t) * num_points); + } + volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(result, local_code, phase_inc[0], phase, (const lv_32fc_t**) in_a, num_a_vectors, num_points); + + for(unsigned int n = 0; n < num_a_vectors; n++) + { + volk_gnsssdr_free(in_a[n]); + } + volk_gnsssdr_free(in_a); +} + +#endif // AVX + + +#ifdef LV_HAVE_NEON +static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_neon(lv_32fc_t* result, const lv_32fc_t* local_code, const lv_32fc_t* in, unsigned int num_points) +{ + // phases must be normalized. Phase rotator expects a complex exponential input! + float rem_carrier_phase_in_rad = 0.25; + float phase_step_rad = 0.1; + lv_32fc_t phase[1]; + phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad)); + lv_32fc_t phase_inc[1]; + phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad)); + + int num_a_vectors = 3; + lv_32fc_t** in_a = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); + for(unsigned int n = 0; n < num_a_vectors; n++) + { + in_a[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment()); + memcpy((lv_32fc_t*)in_a[n], (lv_32fc_t*)in, sizeof(lv_32fc_t) * num_points); + } + volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_neon(result, local_code, phase_inc[0], phase, (const lv_32fc_t**) in_a, num_a_vectors, num_points); + + for(unsigned int n = 0; n < num_a_vectors; n++) + { + volk_gnsssdr_free(in_a[n]); + } + volk_gnsssdr_free(in_a); +} + +#endif // AVX + + +#endif // INCLUDED_volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_H diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_s32f_sincos_32fc.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_s32f_sincos_32fc.h index 0f76067cb..8fbadeaf5 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_s32f_sincos_32fc.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_s32f_sincos_32fc.h @@ -433,6 +433,342 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_generic_fxpt(lv_32fc_t* out, co #endif /* LV_HAVE_GENERIC */ +#ifdef LV_HAVE_AVX2 +#include +/* Based on algorithms from the cephes library http://www.netlib.org/cephes/ + * Adapted to AVX2 by Carles Fernandez, based on original SSE2 code by Julien Pommier*/ +static inline void volk_gnsssdr_s32f_sincos_32fc_a_avx2(lv_32fc_t* out, const float phase_inc, float* phase, unsigned int num_points) +{ + lv_32fc_t* bPtr = out; + + const unsigned int avx_iters = num_points / 8; + unsigned int number = 0; + + float _phase = (*phase); + + __m256 sine, cosine, x, eight_phases_reg; + __m256 xmm1, xmm2, xmm3 = _mm256_setzero_ps(), sign_bit_sin, y; + __m256i emm0, emm2, emm4; + __m128 aux, c1, s1; + + /* declare some AXX2 constants */ + static 
const int _ps_inv_sign_mask[8] __attribute__((aligned(32))) = { ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000 }; + static const int _ps_sign_mask[8] __attribute__((aligned(32))) = { (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000 }; + + static const float _ps_cephes_FOPI[8] __attribute__((aligned(32))) = { 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516 }; + static const int _pi32_1[8] __attribute__((aligned(32))) = { 1, 1, 1, 1, 1, 1, 1, 1 }; + static const int _pi32_inv1[8] __attribute__((aligned(32))) = { ~1, ~1, ~1, ~1, ~1, ~1, ~1, ~1 }; + static const int _pi32_2[8] __attribute__((aligned(32))) = { 2, 2, 2, 2, 2, 2, 2, 2 }; + static const int _pi32_4[8] __attribute__((aligned(32))) = { 4, 4, 4, 4, 4, 4, 4, 4 }; + + static const float _ps_minus_cephes_DP1[8] __attribute__((aligned(32))) = { -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625 }; + static const float _ps_minus_cephes_DP2[8] __attribute__((aligned(32))) = { -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4 }; + static const float _ps_minus_cephes_DP3[8] __attribute__((aligned(32))) = { -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8 }; + static const float _ps_coscof_p0[8] __attribute__((aligned(32))) = { 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005 }; + static const float _ps_coscof_p1[8] __attribute__((aligned(32))) = { -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003 }; + static const float _ps_coscof_p2[8] __attribute__((aligned(32))) = { 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002 }; + static const float _ps_sincof_p0[8] __attribute__((aligned(32))) = { -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4 }; + static const float _ps_sincof_p1[8] __attribute__((aligned(32))) = { 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3 }; + static const float _ps_sincof_p2[8] __attribute__((aligned(32))) = { -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1 }; + static const float _ps_0p5[8] __attribute__((aligned(32))) = { 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f }; + static const float _ps_1[8] __attribute__((aligned(32))) = { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f }; + + float eight_phases[8] __attribute__((aligned(32))) = { _phase, _phase + phase_inc, _phase + 2 * phase_inc, _phase + 3 * phase_inc, _phase + 4 * 
phase_inc, _phase + 5 * phase_inc, _phase + 6 * phase_inc, _phase + 7 * phase_inc }; + float eight_phases_inc[8] __attribute__((aligned(32))) = { 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc }; + eight_phases_reg = _mm256_load_ps(eight_phases); + const __m256 eight_phases_inc_reg = _mm256_load_ps(eight_phases_inc); + + for(;number < avx_iters; number++) + { + x = eight_phases_reg; + + sign_bit_sin = x; + /* take the absolute value */ + x = _mm256_and_ps(x, *(__m256*)_ps_inv_sign_mask); + /* extract the sign bit (upper one) */ + sign_bit_sin = _mm256_and_ps(sign_bit_sin, *(__m256*)_ps_sign_mask); + + /* scale by 4/Pi */ + y = _mm256_mul_ps(x, *(__m256*)_ps_cephes_FOPI); + + /* store the integer part of y in emm2 */ + emm2 = _mm256_cvttps_epi32(y); + + /* j=(j+1) & (~1) (see the cephes sources) */ + emm2 = _mm256_add_epi32(emm2, *(__m256i *)_pi32_1); + emm2 = _mm256_and_si256(emm2, *(__m256i *)_pi32_inv1); + y = _mm256_cvtepi32_ps(emm2); + + emm4 = emm2; + + /* get the swap sign flag for the sine */ + emm0 = _mm256_and_si256(emm2, *(__m256i *)_pi32_4); + emm0 = _mm256_slli_epi32(emm0, 29); + __m256 swap_sign_bit_sin = _mm256_castsi256_ps(emm0); + + /* get the polynom selection mask for the sine*/ + emm2 = _mm256_and_si256(emm2, *(__m256i *)_pi32_2); + emm2 = _mm256_cmpeq_epi32(emm2, _mm256_setzero_si256()); + __m256 poly_mask = _mm256_castsi256_ps(emm2); + + /* The magic pass: "Extended precision modular arithmetic” + x = ((x - y * DP1) - y * DP2) - y * DP3; */ + xmm1 = *(__m256*)_ps_minus_cephes_DP1; + xmm2 = *(__m256*)_ps_minus_cephes_DP2; + xmm3 = *(__m256*)_ps_minus_cephes_DP3; + xmm1 = _mm256_mul_ps(y, xmm1); + xmm2 = _mm256_mul_ps(y, xmm2); + xmm3 = _mm256_mul_ps(y, xmm3); + x = _mm256_add_ps(x, xmm1); + x = _mm256_add_ps(x, xmm2); + x = _mm256_add_ps(x, xmm3); + + emm4 = _mm256_sub_epi32(emm4, *(__m256i *)_pi32_2); + emm4 = _mm256_andnot_si256(emm4, *(__m256i *)_pi32_4); + emm4 = _mm256_slli_epi32(emm4, 29); + __m256 sign_bit_cos = _mm256_castsi256_ps(emm4); + + sign_bit_sin = _mm256_xor_ps(sign_bit_sin, swap_sign_bit_sin); + + /* Evaluate the first polynom (0 <= x <= Pi/4) */ + __m256 z = _mm256_mul_ps(x, x); + y = *(__m256*)_ps_coscof_p0; + + y = _mm256_mul_ps(y, z); + y = _mm256_add_ps(y, *(__m256*)_ps_coscof_p1); + y = _mm256_mul_ps(y, z); + y = _mm256_add_ps(y, *(__m256*)_ps_coscof_p2); + y = _mm256_mul_ps(y, z); + y = _mm256_mul_ps(y, z); + __m256 tmp = _mm256_mul_ps(z, *(__m256*)_ps_0p5); + y = _mm256_sub_ps(y, tmp); + y = _mm256_add_ps(y, *(__m256*)_ps_1); + + /* Evaluate the second polynom (Pi/4 <= x <= 0) */ + __m256 y2 = *(__m256*)_ps_sincof_p0; + y2 = _mm256_mul_ps(y2, z); + y2 = _mm256_add_ps(y2, *(__m256*)_ps_sincof_p1); + y2 = _mm256_mul_ps(y2, z); + y2 = _mm256_add_ps(y2, *(__m256*)_ps_sincof_p2); + y2 = _mm256_mul_ps(y2, z); + y2 = _mm256_mul_ps(y2, x); + y2 = _mm256_add_ps(y2, x); + + /* select the correct result from the two polynoms */ + xmm3 = poly_mask; + __m256 ysin2 = _mm256_and_ps(xmm3, y2); + __m256 ysin1 = _mm256_andnot_ps(xmm3, y); + y2 = _mm256_sub_ps(y2, ysin2); + y = _mm256_sub_ps(y, ysin1); + + xmm1 = _mm256_add_ps(ysin1, ysin2); + xmm2 = _mm256_add_ps(y, y2); + + /* update the sign */ + sine = _mm256_xor_ps(xmm1, sign_bit_sin); + cosine = _mm256_xor_ps(xmm2, sign_bit_cos); + + /* write the output */ + s1 = _mm256_extractf128_ps(sine, 0); + c1 = _mm256_extractf128_ps(cosine, 0); + aux = _mm_unpacklo_ps(c1, s1); + _mm_store_ps((float*)bPtr, aux); + bPtr += 2; + aux = 
_mm_unpackhi_ps(c1, s1); + _mm_store_ps((float*)bPtr, aux); + bPtr += 2; + s1 = _mm256_extractf128_ps(sine, 1); + c1 = _mm256_extractf128_ps(cosine, 1); + aux = _mm_unpacklo_ps(c1, s1); + _mm_store_ps((float*)bPtr, aux); + bPtr += 2; + aux = _mm_unpackhi_ps(c1, s1); + _mm_store_ps((float*)bPtr, aux); + bPtr += 2; + + eight_phases_reg = _mm256_add_ps(eight_phases_reg, eight_phases_inc_reg); + } + _mm256_zeroupper(); + _phase = _phase + phase_inc * (avx_iters * 8); + for(number = avx_iters * 8; number < num_points; number++) + { + out[number] = lv_cmake((float)cos(_phase), (float)sin(_phase) ); + _phase += phase_inc; + } + (*phase) = _phase; +} + +#endif /* LV_HAVE_AVX2 */ + + +#ifdef LV_HAVE_AVX2 +#include +/* Based on algorithms from the cephes library http://www.netlib.org/cephes/ + * Adapted to AVX2 by Carles Fernandez, based on original SSE2 code by Julien Pommier*/ +static inline void volk_gnsssdr_s32f_sincos_32fc_u_avx2(lv_32fc_t* out, const float phase_inc, float* phase, unsigned int num_points) +{ + lv_32fc_t* bPtr = out; + + const unsigned int avx_iters = num_points / 8; + unsigned int number = 0; + + float _phase = (*phase); + + __m256 sine, cosine, x, eight_phases_reg; + __m256 xmm1, xmm2, xmm3 = _mm256_setzero_ps(), sign_bit_sin, y; + __m256i emm0, emm2, emm4; + __m128 aux, c1, s1; + + /* declare some AXX2 constants */ + static const int _ps_inv_sign_mask[8] __attribute__((aligned(32))) = { ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000 }; + static const int _ps_sign_mask[8] __attribute__((aligned(32))) = { (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000 }; + + static const float _ps_cephes_FOPI[8] __attribute__((aligned(32))) = { 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516 }; + static const int _pi32_1[8] __attribute__((aligned(32))) = { 1, 1, 1, 1, 1, 1, 1, 1 }; + static const int _pi32_inv1[8] __attribute__((aligned(32))) = { ~1, ~1, ~1, ~1, ~1, ~1, ~1, ~1 }; + static const int _pi32_2[8] __attribute__((aligned(32))) = { 2, 2, 2, 2, 2, 2, 2, 2 }; + static const int _pi32_4[8] __attribute__((aligned(32))) = { 4, 4, 4, 4, 4, 4, 4, 4 }; + + static const float _ps_minus_cephes_DP1[8] __attribute__((aligned(32))) = { -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625 }; + static const float _ps_minus_cephes_DP2[8] __attribute__((aligned(32))) = { -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4 }; + static const float _ps_minus_cephes_DP3[8] __attribute__((aligned(32))) = { -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8 }; + static const float _ps_coscof_p0[8] __attribute__((aligned(32))) = { 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005 }; + static const float _ps_coscof_p1[8] __attribute__((aligned(32))) = { -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, 
-1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003 }; + static const float _ps_coscof_p2[8] __attribute__((aligned(32))) = { 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002 }; + static const float _ps_sincof_p0[8] __attribute__((aligned(32))) = { -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4 }; + static const float _ps_sincof_p1[8] __attribute__((aligned(32))) = { 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3 }; + static const float _ps_sincof_p2[8] __attribute__((aligned(32))) = { -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1 }; + static const float _ps_0p5[8] __attribute__((aligned(32))) = { 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f }; + static const float _ps_1[8] __attribute__((aligned(32))) = { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f }; + + float eight_phases[8] __attribute__((aligned(32))) = { _phase, _phase + phase_inc, _phase + 2 * phase_inc, _phase + 3 * phase_inc, _phase + 4 * phase_inc, _phase + 5 * phase_inc, _phase + 6 * phase_inc, _phase + 7 * phase_inc }; + float eight_phases_inc[8] __attribute__((aligned(32))) = { 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc }; + eight_phases_reg = _mm256_load_ps(eight_phases); + const __m256 eight_phases_inc_reg = _mm256_load_ps(eight_phases_inc); + + for(;number < avx_iters; number++) + { + x = eight_phases_reg; + + sign_bit_sin = x; + /* take the absolute value */ + x = _mm256_and_ps(x, *(__m256*)_ps_inv_sign_mask); + /* extract the sign bit (upper one) */ + sign_bit_sin = _mm256_and_ps(sign_bit_sin, *(__m256*)_ps_sign_mask); + + /* scale by 4/Pi */ + y = _mm256_mul_ps(x, *(__m256*)_ps_cephes_FOPI); + + /* store the integer part of y in emm2 */ + emm2 = _mm256_cvttps_epi32(y); + + /* j=(j+1) & (~1) (see the cephes sources) */ + emm2 = _mm256_add_epi32(emm2, *(__m256i *)_pi32_1); + emm2 = _mm256_and_si256(emm2, *(__m256i *)_pi32_inv1); + y = _mm256_cvtepi32_ps(emm2); + + emm4 = emm2; + + /* get the swap sign flag for the sine */ + emm0 = _mm256_and_si256(emm2, *(__m256i *)_pi32_4); + emm0 = _mm256_slli_epi32(emm0, 29); + __m256 swap_sign_bit_sin = _mm256_castsi256_ps(emm0); + + /* get the polynom selection mask for the sine*/ + emm2 = _mm256_and_si256(emm2, *(__m256i *)_pi32_2); + emm2 = _mm256_cmpeq_epi32(emm2, _mm256_setzero_si256()); + __m256 poly_mask = _mm256_castsi256_ps(emm2); + + /* The magic pass: "Extended precision modular arithmetic” + x = ((x - y * DP1) - y * DP2) - y * DP3; */ + xmm1 = *(__m256*)_ps_minus_cephes_DP1; + xmm2 = *(__m256*)_ps_minus_cephes_DP2; + xmm3 = *(__m256*)_ps_minus_cephes_DP3; + xmm1 = _mm256_mul_ps(y, xmm1); + xmm2 = _mm256_mul_ps(y, xmm2); + xmm3 = _mm256_mul_ps(y, xmm3); + x = _mm256_add_ps(x, xmm1); + x = _mm256_add_ps(x, xmm2); + x = _mm256_add_ps(x, xmm3); + + emm4 = _mm256_sub_epi32(emm4, *(__m256i *)_pi32_2); + emm4 = _mm256_andnot_si256(emm4, *(__m256i *)_pi32_4); + emm4 = _mm256_slli_epi32(emm4, 29); + __m256 sign_bit_cos = _mm256_castsi256_ps(emm4); + + sign_bit_sin = _mm256_xor_ps(sign_bit_sin, swap_sign_bit_sin); + + /* Evaluate the first polynom (0 <= x <= Pi/4) */ 
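+            /* Horner evaluation on z = x*x: the multiply/add chain below computes
+               cos(x) ~= 1 - 0.5*z + z*z*(coscof_p2 + z*(coscof_p1 + z*coscof_p0)) */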
+ __m256 z = _mm256_mul_ps(x, x); + y = *(__m256*)_ps_coscof_p0; + + y = _mm256_mul_ps(y, z); + y = _mm256_add_ps(y, *(__m256*)_ps_coscof_p1); + y = _mm256_mul_ps(y, z); + y = _mm256_add_ps(y, *(__m256*)_ps_coscof_p2); + y = _mm256_mul_ps(y, z); + y = _mm256_mul_ps(y, z); + __m256 tmp = _mm256_mul_ps(z, *(__m256*)_ps_0p5); + y = _mm256_sub_ps(y, tmp); + y = _mm256_add_ps(y, *(__m256*)_ps_1); + + /* Evaluate the second polynom (Pi/4 <= x <= 0) */ + __m256 y2 = *(__m256*)_ps_sincof_p0; + y2 = _mm256_mul_ps(y2, z); + y2 = _mm256_add_ps(y2, *(__m256*)_ps_sincof_p1); + y2 = _mm256_mul_ps(y2, z); + y2 = _mm256_add_ps(y2, *(__m256*)_ps_sincof_p2); + y2 = _mm256_mul_ps(y2, z); + y2 = _mm256_mul_ps(y2, x); + y2 = _mm256_add_ps(y2, x); + + /* select the correct result from the two polynoms */ + xmm3 = poly_mask; + __m256 ysin2 = _mm256_and_ps(xmm3, y2); + __m256 ysin1 = _mm256_andnot_ps(xmm3, y); + y2 = _mm256_sub_ps(y2, ysin2); + y = _mm256_sub_ps(y, ysin1); + + xmm1 = _mm256_add_ps(ysin1, ysin2); + xmm2 = _mm256_add_ps(y, y2); + + /* update the sign */ + sine = _mm256_xor_ps(xmm1, sign_bit_sin); + cosine = _mm256_xor_ps(xmm2, sign_bit_cos); + + /* write the output */ + s1 = _mm256_extractf128_ps(sine, 0); + c1 = _mm256_extractf128_ps(cosine, 0); + aux = _mm_unpacklo_ps(c1, s1); + _mm_storeu_ps((float*)bPtr, aux); + bPtr += 2; + aux = _mm_unpackhi_ps(c1, s1); + _mm_storeu_ps((float*)bPtr, aux); + bPtr += 2; + s1 = _mm256_extractf128_ps(sine, 1); + c1 = _mm256_extractf128_ps(cosine, 1); + aux = _mm_unpacklo_ps(c1, s1); + _mm_storeu_ps((float*)bPtr, aux); + bPtr += 2; + aux = _mm_unpackhi_ps(c1, s1); + _mm_storeu_ps((float*)bPtr, aux); + bPtr += 2; + + eight_phases_reg = _mm256_add_ps(eight_phases_reg, eight_phases_inc_reg); + } + _mm256_zeroupper(); + _phase = _phase + phase_inc * (avx_iters * 8); + for(number = avx_iters * 8; number < num_points; number++) + { + out[number] = lv_cmake((float)cos(_phase), (float)sin(_phase) ); + _phase += phase_inc; + } + (*phase) = _phase; +} + +#endif /* LV_HAVE_AVX2 */ + + #ifdef LV_HAVE_NEON #include /* Adapted from http://gruntthepeon.free.fr/ssemath/neon_mathfun.h, original code from Julien Pommier */ diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_s32f_sincospuppet_32fc.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_s32f_sincospuppet_32fc.h index d0920daef..07d3bf5d2 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_s32f_sincospuppet_32fc.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_s32f_sincospuppet_32fc.h @@ -83,6 +83,26 @@ static inline void volk_gnsssdr_s32f_sincospuppet_32fc_u_sse2(lv_32fc_t* out, co #endif /* LV_HAVE_SSE2 */ +#ifdef LV_HAVE_AVX2 +static inline void volk_gnsssdr_s32f_sincospuppet_32fc_a_avx2(lv_32fc_t* out, const float phase_inc, unsigned int num_points) +{ + float phase[1]; + phase[0] = 3; + volk_gnsssdr_s32f_sincos_32fc_a_avx2(out, phase_inc, phase, num_points); +} +#endif /* LV_HAVE_AVX2 */ + + +#ifdef LV_HAVE_AVX2 +static inline void volk_gnsssdr_s32f_sincospuppet_32fc_u_avx2(lv_32fc_t* out, const float phase_inc, unsigned int num_points) +{ + float phase[1]; + phase[0] = 3; + volk_gnsssdr_s32f_sincos_32fc_u_avx2(out, phase_inc, phase, num_points); +} +#endif /* LV_HAVE_AVX2 */ + + #ifdef LV_HAVE_NEON static inline void volk_gnsssdr_s32f_sincospuppet_32fc_neon(lv_32fc_t* out, const float phase_inc, unsigned int num_points) { diff --git 
a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/kernel_tests.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/kernel_tests.h index b03e0c62a..6daefb3f3 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/kernel_tests.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/kernel_tests.h @@ -89,6 +89,7 @@ std::vector init_test_list(volk_gnsssdr_test_params_t (VOLK_INIT_PUPP(volk_gnsssdr_16ic_resamplerxnpuppet_16ic, volk_gnsssdr_16ic_xn_resampler_16ic_xn, test_params)) (VOLK_INIT_PUPP(volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic, volk_gnsssdr_16ic_x2_dot_prod_16ic_xn, test_params)) (VOLK_INIT_PUPP(volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic, volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn, test_params_int16)) + (VOLK_INIT_PUPP(volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc, volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn, test_params_int1)) ; return test_cases; diff --git a/src/algorithms/tracking/libs/cpu_multicorrelator.cc b/src/algorithms/tracking/libs/cpu_multicorrelator.cc index 0deea37b2..933d06e5a 100644 --- a/src/algorithms/tracking/libs/cpu_multicorrelator.cc +++ b/src/algorithms/tracking/libs/cpu_multicorrelator.cc @@ -35,8 +35,29 @@ #include "cpu_multicorrelator.h" #include #include -#include // fixed point sine and cosine #include +#include + + +cpu_multicorrelator::cpu_multicorrelator() +{ + d_sig_in = NULL; + d_local_code_in = NULL; + d_shifts_chips = NULL; + d_corr_out = NULL; + d_local_codes_resampled = NULL; + d_code_length_chips = 0; + d_n_correlators = 0; +} + + +cpu_multicorrelator::~cpu_multicorrelator() +{ + if(d_local_codes_resampled != NULL) + { + cpu_multicorrelator::free(); + } +} bool cpu_multicorrelator::init( @@ -47,13 +68,7 @@ bool cpu_multicorrelator::init( // ALLOCATE MEMORY FOR INTERNAL vectors size_t size = max_signal_length_samples * sizeof(std::complex); - // NCO signal - d_nco_in = static_cast*>(volk_malloc(size, volk_get_alignment())); - - // Doppler-free signal - d_sig_doppler_wiped = static_cast*>(volk_malloc(size, volk_get_alignment())); - - d_local_codes_resampled = new std::complex*[n_correlators]; + d_local_codes_resampled = static_cast**>(volk_malloc(n_correlators * sizeof(std::complex), volk_get_alignment())); for (int n = 0; n < n_correlators; n++) { d_local_codes_resampled[n] = static_cast*>(volk_malloc(size, volk_get_alignment())); @@ -86,8 +101,7 @@ bool cpu_multicorrelator::set_input_output_vectors(std::complex* corr_out } - -void cpu_multicorrelator::update_local_code(int correlator_length_samples,float rem_code_phase_chips, float code_phase_step_chips) +void cpu_multicorrelator::update_local_code(int correlator_length_samples, float rem_code_phase_chips, float code_phase_step_chips) { float local_code_chip_index; for (int current_correlator_tap = 0; current_correlator_tap < d_n_correlators; current_correlator_tap++) @@ -95,7 +109,7 @@ void cpu_multicorrelator::update_local_code(int correlator_length_samples,float for (int n = 0; n < correlator_length_samples; n++) { // resample code for current tap - local_code_chip_index = std::fmod(code_phase_step_chips*static_cast(n)+ d_shifts_chips[current_correlator_tap] - rem_code_phase_chips, d_code_length_chips); + local_code_chip_index = std::fmod(code_phase_step_chips * static_cast(n) + d_shifts_chips[current_correlator_tap] - rem_code_phase_chips, d_code_length_chips); //Take into account that in multitap correlators, the shifts can be negative! 
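                    // e.g. a tap shift of -0.5 chips with rem_code_phase_chips = 0 gives an index of -0.5
                    // for the first sample; the correction below wraps it back into [0, d_code_length_chips)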
if (local_code_chip_index < 0.0) local_code_chip_index += d_code_length_chips; d_local_codes_resampled[current_correlator_tap][n] = d_local_code_in[static_cast(round(local_code_chip_index))]; @@ -104,20 +118,6 @@ void cpu_multicorrelator::update_local_code(int correlator_length_samples,float } -void cpu_multicorrelator::update_local_carrier(int correlator_length_samples, float rem_carr_phase_rad, float phase_step_rad) -{ - float sin_f, cos_f; - int phase_step_rad_i = gr::fxpt::float_to_fixed(phase_step_rad); - int phase_rad_i = gr::fxpt::float_to_fixed(rem_carr_phase_rad); - - for(int i = 0; i < correlator_length_samples; i++) - { - gr::fxpt::sincos(phase_rad_i, &sin_f, &cos_f); - d_nco_in[i] = std::complex(cos_f, -sin_f); - phase_rad_i += phase_step_rad_i; - } -} - bool cpu_multicorrelator::Carrier_wipeoff_multicorrelator_resampler( float rem_carrier_phase_in_rad, float phase_step_rad, @@ -125,44 +125,24 @@ bool cpu_multicorrelator::Carrier_wipeoff_multicorrelator_resampler( float code_phase_step_chips, int signal_length_samples) { - //update_local_carrier(signal_length_samples, rem_carrier_phase_in_rad, phase_step_rad); //replaced by VOLK phase rotator - //volk_32fc_x2_multiply_32fc(d_sig_doppler_wiped, d_sig_in, d_nco_in, signal_length_samples); //replaced by VOLK phase rotator - + update_local_code(signal_length_samples, rem_code_phase_chips, code_phase_step_chips); + // Regenerate phase at each call in order to avoid numerical issues lv_32fc_t phase_offset_as_complex[1]; phase_offset_as_complex[0] = lv_cmake(std::cos(rem_carrier_phase_in_rad), -std::sin(rem_carrier_phase_in_rad)); - volk_32fc_s32fc_x2_rotator_32fc(d_sig_doppler_wiped, d_sig_in, std::exp(lv_32fc_t(0, -phase_step_rad)), phase_offset_as_complex, signal_length_samples); - update_local_code(signal_length_samples,rem_code_phase_chips, code_phase_step_chips); - for (int current_correlator_tap = 0; current_correlator_tap < d_n_correlators; current_correlator_tap++) - { - volk_32fc_x2_dot_prod_32fc(&d_corr_out[current_correlator_tap], d_sig_doppler_wiped, d_local_codes_resampled[current_correlator_tap], signal_length_samples); - } + // call VOLK_GNSSSDR kernel + volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn(d_corr_out, d_sig_in, std::exp(lv_32fc_t(0, - phase_step_rad)), phase_offset_as_complex, (const lv_32fc_t**)d_local_codes_resampled, d_n_correlators, signal_length_samples); return true; } -cpu_multicorrelator::cpu_multicorrelator() -{ - d_sig_in = NULL; - d_nco_in = NULL; - d_sig_doppler_wiped = NULL; - d_local_code_in = NULL; - d_shifts_chips = NULL; - d_corr_out = NULL; - d_local_codes_resampled = NULL; - d_code_length_chips = 0; - d_n_correlators = 0; -} - bool cpu_multicorrelator::free() { // Free memory - if (d_sig_doppler_wiped != NULL) volk_free(d_sig_doppler_wiped); - if (d_nco_in != NULL) volk_free(d_nco_in); for (int n = 0; n < d_n_correlators; n++) { volk_free(d_local_codes_resampled[n]); } - delete d_local_codes_resampled; + volk_free(d_local_codes_resampled); return true; } diff --git a/src/algorithms/tracking/libs/cpu_multicorrelator.h b/src/algorithms/tracking/libs/cpu_multicorrelator.h index 2c8c9667c..2174595b9 100644 --- a/src/algorithms/tracking/libs/cpu_multicorrelator.h +++ b/src/algorithms/tracking/libs/cpu_multicorrelator.h @@ -45,27 +45,23 @@ class cpu_multicorrelator { public: cpu_multicorrelator(); + ~cpu_multicorrelator(); bool init(int max_signal_length_samples, int n_correlators); bool set_local_code_and_taps(int code_length_chips, const std::complex* local_code_in, float *shifts_chips); bool 
set_input_output_vectors(std::complex* corr_out, const std::complex* sig_in); void update_local_code(int correlator_length_samples, float rem_code_phase_chips, float code_phase_step_chips); - void update_local_carrier(int correlator_length_samples, float rem_carr_phase_rad, float phase_step_rad); bool Carrier_wipeoff_multicorrelator_resampler(float rem_carrier_phase_in_rad, float phase_step_rad, float rem_code_phase_chips, float code_phase_step_chips, int signal_length_samples); bool free(); private: // Allocate the device input vectors const std::complex *d_sig_in; - std::complex *d_nco_in; std::complex **d_local_codes_resampled; - std::complex *d_sig_doppler_wiped; const std::complex *d_local_code_in; std::complex *d_corr_out; float *d_shifts_chips; int d_code_length_chips; int d_n_correlators; - bool update_local_code(); - bool update_local_carrier(); }; diff --git a/src/algorithms/tracking/libs/cpu_multicorrelator_16sc.cc b/src/algorithms/tracking/libs/cpu_multicorrelator_16sc.cc index 8cbc574dd..6af61754a 100644 --- a/src/algorithms/tracking/libs/cpu_multicorrelator_16sc.cc +++ b/src/algorithms/tracking/libs/cpu_multicorrelator_16sc.cc @@ -44,14 +44,10 @@ bool cpu_multicorrelator_16sc::init( // ALLOCATE MEMORY FOR INTERNAL vectors size_t size = max_signal_length_samples * sizeof(lv_16sc_t); - // NCO signal (not needed if the rotator+dot_product kernel is used) - //d_nco_in = static_cast(volk_gnsssdr_malloc(size, volk_gnsssdr_get_alignment())); - // Doppler-free signal (not needed if the rotator+dot_product kernel is used) - //d_sig_doppler_wiped = static_cast(volk_gnsssdr_malloc(size, volk_gnsssdr_get_alignment())); - d_n_correlators = n_correlators; - d_tmp_code_phases_chips = static_cast(volk_gnsssdr_malloc(n_correlators*sizeof(float), volk_gnsssdr_get_alignment())); - d_local_codes_resampled = new lv_16sc_t*[n_correlators]; + d_tmp_code_phases_chips = static_cast(volk_gnsssdr_malloc(n_correlators * sizeof(float), volk_gnsssdr_get_alignment())); + + d_local_codes_resampled = static_cast(volk_gnsssdr_malloc(n_correlators * sizeof(lv_16sc_t), volk_gnsssdr_get_alignment())); for (int n = 0; n < n_correlators; n++) { d_local_codes_resampled[n] = static_cast(volk_gnsssdr_malloc(size, volk_gnsssdr_get_alignment())); @@ -81,21 +77,21 @@ bool cpu_multicorrelator_16sc::set_input_output_vectors(lv_16sc_t* corr_out, con return true; } -void cpu_multicorrelator_16sc::update_local_code(int correlator_length_samples,float rem_code_phase_chips, float code_phase_step_chips) + +void cpu_multicorrelator_16sc::update_local_code(int correlator_length_samples, float rem_code_phase_chips, float code_phase_step_chips) { + for (int n = 0; n < d_n_correlators; n++) + { + d_tmp_code_phases_chips[n] = d_shifts_chips[n] - rem_code_phase_chips; + } - for (int n = 0; n < d_n_correlators; n++) - { - d_tmp_code_phases_chips[n] = d_shifts_chips[n] - rem_code_phase_chips; - } - - volk_gnsssdr_16ic_xn_resampler_16ic_xn(d_local_codes_resampled, - d_local_code_in, - d_tmp_code_phases_chips, - code_phase_step_chips, - correlator_length_samples, - d_n_correlators, - d_code_length_chips); + volk_gnsssdr_16ic_xn_resampler_16ic_xn(d_local_codes_resampled, + d_local_code_in, + d_tmp_code_phases_chips, + code_phase_step_chips, + correlator_length_samples, + d_n_correlators, + d_code_length_chips); } @@ -107,13 +103,11 @@ bool cpu_multicorrelator_16sc::Carrier_wipeoff_multicorrelator_resampler( int signal_length_samples) { update_local_code(signal_length_samples, rem_code_phase_chips, code_phase_step_chips); + // Regenerate 
phase at each call in order to avoid numerical issues lv_32fc_t phase_offset_as_complex[1]; phase_offset_as_complex[0] = lv_cmake(std::cos(rem_carrier_phase_in_rad), -std::sin(rem_carrier_phase_in_rad)); - //replaced by integrated rotator + dot_product kernel - //volk_gnsssdr_16ic_s32fc_x2_rotator_16ic(d_sig_doppler_wiped, d_sig_in, std::exp(lv_32fc_t(0, -phase_step_rad)), phase_offset_as_complex, signal_length_samples); - //volk_gnsssdr_16ic_x2_dot_prod_16ic_xn(d_corr_out, d_sig_doppler_wiped, (const lv_16sc_t**)d_local_codes_resampled, d_n_correlators, signal_length_samples); - volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn(d_corr_out, d_sig_in, std::exp(lv_32fc_t(0, -phase_step_rad)), phase_offset_as_complex, (const lv_16sc_t**)d_local_codes_resampled, d_n_correlators, signal_length_samples); - + // call VOLK_GNSSSDR kernel + volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn(d_corr_out, d_sig_in, std::exp(lv_32fc_t(0, -phase_step_rad)), phase_offset_as_complex, (const lv_16sc_t**)d_local_codes_resampled, d_n_correlators, signal_length_samples); return true; } @@ -121,8 +115,6 @@ bool cpu_multicorrelator_16sc::Carrier_wipeoff_multicorrelator_resampler( cpu_multicorrelator_16sc::cpu_multicorrelator_16sc() { d_sig_in = NULL; - //d_nco_in = NULL; - //d_sig_doppler_wiped = NULL; d_local_code_in = NULL; d_shifts_chips = NULL; d_corr_out = NULL; @@ -131,17 +123,25 @@ cpu_multicorrelator_16sc::cpu_multicorrelator_16sc() d_n_correlators = 0; } + +cpu_multicorrelator_16sc::~cpu_multicorrelator_16sc() +{ + if(d_local_codes_resampled != NULL) + { + cpu_multicorrelator_16sc::free(); + } +} + + bool cpu_multicorrelator_16sc::free() { // Free memory - //if (d_sig_doppler_wiped != NULL) volk_gnsssdr_free(d_sig_doppler_wiped); - //if (d_nco_in != NULL) volk_gnsssdr_free(d_nco_in); - if (d_tmp_code_phases_chips != NULL) volk_gnsssdr_free(d_tmp_code_phases_chips); + if (d_tmp_code_phases_chips != NULL) volk_gnsssdr_free(d_tmp_code_phases_chips); for (int n = 0; n < d_n_correlators; n++) { volk_gnsssdr_free(d_local_codes_resampled[n]); } - delete d_local_codes_resampled; + volk_gnsssdr_free(d_local_codes_resampled); return true; } diff --git a/src/algorithms/tracking/libs/cpu_multicorrelator_16sc.h b/src/algorithms/tracking/libs/cpu_multicorrelator_16sc.h index 6a6082168..280e2abe9 100644 --- a/src/algorithms/tracking/libs/cpu_multicorrelator_16sc.h +++ b/src/algorithms/tracking/libs/cpu_multicorrelator_16sc.h @@ -45,6 +45,7 @@ class cpu_multicorrelator_16sc { public: cpu_multicorrelator_16sc(); + ~cpu_multicorrelator_16sc(); bool init(int max_signal_length_samples, int n_correlators); bool set_local_code_and_taps(int code_length_chips, const lv_16sc_t* local_code_in, float *shifts_chips); bool set_input_output_vectors(lv_16sc_t* corr_out, const lv_16sc_t* sig_in); @@ -55,18 +56,14 @@ public: private: // Allocate the device input vectors const lv_16sc_t *d_sig_in; - //lv_16sc_t *d_nco_in; float *d_tmp_code_phases_chips; lv_16sc_t **d_local_codes_resampled; - //lv_16sc_t *d_sig_doppler_wiped; const lv_16sc_t *d_local_code_in; lv_16sc_t *d_corr_out; float *d_shifts_chips; int d_code_length_chips; int d_n_correlators; - bool update_local_code(); - bool update_local_carrier(); }; -#endif /* CPU_MULTICORRELATOR_H_ */ +#endif /* GNSS_SDR_CPU_MULTICORRELATOR_H_ */