From fe241c8b2d8375ba40771d4b1bba75e9247bab67 Mon Sep 17 00:00:00 2001 From: Carles Fernandez Date: Thu, 16 Nov 2017 20:54:04 +0100 Subject: [PATCH] clean kernel --- .../volk_gnsssdr_16ic_conjugate_16ic.h | 376 ++++++------------ 1 file changed, 116 insertions(+), 260 deletions(-) diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_conjugate_16ic.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_conjugate_16ic.h index 8745b40d3..5aae17266 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_conjugate_16ic.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_conjugate_16ic.h @@ -1,16 +1,16 @@ /*! * \file volk_gnsssdr_16ic_conjugate_16ic.h - * \brief VOLK_GNSSSDR kernel: calculates the conjugate of a 16 bits vector. + * \brief VOLK_GNSSSDR kernel: returns the conjugate of a 16 bits complex vector. * \authors * * VOLK_GNSSSDR kernel that calculates the conjugate of a - * 16 bits vector (8 bits the real part and 8 bits the imaginary part) + * 16 bits complex vector (16 bits the real part and 16 bits the imaginary part) * * ------------------------------------------------------------------------- * - * Copyright (C) 2010-2015 (see AUTHORS file for a list of contributors) + * Copyright (C) 2010-2017 (see AUTHORS file for a list of contributors) * * GNSS-SDR is a software defined Global Navigation * Satellite Systems receiver @@ -38,7 +38,7 @@ * * \b Overview * - * Takes the conjugate of a complex unsigned char vector. + * Takes the conjugate of a complex signed 16-bit integer vector. * * Dispatcher Prototype * \code @@ -60,6 +60,116 @@ #include +#ifdef LV_HAVE_GENERIC + +static inline void volk_gnsssdr_16ic_conjugate_16ic_generic(lv_16sc_t* cVector, const lv_16sc_t* aVector, unsigned int num_points) +{ + lv_16sc_t* cPtr = cVector; + const lv_16sc_t* aPtr = aVector; + unsigned int number; + + for(number = 0; number < num_points; number++) + { + *cPtr++ = lv_conj(*aPtr++); + } +} + +#endif /* LV_HAVE_GENERIC */ + + +#ifdef LV_HAVE_SSSE3 +#include + +static inline void volk_gnsssdr_16ic_conjugate_16ic_u_ssse3(lv_16sc_t* cVector, const lv_16sc_t* aVector, unsigned int num_points) +{ + const unsigned int sse_iters = num_points / 4; + unsigned int i; + lv_16sc_t* c = cVector; + const lv_16sc_t* a = aVector; + __m128i tmp; + + __m128i conjugator = _mm_setr_epi16(1, -1, 1, -1, 1, -1, 1, -1); + + for (i = 0; i < sse_iters; ++i) + { + tmp = _mm_lddqu_si128((__m128i*)a); + tmp = _mm_sign_epi16(tmp, conjugator); + _mm_storeu_si128((__m128i*)c, tmp); + a += 4; + c += 4; + } + + for (i = sse_iters * 4; i < num_points; ++i) + { + *c++ = lv_conj(*a++); + } +} + +#endif /* LV_HAVE_SSSE3 */ + + +#ifdef LV_HAVE_SSSE3 +#include + +static inline void volk_gnsssdr_16ic_conjugate_16ic_a_ssse3(lv_16sc_t* cVector, const lv_16sc_t* aVector, unsigned int num_points) +{ + const unsigned int sse_iters = num_points / 4; + unsigned int i; + lv_16sc_t* c = cVector; + const lv_16sc_t* a = aVector; + __m128i tmp; + __m128i conjugator = _mm_setr_epi16(1, -1, 1, -1, 1, -1, 1, -1); + + for (i = 0; i < sse_iters; ++i) + { + tmp = _mm_load_si128((__m128i*)a); + tmp = _mm_sign_epi16(tmp, conjugator); + _mm_store_si128((__m128i*)c, tmp); + a += 4; + c += 4; + } + + for (i = sse_iters * 4; i < num_points; ++i) + { + *c++ = lv_conj(*a++); + } +} + +#endif /* LV_HAVE_SSSE3 */ + + +#ifdef LV_HAVE_AVX2 +#include + +static inline void volk_gnsssdr_16ic_conjugate_16ic_a_avx2(lv_16sc_t* cVector, const lv_16sc_t* aVector, unsigned int num_points) +{ + const unsigned int avx2_iters = num_points / 8; + unsigned int i; + lv_16sc_t* c = cVector; + const lv_16sc_t* a = aVector; + + __m256i tmp; + __m256i conjugator = _mm256_setr_epi16(1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1); + + for (i = 0; i < avx2_iters; ++i) + { + tmp = _mm256_load_si256((__m256i*)a); + tmp = _mm256_sign_epi16(tmp, conjugator); + _mm256_store_si256((__m256i*)c, tmp); + + a += 8; + c += 8; + } + + for (i = avx2_iters * 8; i < num_points; ++i) + { + *c++ = lv_conj(*a++); + } +} + +#endif /* LV_HAVE_AVX2 */ + + #ifdef LV_HAVE_AVX2 #include @@ -90,260 +200,6 @@ static inline void volk_gnsssdr_16ic_conjugate_16ic_u_avx2(lv_16sc_t* cVector, c } #endif /* LV_HAVE_AVX2 */ - -//#ifdef LV_HAVE_AVX -//#include -//static inline void volk_gnsssdr_16ic_conjugate_16ic_u_avx(lv_16sc_t* cVector, const lv_16sc_t* aVector, unsigned int num_points) -//{ -// const unsigned int sse_iters = num_points / 8; -// unsigned int i; -// lv_16sc_t* c = cVector; -// const lv_16sc_t* a = aVector; -// -// __m256i tmp; -// //__m128i tmp128lo, tmp128hi; -// //__m256 conjugator1 = _mm256_castsi256_ps(_mm256_setr_epi16(0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF)); -// //__m128i conjugator2 = _mm_setr_epi16(0, -1, 0, -1, 0, -1, 0, -1); -// __m256i conjugator = _mm256_setr_ps(0, -1, 0, -1, 0, -1, 0, -1); -// -// for (i = 0; i < sse_iters; ++i) -// { -// tmp = _mm256_loadu_si256((__m256i*)a); -// tmp = _mm256_xor_si256(tmp, conjugator); -// //tmp128lo = _mm256_castsi256_si128(_mm256_castps_si256(tmp)); -// //tmp128lo = _mm_add_epi16(tmp128lo, conjugator2); -// //tmp128hi = _mm256_extractf128_si256(_mm256_castps_si256(tmp),1); -// //tmp128hi = _mm_add_epi16(tmp128hi, conjugator2); -// //tmp = _mm256_set_m128i(tmp128hi , tmp128lo); //not defined in some versions of immintrin.h -// //tmp = _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(tmp128lo),(tmp128hi),1)); -// _mm256_storeu_si256((__m256i*)c, tmp); -// -// a += 8; -// c += 8; -// } -// -// for (i = sse_iters * 8; i < num_points; ++i) -// { -// *c++ = lv_conj(*a++); -// } -//} -//#endif /* LV_HAVE_AVX */ - - -#ifdef LV_HAVE_SSSE3 -#include - -static inline void volk_gnsssdr_16ic_conjugate_16ic_u_ssse3(lv_16sc_t* cVector, const lv_16sc_t* aVector, unsigned int num_points) -{ - const unsigned int sse_iters = num_points / 4; - unsigned int i; - lv_16sc_t* c = cVector; - const lv_16sc_t* a = aVector; - __m128i tmp; - - __m128i conjugator = _mm_setr_epi16(1, -1, 1, -1, 1, -1, 1, -1); - - for (i = 0; i < sse_iters; ++i) - { - tmp = _mm_lddqu_si128((__m128i*)a); - tmp = _mm_sign_epi16(tmp, conjugator); - _mm_storeu_si128((__m128i*)c, tmp); - a += 4; - c += 4; - } - - for (i = sse_iters * 4; i < num_points; ++i) - { - *c++ = lv_conj(*a++); - } -} - -#endif /* LV_HAVE_SSSE3 */ -// -// -//#ifdef LV_HAVE_SSE3 -//#include -// -//static inline void volk_gnsssdr_16ic_conjugate_16ic_u_sse3(lv_16sc_t* cVector, const lv_16sc_t* aVector, unsigned int num_points) -//{ -// const unsigned int sse_iters = num_points / 8; -// unsigned int i; -// lv_16sc_t* c = cVector; -// const lv_16sc_t* a = aVector; -// __m128i tmp; -// -// __m128i conjugator1 = _mm_setr_epi8(0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF); -// __m128i conjugator2 = _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1); -// -// for (i = 0; i < sse_iters; ++i) -// { -// tmp = _mm_lddqu_si128((__m128i*)a); -// tmp = _mm_xor_si128(tmp, conjugator1); -// tmp = _mm_add_epi8(tmp, conjugator2); -// _mm_storeu_si128((__m128i*)c, tmp); -// a += 8; -// c += 8; -// } -// -// for (i = sse_iters * 8; i < num_points; ++i) -// { -// *c++ = lv_conj(*a++); -// } -// -//} -//#endif /* LV_HAVE_SSE3 */ - - -#ifdef LV_HAVE_GENERIC - -static inline void volk_gnsssdr_16ic_conjugate_16ic_generic(lv_16sc_t* cVector, const lv_16sc_t* aVector, unsigned int num_points) -{ - lv_16sc_t* cPtr = cVector; - const lv_16sc_t* aPtr = aVector; - unsigned int number; - - for(number = 0; number < num_points; number++) - { - *cPtr++ = lv_conj(*aPtr++); - } -} -#endif /* LV_HAVE_GENERIC */ - - -//#ifdef LV_HAVE_AVX -//#include -// -//static inline void volk_gnsssdr_16ic_conjugate_16ic_a_avx(lv_16sc_t* cVector, const lv_16sc_t* aVector, unsigned int num_points) -//{ -// const unsigned int sse_iters = num_points / 16; -// unsigned int i; -// lv_16sc_t* c = cVector; -// const lv_16sc_t* a = aVector; -// -// __m256 tmp; -// __m128i tmp128lo, tmp128hi; -// __m256 conjugator1 = _mm256_castsi256_ps(_mm256_setr_epi8(0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF)); -// __m128i conjugator2 = _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1); -// -// for (i = 0; i < sse_iters; ++i) -// { -// tmp = _mm256_load_ps((float*)a); -// tmp = _mm256_xor_ps(tmp, conjugator1); -// tmp128lo = _mm256_castsi256_si128(_mm256_castps_si256(tmp)); -// tmp128lo = _mm_add_epi8(tmp128lo, conjugator2); -// tmp128hi = _mm256_extractf128_si256(_mm256_castps_si256(tmp),1); -// tmp128hi = _mm_add_epi8(tmp128hi, conjugator2); -// //tmp = _mm256_set_m128i(tmp128hi , tmp128lo); //not defined in some versions of immintrin.h -// tmp = _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(tmp128lo),(tmp128hi),1)); -// _mm256_store_ps((float*)c, tmp); -// -// a += 16; -// c += 16; -// } -// -// for (i = sse_iters * 16; i < num_points; ++i) -// { -// *c++ = lv_conj(*a++); -// } -//} -//#endif /* LV_HAVE_AVX */ -// -// -#ifdef LV_HAVE_AVX2 -#include - -static inline void volk_gnsssdr_16ic_conjugate_16ic_a_avx2(lv_16sc_t* cVector, const lv_16sc_t* aVector, unsigned int num_points) -{ - const unsigned int avx2_iters = num_points / 8; - unsigned int i; - lv_16sc_t* c = cVector; - const lv_16sc_t* a = aVector; - - __m256i tmp; - __m256i conjugator = _mm256_setr_epi16(1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1); - - for (i = 0; i < avx2_iters; ++i) - { - tmp = _mm256_load_si256((__m256i*)a); - tmp = _mm256_sign_epi16(tmp, conjugator); - _mm256_store_si256((__m256i*)c, tmp); - - a += 8; - c += 8; - } - - for (i = avx2_iters * 8; i < num_points; ++i) - { - *c++ = lv_conj(*a++); - } -} - -#endif /* LV_HAVE_AVX2 */ -// -// -#ifdef LV_HAVE_SSSE3 -#include - -static inline void volk_gnsssdr_16ic_conjugate_16ic_a_ssse3(lv_16sc_t* cVector, const lv_16sc_t* aVector, unsigned int num_points) -{ - const unsigned int sse_iters = num_points / 4; - unsigned int i; - lv_16sc_t* c = cVector; - const lv_16sc_t* a = aVector; - __m128i tmp; - __m128i conjugator = _mm_setr_epi16(1, -1, 1, -1, 1, -1, 1, -1); - - for (i = 0; i < sse_iters; ++i) - { - tmp = _mm_load_si128((__m128i*)a); - tmp = _mm_sign_epi16(tmp, conjugator); - _mm_store_si128((__m128i*)c, tmp); - a += 4; - c += 4; - } - - for (i = sse_iters * 4; i < num_points; ++i) - { - *c++ = lv_conj(*a++); - } -} - -#endif /* LV_HAVE_SSSE3 */ -// -// -//#ifdef LV_HAVE_SSE3 -//#include -// -//static inline void volk_gnsssdr_16ic_conjugate_16ic_a_sse3(lv_16sc_t* cVector, const lv_16sc_t* aVector, unsigned int num_points) -//{ -// const unsigned int sse_iters = num_points / 8; -// unsigned int i; -// lv_16sc_t* c = cVector; -// const lv_16sc_t* a = aVector; -// __m128i tmp; -// -// __m128i conjugator1 = _mm_setr_epi8(0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF); -// __m128i conjugator2 = _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1); -// -// for (i = 0; i < sse_iters; ++i) -// { -// tmp = _mm_load_si128((__m128i*)a); -// tmp = _mm_xor_si128(tmp, conjugator1); -// tmp = _mm_add_epi8(tmp, conjugator2); -// _mm_store_si128((__m128i*)c, tmp); -// a += 8; -// c += 8; -// } -// -// for (i = sse_iters * 8; i < num_points; ++i) -// { -// *c++ = lv_conj(*a++); -// } -// -//} -//#endif /* LV_HAVE_SSE3 */ -// -// // // //#ifdef LV_HAVE_NEON @@ -360,7 +216,7 @@ static inline void volk_gnsssdr_16ic_conjugate_16ic_a_ssse3(lv_16sc_t* cVector, // for (i = 0; i < sse_iters; ++i) // { // a_val = vld2_s16((const int16_t*)a); -// __VOLK_GNSSSDR_PREFETCH(a + 16); +// __VOLK_GNSSSDR_PREFETCH(a + 4); // a_val.val[1] = vneg_s16(a_val.val[1]); // vst2_s16((int16_t*)c, a_val); // a += 4;