mirror of
				https://github.com/gnss-sdr/gnss-sdr
				synced 2025-10-26 13:07:39 +00:00 
			
		
		
		
	introducing new kernels
This commit is contained in:
		| @@ -0,0 +1,170 @@ | ||||
| /*! | ||||
|  * \file volk_gnsssdr_16ic_resampler_16ic.h | ||||
|  * \brief Volk protokernel: resample a 16 bits complex vector | ||||
|  * \authors <ul> | ||||
|  *          <li> Javier Arribas, 2015. jarribas(at)cttc.es | ||||
|  *          </ul> | ||||
|  * | ||||
|  * Volk protokernel that multiplies two 16 bits vectors (8 bits the real part  | ||||
|  * and 8 bits the imaginary part) and accumulates them | ||||
|  * | ||||
|  * ------------------------------------------------------------------------- | ||||
|  * | ||||
|  * Copyright (C) 2010-2015  (see AUTHORS file for a list of contributors) | ||||
|  * | ||||
|  * GNSS-SDR is a software defined Global Navigation | ||||
|  *          Satellite Systems receiver | ||||
|  * | ||||
|  * This file is part of GNSS-SDR. | ||||
|  * | ||||
|  * GNSS-SDR is free software: you can redistribute it and/or modify | ||||
|  * it under the terms of the GNU General Public License as published by | ||||
|  * the Free Software Foundation, either version 3 of the License, or | ||||
|  * (at your option) any later version. | ||||
|  * | ||||
|  * GNSS-SDR is distributed in the hope that it will be useful, | ||||
|  * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
|  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||
|  * GNU General Public License for more details. | ||||
|  * | ||||
|  * You should have received a copy of the GNU General Public License | ||||
|  * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>. | ||||
|  * | ||||
|  * ------------------------------------------------------------------------- | ||||
|  */ | ||||
|  | ||||
| #ifndef INCLUDED_volk_gnsssdr_16ic_resampler_16ic_a_H | ||||
| #define INCLUDED_volk_gnsssdr_16ic_resampler_16ic_a_H | ||||
|  | ||||
| #include <math.h> | ||||
| #include <volk_gnsssdr/volk_gnsssdr_common.h> | ||||
| #include <volk_gnsssdr/volk_gnsssdr_complex.h> | ||||
| #include <volk_gnsssdr/saturated_arithmetic.h> | ||||
|  | ||||
| //#pragma STDC FENV_ACCESS ON | ||||
|  | ||||
| #ifdef LV_HAVE_GENERIC | ||||
|  | ||||
| //int round_int( float r ) { | ||||
| //    return (r > 0.0) ? (r + 0.5) : (r - 0.5); | ||||
| //} | ||||
| /*! | ||||
|  \brief Multiplies the two input complex vectors, point-by-point, storing the result in the third vector | ||||
|  \param cVector The vector where the result will be stored | ||||
|  \param aVector One of the vectors to be multiplied | ||||
|  \param bVector One of the vectors to be multiplied | ||||
|  \param num_points The number of complex values in aVector and bVector to be multiplied together, accumulated and stored into cVector | ||||
|  */ | ||||
| static inline void volk_gnsssdr_16ic_resampler_16ic_generic(lv_16sc_t* result, const lv_16sc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, unsigned int num_output_samples, unsigned int code_length_chips) | ||||
| { | ||||
|     int local_code_chip_index; | ||||
|     //fesetround(FE_TONEAREST); | ||||
|     for (unsigned int n = 0; n < num_output_samples; n++) | ||||
|         { | ||||
|             // resample code for current tap | ||||
|             local_code_chip_index = round(code_phase_step_chips * (float)n + rem_code_phase_chips - 0.5f); | ||||
|             if (local_code_chip_index < 0.0) local_code_chip_index += code_length_chips; | ||||
|             if (local_code_chip_index > (code_length_chips-1)) local_code_chip_index -= code_length_chips; | ||||
|             //std::cout<<"g["<<n<<"]="<<code_phase_step_chips*static_cast<float>(n) + rem_code_phase_chips-0.5f<<","<<local_code_chip_index<<" "; | ||||
|             result[n] = local_code[local_code_chip_index]; | ||||
|         } | ||||
|     //std::cout<<std::endl; | ||||
| } | ||||
|  | ||||
| #endif /*LV_HAVE_GENERIC*/ | ||||
|  | ||||
|  | ||||
| #ifdef LV_HAVE_SSE2 | ||||
| #include <emmintrin.h> | ||||
| static inline void volk_gnsssdr_16ic_resampler_16ic_sse2(lv_16sc_t* result, const lv_16sc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, unsigned int num_output_samples, unsigned int code_length_chips)//, int* scratch_buffer, float* scratch_buffer_float) | ||||
| { | ||||
|     _MM_SET_ROUNDING_MODE (_MM_ROUND_NEAREST);//_MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO | ||||
|     unsigned int number; | ||||
|     const unsigned int quarterPoints = num_output_samples / 4; | ||||
|  | ||||
|     lv_16sc_t* _result = result; | ||||
|  | ||||
|     __attribute__((aligned(16))) int local_code_chip_index[4]; | ||||
|     __m128 _rem_code_phase, _code_phase_step_chips; | ||||
|     __m128i _code_length_chips, _code_length_chips_minus1; | ||||
|     __m128 _code_phase_out, _code_phase_out_with_offset; | ||||
|     rem_code_phase_chips = rem_code_phase_chips - 0.5f; | ||||
|  | ||||
|     _rem_code_phase =  _mm_load1_ps(&rem_code_phase_chips); //load float to all four float values in m128 register | ||||
|     _code_phase_step_chips = _mm_load1_ps(&code_phase_step_chips); //load float to all four float values in m128 register | ||||
|     __attribute__((aligned(16))) int four_times_code_length_chips_minus1[4]; | ||||
|     four_times_code_length_chips_minus1[0] = code_length_chips-1; | ||||
|     four_times_code_length_chips_minus1[1] = code_length_chips-1; | ||||
|     four_times_code_length_chips_minus1[2] = code_length_chips-1; | ||||
|     four_times_code_length_chips_minus1[3] = code_length_chips-1; | ||||
|  | ||||
|     __attribute__((aligned(16))) int four_times_code_length_chips[4]; | ||||
|     four_times_code_length_chips[0] = code_length_chips; | ||||
|     four_times_code_length_chips[1] = code_length_chips; | ||||
|     four_times_code_length_chips[2] = code_length_chips; | ||||
|     four_times_code_length_chips[3] = code_length_chips; | ||||
|  | ||||
|     _code_length_chips = _mm_loadu_si128((__m128i*)&four_times_code_length_chips); //load float to all four float values in m128 register | ||||
|     _code_length_chips_minus1 = _mm_loadu_si128((__m128i*)&four_times_code_length_chips_minus1); //load float to all four float values in m128 register | ||||
|  | ||||
|     __m128i negative_indexes, overflow_indexes, _code_phase_out_int, _code_phase_out_int_neg, _code_phase_out_int_over; | ||||
|  | ||||
|     __m128i zero = _mm_setzero_si128(); | ||||
|  | ||||
|     __attribute__((aligned(16))) float init_idx_float[4] = { 0.0f, 1.0f, 2.0f, 3.0f }; | ||||
|     __m128 _4output_index = _mm_load_ps(init_idx_float); | ||||
|     __attribute__((aligned(16))) float init_4constant_float[4] = { 4.0f, 4.0f, 4.0f, 4.0f }; | ||||
|     __m128 _4constant_float = _mm_load_ps(init_4constant_float); | ||||
|  | ||||
|     //__attribute__((aligned(16))) int output_indexes[4]; | ||||
|  | ||||
|     for(number = 0; number < quarterPoints; number++) | ||||
|         { | ||||
|             _code_phase_out = _mm_mul_ps(_code_phase_step_chips, _4output_index); //compute the code phase point with the phase step | ||||
|             _code_phase_out_with_offset = _mm_add_ps(_code_phase_out, _rem_code_phase); //add the phase offset | ||||
|             _code_phase_out_int = _mm_cvtps_epi32(_code_phase_out_with_offset); //convert to integer | ||||
|  | ||||
|             negative_indexes = _mm_cmplt_epi32(_code_phase_out_int, zero); //test for negative values | ||||
|             _code_phase_out_int_neg = _mm_add_epi32(_code_phase_out_int, _code_length_chips); //the negative values branch | ||||
|             //_code_phase_out_int_over=_mm_or_si128(_mm_and_si128(_code_phase_out_int_neg,_code_phase_out_int),_mm_andnot_si128(negative_indexes,_code_phase_out_int)); | ||||
|             _code_phase_out_int_neg = _mm_xor_si128(_code_phase_out_int, _mm_and_si128( negative_indexes, _mm_xor_si128( _code_phase_out_int_neg, _code_phase_out_int ))); | ||||
|  | ||||
|             overflow_indexes = _mm_cmpgt_epi32(_code_phase_out_int_neg, _code_length_chips_minus1); //test for overflow values | ||||
|             _code_phase_out_int_over = _mm_sub_epi32(_code_phase_out_int_neg, _code_length_chips); //the negative values branch | ||||
|             _code_phase_out_int_over = _mm_xor_si128(_code_phase_out_int_neg, _mm_and_si128( overflow_indexes, _mm_xor_si128( _code_phase_out_int_over, _code_phase_out_int_neg ))); | ||||
|  | ||||
|             _mm_storeu_si128((__m128i*)local_code_chip_index, _code_phase_out_int_over); // Store the results back | ||||
|  | ||||
|             //_mm_store_ps((float*)_scratch_buffer_float,_code_phase_out_with_offset); | ||||
|  | ||||
|             //todo: optimize the local code lookup table with intrinsics, if possible | ||||
|             *_result++ = local_code[local_code_chip_index[0]]; | ||||
|             *_result++ = local_code[local_code_chip_index[1]]; | ||||
|             *_result++ = local_code[local_code_chip_index[2]]; | ||||
|             *_result++ = local_code[local_code_chip_index[3]]; | ||||
|  | ||||
|             _4output_index = _mm_add_ps(_4output_index, _4constant_float); | ||||
|             //_scratch_buffer_float+=4; | ||||
|  | ||||
|         } | ||||
|  | ||||
|     for(number = quarterPoints * 4; number < num_output_samples; number++) | ||||
|         { | ||||
|             local_code_chip_index[0] = (int)(code_phase_step_chips * (float)number + rem_code_phase_chips + 0.5f); | ||||
|             if (local_code_chip_index[0] < 0.0) local_code_chip_index[0] += code_length_chips - 1; | ||||
|             if (local_code_chip_index[0] > (code_length_chips - 1)) local_code_chip_index[0] -= code_length_chips; | ||||
|             *_result++ = local_code[local_code_chip_index[0]]; | ||||
|             //*_scratch_buffer_float++=code_phase_step_chips*static_cast<float>(number)+rem_code_phase_chips; | ||||
|         } | ||||
|  | ||||
|     //    for(unsigned int n=0;n<num_output_samples;n++) | ||||
|     //    { | ||||
|     // | ||||
|     //		std::cout<<"s["<<n<<"]="<<scratch_buffer_float[n]<<","<<scratch_buffer[n]<<" "; | ||||
|     //    } | ||||
|     //    std::cout<<std::endl; | ||||
| } | ||||
|  | ||||
| #endif /* LV_HAVE_SSE2 */ | ||||
|  | ||||
| #endif /*INCLUDED_volk_gnsssdr_16ic_resampler_16ic_a_H*/ | ||||
| @@ -0,0 +1,152 @@ | ||||
| /*! | ||||
|  * \file volk_gnsssdr_16ic_x2_dot_prod_16ic.h | ||||
|  * \brief Volk protokernel: multiplies two 16 bits vectors and accumulates them | ||||
|  * \authors <ul> | ||||
|  *          <li> Javier Arribas, 2015. jarribas(at)cttc.es | ||||
|  *          </ul> | ||||
|  * | ||||
|  * Volk protokernel that multiplies two 16 bits vectors (8 bits the real part  | ||||
|  * and 8 bits the imaginary part) and accumulates them | ||||
|  * | ||||
|  * ------------------------------------------------------------------------- | ||||
|  * | ||||
|  * Copyright (C) 2010-2015  (see AUTHORS file for a list of contributors) | ||||
|  * | ||||
|  * GNSS-SDR is a software defined Global Navigation | ||||
|  *          Satellite Systems receiver | ||||
|  * | ||||
|  * This file is part of GNSS-SDR. | ||||
|  * | ||||
|  * GNSS-SDR is free software: you can redistribute it and/or modify | ||||
|  * it under the terms of the GNU General Public License as published by | ||||
|  * the Free Software Foundation, either version 3 of the License, or | ||||
|  * (at your option) any later version. | ||||
|  * | ||||
|  * GNSS-SDR is distributed in the hope that it will be useful, | ||||
|  * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
|  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||
|  * GNU General Public License for more details. | ||||
|  * | ||||
|  * You should have received a copy of the GNU General Public License | ||||
|  * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>. | ||||
|  * | ||||
|  * ------------------------------------------------------------------------- | ||||
|  */ | ||||
|  | ||||
| #ifndef INCLUDED_volk_gnsssdr_16ic_x2_dot_prod_16ic_u_H | ||||
| #define INCLUDED_volk_gnsssdr_16ic_x2_dot_prod_16ic_u_H | ||||
|  | ||||
| #include <inttypes.h> | ||||
| #include <stdio.h> | ||||
| #include <string.h> | ||||
| #include <volk_gnsssdr/volk_gnsssdr_common.h> | ||||
| #include <volk_gnsssdr/volk_gnsssdr_complex.h> | ||||
| #include <volk_gnsssdr/saturated_arithmetic.h> | ||||
|  | ||||
|  | ||||
| #ifdef LV_HAVE_GENERIC | ||||
| /*! | ||||
|  \brief Multiplies the two input complex vectors and accumulates them, storing the result in the third vector | ||||
|  \param cVector The vector where the accumulated result will be stored | ||||
|  \param aVector One of the vectors to be multiplied and accumulated | ||||
|  \param bVector One of the vectors to be multiplied and accumulated | ||||
|  \param num_points The number of complex values in aVector and bVector to be multiplied together, accumulated and stored into cVector | ||||
|  */ | ||||
| static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_generic(lv_16sc_t* result, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points) | ||||
| { | ||||
|     result[0] = lv_cmake((int16_t)0, (int16_t)0); | ||||
|     for (unsigned int n = 0; n < num_points; n++) | ||||
|         { | ||||
|             //r*a.r - i*a.i, i*a.r + r*a.i | ||||
|             //result[0]+=in_a[n]*in_b[n]; | ||||
|             lv_16sc_t tmp = in_a[n] * in_b[n]; | ||||
|             result[0] = lv_cmake(sat_adds16b(lv_creal(result[0]), lv_creal(tmp)), sat_adds16b(lv_cimag(result[0]), lv_cimag(tmp) )); | ||||
|             //result[0].real(sat_adds16b(result[0].real(),lv_creal(tmp))); | ||||
|             //result[0].imag(sat_adds16b(result[0].imag(),tmp.imag())); | ||||
|         } | ||||
| } | ||||
|  | ||||
| #endif /*LV_HAVE_GENERIC*/ | ||||
|  | ||||
|  | ||||
| #ifdef LV_HAVE_SSE2 | ||||
| #include <emmintrin.h> | ||||
| static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_a_sse2(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points) | ||||
| { | ||||
|  | ||||
|     lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0); | ||||
|  | ||||
|     const unsigned int sse_iters = num_points / 4; | ||||
|  | ||||
|     const lv_16sc_t* _in_a = in_a; | ||||
|     const lv_16sc_t* _in_b = in_b; | ||||
|     lv_16sc_t* _out = out; | ||||
|  | ||||
|     if (sse_iters > 0) | ||||
|         { | ||||
|             __m128i a,b,c, c_sr, mask_imag, mask_real, real, imag, imag1,imag2, b_sl, a_sl, realcacc, imagcacc, result; | ||||
|  | ||||
|             realcacc = _mm_setzero_si128(); | ||||
|             imagcacc = _mm_setzero_si128(); | ||||
|  | ||||
|             mask_imag = _mm_set_epi8(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0); | ||||
|             mask_real = _mm_set_epi8(0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255); | ||||
|  | ||||
|             for(unsigned int number = 0; number < sse_iters; number++) | ||||
|                 { | ||||
|                     //std::complex<T> memory structure: real part -> reinterpret_cast<cv T*>(a)[2*i] | ||||
|                     //imaginery part -> reinterpret_cast<cv T*>(a)[2*i + 1] | ||||
|                     // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r] | ||||
|                     a = _mm_loadu_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg | ||||
|                     b = _mm_loadu_si128((__m128i*)_in_b); | ||||
|                     c = _mm_mullo_epi16 (a, b); // a3.i*b3.i, a3.r*b3.r, .... | ||||
|  | ||||
|                     c_sr = _mm_srli_si128 (c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. | ||||
|                     real = _mm_subs_epi16 (c,c_sr); | ||||
|  | ||||
|                     b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i .... | ||||
|                     a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i .... | ||||
|  | ||||
|                     imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... | ||||
|                     imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... | ||||
|  | ||||
|                     imag = _mm_adds_epi16(imag1, imag2); //with saturation aritmetic! | ||||
|  | ||||
|                     realcacc = _mm_adds_epi16 (realcacc, real); | ||||
|                     imagcacc = _mm_adds_epi16 (imagcacc, imag); | ||||
|  | ||||
|                     _in_a += 4; | ||||
|                     _in_b += 4; | ||||
|                 } | ||||
|  | ||||
|             realcacc = _mm_and_si128 (realcacc, mask_real); | ||||
|             imagcacc = _mm_and_si128 (imagcacc, mask_imag); | ||||
|  | ||||
|             result = _mm_or_si128 (realcacc, imagcacc); | ||||
|  | ||||
|             __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; | ||||
|  | ||||
|             _mm_storeu_si128((__m128i*)dotProductVector,result); // Store the results back into the dot product vector | ||||
|  | ||||
|             for (int i = 0; i < 4; ++i) | ||||
|                 { | ||||
|                     dotProduct = lv_cmake(sat_adds16b(lv_creal(dotProduct), lv_creal(dotProductVector[i])), sat_adds16b(lv_cimag(dotProduct), lv_cimag(dotProductVector[i]))); | ||||
|                     //dotProduct.real(sat_adds16b(lv_creal(dotProduct),lv_creal(dotProductVector[i]))); | ||||
|                     //dotProduct.imag(sat_adds16b(lv_cimag(dotProduct),lv_cimag(dotProductVector[i]))); | ||||
|                 } | ||||
|         } | ||||
|  | ||||
|     for (unsigned int i = 0; i < (num_points % 4); ++i) | ||||
|         { | ||||
|             //dotProduct += (*_in_a++) * (*_in_b++); | ||||
|             lv_16sc_t tmp = (*_in_a++) * (*_in_b++); | ||||
|             dotProduct = lv_cmake( sat_adds16b(lv_creal(dotProduct), lv_creal(tmp)), sat_adds16b(lv_cimag(dotProduct), lv_cimag(tmp))); | ||||
|             //dotProduct.real(sat_adds16b(lv_creal(dotProduct),lv_creal(tmp))); | ||||
|             //dotProduct.imag(sat_adds16b(lv_cimag(dotProduct),lv_cimag(tmp))); | ||||
|         } | ||||
|  | ||||
|     *_out = dotProduct; | ||||
| } | ||||
| #endif /* LV_HAVE_SSE2 */ | ||||
|  | ||||
| #endif /*INCLUDED_volk_gnsssdr_16ic_x2_dot_prod_16ic_u_H*/ | ||||
| @@ -79,6 +79,8 @@ std::vector<volk_gnsssdr_test_case_t> init_test_list(volk_gnsssdr_test_params_t | ||||
|         (VOLK_INIT_TEST(volk_gnsssdr_64f_accumulator_64f, test_params)) | ||||
|         (VOLK_INIT_TEST(volk_gnsssdr_32fc_convert_8ic, test_params)) | ||||
|         (VOLK_INIT_TEST(volk_gnsssdr_32fc_convert_16ic, test_params)) | ||||
|         (VOLK_INIT_TEST(volk_gnsssdr_16ic_x2_dot_prod_16ic, test_params)) | ||||
|         (VOLK_INIT_TEST(volk_gnsssdr_16ic_resampler_16ic, test_params)) | ||||
|         ; | ||||
|  | ||||
|     return test_cases; | ||||
|   | ||||
| @@ -170,6 +170,14 @@ typedef void (*volk_gnsssdr_fn_1arg_s8ic)(void *, lv_8sc_t, unsigned int, const | ||||
| typedef void (*volk_gnsssdr_fn_2arg_s8ic)(void *, void *, lv_8sc_t, unsigned int, const char*); | ||||
| typedef void (*volk_gnsssdr_fn_3arg_s8ic)(void *, void *, void *, lv_8sc_t, unsigned int, const char*); | ||||
|  | ||||
| //typedef void (*volk_gnsssdr_fn_1arg_s16i)(void *, int16_t, unsigned int, const char*); //one input vector, one scalar int16_t input | ||||
| //typedef void (*volk_gnsssdr_fn_2arg_s16i)(void *, void *, int16_t, unsigned int, const char*); | ||||
| //typedef void (*volk_gnsssdr_fn_3arg_s16i)(void *, void *, void *, int16_t, unsigned int, const char*); | ||||
| //typedef void (*volk_gnsssdr_fn_1arg_s16ic)(void *, lv_16sc_t, unsigned int, const char*); //one input vector, one scalar lv_16sc_t vector input | ||||
| //typedef void (*volk_gnsssdr_fn_2arg_s16ic)(void *, void *, lv_16sc_t, unsigned int, const char*); | ||||
| //typedef void (*volk_gnsssdr_fn_3arg_s16ic)(void *, void *, void *, lv_16sc_t, unsigned int, const char*); | ||||
| typedef void (*volk_gnsssdr_fn_6arg_s16ic)(void *, void *, void *, void *, void *, void *, lv_16sc_t, unsigned int, const char*); | ||||
|  | ||||
| typedef void (*volk_gnsssdr_fn_8arg)(void *, void *, void *, void *, void *, void *, void *, void *, unsigned int, const char*); | ||||
| typedef void (*volk_gnsssdr_fn_8arg_s32f)(void *, void *, void *, void *, void *, void *, void *, void *, float, unsigned int, const char*); | ||||
| typedef void (*volk_gnsssdr_fn_8arg_s32fc)(void *, void *, void *, void *, void *, void *, void *, void *, lv_32fc_t, unsigned int, const char*); | ||||
|   | ||||
| @@ -36,11 +36,14 @@ | ||||
| #include <iostream> | ||||
| #include <gnuradio/fxpt.h>  // fixed point sine and cosine | ||||
|  | ||||
| #include "volk_gnsssdr/volk_gnsssdr.h" | ||||
|  | ||||
| #define LV_HAVE_GENERIC | ||||
| #define LV_HAVE_SSE2 | ||||
| #include "volk_gnsssdr_16ic_x2_dot_prod_16ic.h" | ||||
|  | ||||
| //#include "volk_gnsssdr_16ic_x2_dot_prod_16ic.h" | ||||
| #include "volk_gnsssdr_16ic_x2_multiply_16ic.h" | ||||
| #include "volk_gnsssdr_16ic_resampler_16ic.h" | ||||
| //#include "volk_gnsssdr_16ic_resampler_16ic.h" | ||||
| #include "volk_gnsssdr_16ic_xn_resampler_16ic_xn.h" | ||||
| #include "volk_gnsssdr_16ic_xn_dot_prod_16ic_xn.h" | ||||
|  | ||||
|   | ||||
| @@ -1,171 +0,0 @@ | ||||
| /*! | ||||
|  * \file volk_gnsssdr_16ic_resampler_16ic.h | ||||
|  * \brief Volk protokernel: resample a 16 bits complex vector | ||||
|  * \authors <ul> | ||||
|  *          <li> Javier Arribas, 2015. jarribas(at)cttc.es | ||||
|  *          </ul> | ||||
|  * | ||||
|  * Volk protokernel that multiplies two 16 bits vectors (8 bits the real part  | ||||
|  * and 8 bits the imaginary part) and accumulates them | ||||
|  * | ||||
|  * ------------------------------------------------------------------------- | ||||
|  * | ||||
|  * Copyright (C) 2010-2015  (see AUTHORS file for a list of contributors) | ||||
|  * | ||||
|  * GNSS-SDR is a software defined Global Navigation | ||||
|  *          Satellite Systems receiver | ||||
|  * | ||||
|  * This file is part of GNSS-SDR. | ||||
|  * | ||||
|  * GNSS-SDR is free software: you can redistribute it and/or modify | ||||
|  * it under the terms of the GNU General Public License as published by | ||||
|  * the Free Software Foundation, either version 3 of the License, or | ||||
|  * (at your option) any later version. | ||||
|  * | ||||
|  * GNSS-SDR is distributed in the hope that it will be useful, | ||||
|  * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
|  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||
|  * GNU General Public License for more details. | ||||
|  * | ||||
|  * You should have received a copy of the GNU General Public License | ||||
|  * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>. | ||||
|  * | ||||
|  * ------------------------------------------------------------------------- | ||||
|  */ | ||||
|  | ||||
| #ifndef INCLUDED_volk_gnsssdr_16ic_resampler_16ic_a_H | ||||
| #define INCLUDED_volk_gnsssdr_16ic_resampler_16ic_a_H | ||||
|  | ||||
| #include <volk_gnsssdr/volk_gnsssdr_common.h> | ||||
| #include <volk_gnsssdr/volk_gnsssdr_complex.h> | ||||
| #include <cmath> | ||||
| //#pragma STDC FENV_ACCESS ON | ||||
|  | ||||
| #ifdef LV_HAVE_GENERIC | ||||
|  | ||||
| //int round_int( float r ) { | ||||
| //    return (r > 0.0) ? (r + 0.5) : (r - 0.5); | ||||
| //} | ||||
| /*! | ||||
|  \brief Multiplies the two input complex vectors, point-by-point, storing the result in the third vector | ||||
|  \param cVector The vector where the result will be stored | ||||
|  \param aVector One of the vectors to be multiplied | ||||
|  \param bVector One of the vectors to be multiplied | ||||
|  \param num_points The number of complex values in aVector and bVector to be multiplied together, accumulated and stored into cVector | ||||
|  */ | ||||
| static inline void volk_gnsssdr_16ic_resampler_16ic_generic(lv_16sc_t* result, const lv_16sc_t* local_code, float rem_code_phase_chips ,float code_phase_step_chips, unsigned int num_output_samples, unsigned int code_length_chips) | ||||
| { | ||||
|     int local_code_chip_index; | ||||
|     //fesetround(FE_TONEAREST); | ||||
|     for (unsigned int n = 0; n < num_output_samples; n++) | ||||
|         { | ||||
| 			// resample code for current tap | ||||
| 			local_code_chip_index = round(code_phase_step_chips*static_cast<float>(n) + rem_code_phase_chips-0.5f); | ||||
| 	    	if (local_code_chip_index < 0.0) local_code_chip_index += code_length_chips; | ||||
| 	    	if (local_code_chip_index > (code_length_chips-1)) local_code_chip_index -= code_length_chips; | ||||
| 			//std::cout<<"g["<<n<<"]="<<code_phase_step_chips*static_cast<float>(n) + rem_code_phase_chips-0.5f<<","<<local_code_chip_index<<" "; | ||||
| 			result[n] = local_code[local_code_chip_index]; | ||||
|         } | ||||
|     //std::cout<<std::endl; | ||||
| } | ||||
|  | ||||
| #endif /*LV_HAVE_GENERIC*/ | ||||
|  | ||||
|  | ||||
| #ifdef LV_HAVE_SSE2 | ||||
| #include <emmintrin.h> | ||||
| static inline void volk_gnsssdr_16ic_resampler_16ic_sse2(lv_16sc_t* result, const lv_16sc_t* local_code, float rem_code_phase_chips ,float code_phase_step_chips, unsigned int num_output_samples, int code_length_chips)//, int* scratch_buffer, float* scratch_buffer_float) | ||||
| { | ||||
|  | ||||
| 	_MM_SET_ROUNDING_MODE (_MM_ROUND_NEAREST);//_MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO | ||||
|     unsigned int number; | ||||
|     const unsigned int quarterPoints = num_output_samples / 4; | ||||
|  | ||||
| 	lv_16sc_t* _result = result; | ||||
|  | ||||
|     __attribute__((aligned(16))) int local_code_chip_index[4]; | ||||
|     __m128 _rem_code_phase,_code_phase_step_chips; | ||||
|     __m128i _code_length_chips,_code_length_chips_minus1; | ||||
|     __m128 _code_phase_out,_code_phase_out_with_offset; | ||||
|     rem_code_phase_chips=rem_code_phase_chips-0.5f; | ||||
|  | ||||
|     _rem_code_phase =  _mm_load1_ps(&rem_code_phase_chips); //load float to all four float values in m128 register | ||||
|     _code_phase_step_chips = _mm_load1_ps(&code_phase_step_chips); //load float to all four float values in m128 register | ||||
|     __attribute__((aligned(16))) int four_times_code_length_chips_minus1[4]; | ||||
|     four_times_code_length_chips_minus1[0]=code_length_chips-1; | ||||
|     four_times_code_length_chips_minus1[1]=code_length_chips-1; | ||||
|     four_times_code_length_chips_minus1[2]=code_length_chips-1; | ||||
|     four_times_code_length_chips_minus1[3]=code_length_chips-1; | ||||
|  | ||||
|     __attribute__((aligned(16))) int four_times_code_length_chips[4]; | ||||
|     four_times_code_length_chips[0]=code_length_chips; | ||||
|     four_times_code_length_chips[1]=code_length_chips; | ||||
|     four_times_code_length_chips[2]=code_length_chips; | ||||
|     four_times_code_length_chips[3]=code_length_chips; | ||||
|  | ||||
|     _code_length_chips = _mm_loadu_si128((__m128i*)&four_times_code_length_chips); //load float to all four float values in m128 register | ||||
|     _code_length_chips_minus1 = _mm_loadu_si128((__m128i*)&four_times_code_length_chips_minus1); //load float to all four float values in m128 register | ||||
|  | ||||
|     __m128i negative_indexes, overflow_indexes,_code_phase_out_int, _code_phase_out_int_neg,_code_phase_out_int_over; | ||||
|  | ||||
|     __m128i zero=_mm_setzero_si128(); | ||||
|  | ||||
|  | ||||
|     __attribute__((aligned(16))) float init_idx_float[4] = { 0.0f, 1.0f, 2.0f, 3.0f }; | ||||
|     __m128 _4output_index=_mm_load_ps(init_idx_float); | ||||
|     __attribute__((aligned(16))) float init_4constant_float[4] = { 4.0f, 4.0f, 4.0f, 4.0f }; | ||||
|     __m128 _4constant_float=_mm_load_ps(init_4constant_float); | ||||
|  | ||||
|     //__attribute__((aligned(16))) int output_indexes[4]; | ||||
|  | ||||
|     for(number=0;number < quarterPoints; number++){ | ||||
|     	_code_phase_out = _mm_mul_ps(_code_phase_step_chips, _4output_index); //compute the code phase point with the phase step | ||||
|     	_code_phase_out_with_offset = _mm_add_ps(_code_phase_out,_rem_code_phase); //add the phase offset | ||||
|     	_code_phase_out_int=_mm_cvtps_epi32(_code_phase_out_with_offset); //convert to integer | ||||
|  | ||||
|     	negative_indexes=_mm_cmplt_epi32 (_code_phase_out_int, zero); //test for negative values | ||||
|     	_code_phase_out_int_neg=_mm_add_epi32(_code_phase_out_int,_code_length_chips); //the negative values branch | ||||
|     	//_code_phase_out_int_over=_mm_or_si128(_mm_and_si128(_code_phase_out_int_neg,_code_phase_out_int),_mm_andnot_si128(negative_indexes,_code_phase_out_int)); | ||||
|     	_code_phase_out_int_neg=_mm_xor_si128(_code_phase_out_int,_mm_and_si128( negative_indexes,_mm_xor_si128( _code_phase_out_int_neg, _code_phase_out_int ))); | ||||
|  | ||||
|     	overflow_indexes=_mm_cmpgt_epi32  (_code_phase_out_int_neg, _code_length_chips_minus1); //test for overflow values | ||||
|     	_code_phase_out_int_over=_mm_sub_epi32(_code_phase_out_int_neg,_code_length_chips); //the negative values branch | ||||
|     	_code_phase_out_int_over=_mm_xor_si128(_code_phase_out_int_neg,_mm_and_si128( overflow_indexes,_mm_xor_si128( _code_phase_out_int_over, _code_phase_out_int_neg ))); | ||||
|  | ||||
|     	_mm_storeu_si128((__m128i*)local_code_chip_index,_code_phase_out_int_over); // Store the results back | ||||
|  | ||||
|     	//_mm_store_ps((float*)_scratch_buffer_float,_code_phase_out_with_offset); | ||||
|  | ||||
|     	//todo: optimize the local code lookup table with intrinsics, if possible | ||||
|     	*_result++=local_code[local_code_chip_index[0]]; | ||||
|     	*_result++=local_code[local_code_chip_index[1]]; | ||||
|     	*_result++=local_code[local_code_chip_index[2]]; | ||||
|     	*_result++=local_code[local_code_chip_index[3]]; | ||||
|  | ||||
|     	_4output_index = _mm_add_ps(_4output_index,_4constant_float); | ||||
|     	//_scratch_buffer_float+=4; | ||||
|  | ||||
|     } | ||||
|  | ||||
|     for(number = quarterPoints * 4;number < num_output_samples; number++){ | ||||
|     	local_code_chip_index[0]=static_cast<int>(code_phase_step_chips*static_cast<float>(number) + rem_code_phase_chips+0.5f); | ||||
|     	if (local_code_chip_index[0] < 0.0) local_code_chip_index[0] += code_length_chips-1; | ||||
|     	if (local_code_chip_index[0]  > (code_length_chips-1)) local_code_chip_index[0] -= code_length_chips; | ||||
|     	*_result++=local_code[local_code_chip_index[0]]; | ||||
|     	//*_scratch_buffer_float++=code_phase_step_chips*static_cast<float>(number)+rem_code_phase_chips; | ||||
|     } | ||||
|  | ||||
| //    for(unsigned int n=0;n<num_output_samples;n++) | ||||
| //    { | ||||
| // | ||||
| //		std::cout<<"s["<<n<<"]="<<scratch_buffer_float[n]<<","<<scratch_buffer[n]<<" "; | ||||
| //    } | ||||
| //    std::cout<<std::endl; | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
| } | ||||
| #endif /* LV_HAVE_SSE2 */ | ||||
|  | ||||
| #endif /*INCLUDED_volk_gnsssdr_16ic_resampler_16ic_a_H*/ | ||||
| @@ -1,152 +0,0 @@ | ||||
| /*! | ||||
|  * \file volk_gnsssdr_16ic_x2_dot_prod_16ic.h | ||||
|  * \brief Volk protokernel: multiplies two 16 bits vectors and accumulates them | ||||
|  * \authors <ul> | ||||
|  *          <li> Javier Arribas, 2015. jarribas(at)cttc.es | ||||
|  *          </ul> | ||||
|  * | ||||
|  * Volk protokernel that multiplies two 16 bits vectors (8 bits the real part  | ||||
|  * and 8 bits the imaginary part) and accumulates them | ||||
|  * | ||||
|  * ------------------------------------------------------------------------- | ||||
|  * | ||||
|  * Copyright (C) 2010-2015  (see AUTHORS file for a list of contributors) | ||||
|  * | ||||
|  * GNSS-SDR is a software defined Global Navigation | ||||
|  *          Satellite Systems receiver | ||||
|  * | ||||
|  * This file is part of GNSS-SDR. | ||||
|  * | ||||
|  * GNSS-SDR is free software: you can redistribute it and/or modify | ||||
|  * it under the terms of the GNU General Public License as published by | ||||
|  * the Free Software Foundation, either version 3 of the License, or | ||||
|  * (at your option) any later version. | ||||
|  * | ||||
|  * GNSS-SDR is distributed in the hope that it will be useful, | ||||
|  * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
|  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||
|  * GNU General Public License for more details. | ||||
|  * | ||||
|  * You should have received a copy of the GNU General Public License | ||||
|  * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>. | ||||
|  * | ||||
|  * ------------------------------------------------------------------------- | ||||
|  */ | ||||
|  | ||||
| #ifndef INCLUDED_volk_gnsssdr_16ic_x2_dot_prod_16ic_u_H | ||||
| #define INCLUDED_volk_gnsssdr_16ic_x2_dot_prod_16ic_u_H | ||||
|  | ||||
| #include <volk_gnsssdr/volk_gnsssdr_common.h> | ||||
| #include <volk_gnsssdr/volk_gnsssdr_complex.h> | ||||
| #include <stdio.h> | ||||
| #include <string.h> | ||||
| #include "saturated_arithmetic.h" | ||||
|  | ||||
| #ifdef LV_HAVE_GENERIC | ||||
| /*! | ||||
|  \brief Multiplies the two input complex vectors and accumulates them, storing the result in the third vector | ||||
|  \param cVector The vector where the accumulated result will be stored | ||||
|  \param aVector One of the vectors to be multiplied and accumulated | ||||
|  \param bVector One of the vectors to be multiplied and accumulated | ||||
|  \param num_points The number of complex values in aVector and bVector to be multiplied together, accumulated and stored into cVector | ||||
|  */ | ||||
| static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_generic(lv_16sc_t* result, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points) | ||||
| { | ||||
| 	result[0]=lv_16sc_t(0,0); | ||||
| 	for (unsigned int n=0;n<num_points;n++) | ||||
| 	{ | ||||
| 		//r*a.r - i*a.i, i*a.r + r*a.i | ||||
| 		//result[0]+=in_a[n]*in_b[n]; | ||||
| 		lv_16sc_t tmp=in_a[n]*in_b[n]; | ||||
| 		result[0].real(sat_adds16b(result[0].real(),tmp.real())); | ||||
| 		result[0].imag(sat_adds16b(result[0].imag(),tmp.imag())); | ||||
| 	} | ||||
| } | ||||
|  | ||||
| #endif /*LV_HAVE_GENERIC*/ | ||||
|  | ||||
|  | ||||
| #ifdef LV_HAVE_SSE2 | ||||
| #include <emmintrin.h> | ||||
| static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_a_sse2(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points) | ||||
| { | ||||
|  | ||||
| 	lv_16sc_t dotProduct=lv_16sc_t(0,0); | ||||
|  | ||||
|     const unsigned int sse_iters = num_points / 4; | ||||
|  | ||||
| 	const lv_16sc_t* _in_a = in_a; | ||||
| 	const lv_16sc_t* _in_b = in_b; | ||||
| 	lv_16sc_t* _out = out; | ||||
|  | ||||
|     if (sse_iters>0) | ||||
|         { | ||||
|  | ||||
|  | ||||
| 		__m128i a,b,c, c_sr, mask_imag, mask_real, real, imag, imag1,imag2, b_sl, a_sl, realcacc, imagcacc, result; | ||||
|  | ||||
|         realcacc = _mm_setzero_si128(); | ||||
|         imagcacc = _mm_setzero_si128(); | ||||
|  | ||||
|         mask_imag = _mm_set_epi8(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0); | ||||
|         mask_real = _mm_set_epi8(0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255); | ||||
|  | ||||
| 		for(unsigned int number = 0;number < sse_iters; number++) | ||||
| 			{ | ||||
| 	    	//std::complex<T> memory structure: real part -> reinterpret_cast<cv T*>(a)[2*i] | ||||
| 	    	//imaginery part -> reinterpret_cast<cv T*>(a)[2*i + 1] | ||||
| 	    	// a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r] | ||||
| 	            a = _mm_loadu_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg | ||||
| 	            b = _mm_loadu_si128((__m128i*)_in_b); | ||||
| 	            c=_mm_mullo_epi16 (a, b); // a3.i*b3.i, a3.r*b3.r, .... | ||||
|  | ||||
| 	            c_sr = _mm_srli_si128 (c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. | ||||
| 	            real = _mm_subs_epi16 (c,c_sr); | ||||
|  | ||||
| 	            b_sl = _mm_slli_si128(b,2); // b3.r, b2.i .... | ||||
| 	            a_sl = _mm_slli_si128(a,2); // a3.r, a2.i .... | ||||
|  | ||||
| 	            imag1 = _mm_mullo_epi16(a,b_sl); // a3.i*b3.r, .... | ||||
| 	            imag2 = _mm_mullo_epi16(b,a_sl); // b3.i*a3.r, .... | ||||
|  | ||||
| 	            imag = _mm_adds_epi16(imag1,imag2); //with saturation aritmetic! | ||||
|  | ||||
|                 realcacc = _mm_adds_epi16 (realcacc, real); | ||||
|                 imagcacc = _mm_adds_epi16 (imagcacc, imag); | ||||
|  | ||||
| 				_in_a += 4; | ||||
| 				_in_b += 4; | ||||
|  | ||||
| 			} | ||||
|  | ||||
|  | ||||
| 			realcacc = _mm_and_si128 (realcacc, mask_real); | ||||
| 			imagcacc = _mm_and_si128 (imagcacc, mask_imag); | ||||
|  | ||||
| 			result = _mm_or_si128 (realcacc, imagcacc); | ||||
|  | ||||
| 			__VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; | ||||
|  | ||||
| 			_mm_storeu_si128((__m128i*)dotProductVector,result); // Store the results back into the dot product vector | ||||
|  | ||||
| 			for (int i = 0; i<4; ++i) | ||||
| 				{ | ||||
| 					dotProduct.real(sat_adds16b(dotProduct.real(),dotProductVector[i].real())); | ||||
| 					dotProduct.imag(sat_adds16b(dotProduct.imag(),dotProductVector[i].imag())); | ||||
| 				} | ||||
|         } | ||||
|  | ||||
|     for (unsigned int i = 0; i<(num_points % 4); ++i) | ||||
|         { | ||||
|             //dotProduct += (*_in_a++) * (*_in_b++); | ||||
|     		lv_16sc_t tmp=(*_in_a++) * (*_in_b++); | ||||
| 			dotProduct.real(sat_adds16b(dotProduct.real(),tmp.real())); | ||||
| 			dotProduct.imag(sat_adds16b(dotProduct.imag(),tmp.imag())); | ||||
|  | ||||
|         } | ||||
|  | ||||
|     *_out = dotProduct; | ||||
| } | ||||
| #endif /* LV_HAVE_SSE2 */ | ||||
|  | ||||
| #endif /*INCLUDED_volk_gnsssdr_16ic_x2_dot_prod_16ic_u_H*/ | ||||
| @@ -38,7 +38,7 @@ | ||||
|  | ||||
|  | ||||
| #include <volk_gnsssdr/volk_gnsssdr_complex.h> | ||||
| #include "saturated_arithmetic.h" | ||||
| #include <volk_gnsssdr/saturated_arithmetic.h> | ||||
|  | ||||
| #ifdef LV_HAVE_GENERIC | ||||
| /*! | ||||
| @@ -50,18 +50,17 @@ | ||||
|  */ | ||||
| static inline void volk_gnsssdr_16ic_xn_dot_prod_16ic_xn_generic(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_16sc_t** in_a, unsigned int num_points, int num_a_vectors) | ||||
| { | ||||
| 	for (int n_vec=0;n_vec<num_a_vectors;n_vec++) | ||||
| 	{ | ||||
| 		result[n_vec]=lv_cmake(0,0); | ||||
| 		for (unsigned int n=0;n<num_points;n++) | ||||
| 		{ | ||||
| 			//r*a.r - i*a.i, i*a.r + r*a.i | ||||
| 			//result[n_vec]+=in_common[n]*in_a[n_vec][n]; | ||||
| 			lv_16sc_t tmp=in_common[n]*in_a[n_vec][n]; | ||||
| 			result[n_vec]=lv_cmake(sat_adds16b(lv_creal(result[n_vec]),lv_creal(tmp)),sat_adds16b(lv_cimag(result[n_vec]),lv_cimag(tmp))); | ||||
| 		} | ||||
|  | ||||
| 	} | ||||
|     for (int n_vec = 0; n_vec < num_a_vectors; n_vec++) | ||||
|         { | ||||
|             result[n_vec] = lv_cmake(0,0); | ||||
|             for (unsigned int n = 0; n < num_points; n++) | ||||
|                 { | ||||
|                     //r*a.r - i*a.i, i*a.r + r*a.i | ||||
|                     //result[n_vec]+=in_common[n]*in_a[n_vec][n]; | ||||
|                     lv_16sc_t tmp = in_common[n]*in_a[n_vec][n]; | ||||
|                     result[n_vec] = lv_cmake(sat_adds16b(lv_creal(result[n_vec]), lv_creal(tmp)), sat_adds16b(lv_cimag(result[n_vec]), lv_cimag(tmp))); | ||||
|                 } | ||||
|         } | ||||
| } | ||||
|  | ||||
| #endif /*LV_HAVE_GENERIC*/ | ||||
| @@ -71,99 +70,92 @@ static inline void volk_gnsssdr_16ic_xn_dot_prod_16ic_xn_generic(lv_16sc_t* resu | ||||
| #include <emmintrin.h> | ||||
| static inline void volk_gnsssdr_16ic_xn_dot_prod_16ic_xn_a_sse2(lv_16sc_t* out, const lv_16sc_t* in_common, const lv_16sc_t** in_a, unsigned int num_points, int num_a_vectors) | ||||
| { | ||||
|  | ||||
| 	lv_16sc_t dotProduct=lv_cmake(0,0); | ||||
|     lv_16sc_t dotProduct = lv_cmake(0,0); | ||||
|  | ||||
|     const unsigned int sse_iters = num_points / 4; | ||||
|  | ||||
| 	const lv_16sc_t** _in_a = in_a; | ||||
| 	const lv_16sc_t* _in_common = in_common; | ||||
| 	lv_16sc_t* _out = out; | ||||
|     const lv_16sc_t** _in_a = in_a; | ||||
|     const lv_16sc_t* _in_common = in_common; | ||||
|     lv_16sc_t* _out = out; | ||||
|  | ||||
|     if (sse_iters>0) | ||||
|     if (sse_iters > 0) | ||||
|         { | ||||
|             __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; | ||||
|  | ||||
| 		__VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; | ||||
|             //todo dyn mem reg | ||||
|  | ||||
| 		//todo dyn mem reg | ||||
|             __m128i* realcacc; | ||||
|             __m128i* imagcacc; | ||||
|  | ||||
| 		__m128i* realcacc; | ||||
| 		__m128i* imagcacc; | ||||
|             realcacc=(__m128i*)calloc(num_a_vectors,sizeof(__m128i)); //calloc also sets memory to 0 | ||||
|             imagcacc=(__m128i*)calloc(num_a_vectors,sizeof(__m128i)); //calloc also sets memory to 0 | ||||
|  | ||||
|     	realcacc=(__m128i*)calloc(num_a_vectors,sizeof(__m128i)); //calloc also sets memory to 0 | ||||
|     	imagcacc=(__m128i*)calloc(num_a_vectors,sizeof(__m128i)); //calloc also sets memory to 0 | ||||
|             __m128i a,b,c, c_sr, mask_imag, mask_real, real, imag, imag1,imag2, b_sl, a_sl, result; | ||||
|  | ||||
|             mask_imag = _mm_set_epi8(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0); | ||||
|             mask_real = _mm_set_epi8(0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255); | ||||
|  | ||||
| 		__m128i a,b,c, c_sr, mask_imag, mask_real, real, imag, imag1,imag2, b_sl, a_sl, result; | ||||
|             for(unsigned int number = 0; number < sse_iters; number++) | ||||
|                 { | ||||
|                     //std::complex<T> memory structure: real part -> reinterpret_cast<cv T*>(a)[2*i] | ||||
|                     //imaginery part -> reinterpret_cast<cv T*>(a)[2*i + 1] | ||||
|                     // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r] | ||||
|  | ||||
|         mask_imag = _mm_set_epi8(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0); | ||||
|         mask_real = _mm_set_epi8(0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255); | ||||
|                     b = _mm_loadu_si128((__m128i*)_in_common); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg | ||||
|                     for (int n_vec = 0; n_vec < num_a_vectors; n_vec++) | ||||
|                         { | ||||
|                             a = _mm_loadu_si128((__m128i*)&(_in_a[n_vec][number*4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg | ||||
|  | ||||
| 		for(unsigned int number = 0;number < sse_iters; number++) | ||||
| 			{ | ||||
| 	    	//std::complex<T> memory structure: real part -> reinterpret_cast<cv T*>(a)[2*i] | ||||
| 	    	//imaginery part -> reinterpret_cast<cv T*>(a)[2*i + 1] | ||||
| 	    	// a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r] | ||||
|                             c = _mm_mullo_epi16 (a, b); // a3.i*b3.i, a3.r*b3.r, .... | ||||
|  | ||||
|             	b = _mm_loadu_si128((__m128i*)_in_common); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg | ||||
|             	for (int n_vec=0;n_vec<num_a_vectors;n_vec++) | ||||
|             	{ | ||||
|                             c_sr = _mm_srli_si128 (c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. | ||||
|                             real = _mm_subs_epi16 (c, c_sr); | ||||
|  | ||||
| 					a = _mm_loadu_si128((__m128i*)&(_in_a[n_vec][number*4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg | ||||
|                             b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i .... | ||||
|                             a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i .... | ||||
|  | ||||
| 					c=_mm_mullo_epi16 (a, b); // a3.i*b3.i, a3.r*b3.r, .... | ||||
|                             imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... | ||||
|                             imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... | ||||
|  | ||||
| 					c_sr = _mm_srli_si128 (c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. | ||||
| 					real = _mm_subs_epi16 (c,c_sr); | ||||
|                             imag = _mm_adds_epi16(imag1, imag2); | ||||
|  | ||||
| 					b_sl = _mm_slli_si128(b,2); // b3.r, b2.i .... | ||||
| 					a_sl = _mm_slli_si128(a,2); // a3.r, a2.i .... | ||||
|                             realcacc[n_vec] = _mm_adds_epi16 (realcacc[n_vec], real); | ||||
|                             imagcacc[n_vec] = _mm_adds_epi16 (imagcacc[n_vec], imag); | ||||
|  | ||||
| 					imag1 = _mm_mullo_epi16(a,b_sl); // a3.i*b3.r, .... | ||||
| 					imag2 = _mm_mullo_epi16(b,a_sl); // b3.i*a3.r, .... | ||||
|                         } | ||||
|                     _in_common += 4; | ||||
|                 } | ||||
|  | ||||
| 					imag = _mm_adds_epi16(imag1,imag2); | ||||
|             for (int n_vec=0;n_vec<num_a_vectors;n_vec++) | ||||
|                 { | ||||
|                     realcacc[n_vec] = _mm_and_si128 (realcacc[n_vec], mask_real); | ||||
|                     imagcacc[n_vec] = _mm_and_si128 (imagcacc[n_vec], mask_imag); | ||||
|  | ||||
| 					realcacc[n_vec] = _mm_adds_epi16 (realcacc[n_vec], real); | ||||
| 					imagcacc[n_vec] = _mm_adds_epi16 (imagcacc[n_vec], imag); | ||||
|                     result = _mm_or_si128 (realcacc[n_vec], imagcacc[n_vec]); | ||||
|  | ||||
|             	} | ||||
| 				_in_common += 4; | ||||
|  | ||||
| 			} | ||||
|  | ||||
| 			for (int n_vec=0;n_vec<num_a_vectors;n_vec++) | ||||
| 			{ | ||||
| 				realcacc[n_vec] = _mm_and_si128 (realcacc[n_vec], mask_real); | ||||
| 				imagcacc[n_vec] = _mm_and_si128 (imagcacc[n_vec], mask_imag); | ||||
|  | ||||
| 				result = _mm_or_si128 (realcacc[n_vec], imagcacc[n_vec]); | ||||
|  | ||||
| 				_mm_storeu_si128((__m128i*)dotProductVector,result); // Store the results back into the dot product vector | ||||
| 				dotProduct=lv_cmake(0,0); | ||||
| 				for (int i = 0; i<4; ++i) | ||||
| 					{ | ||||
|  | ||||
| 				dotProduct=lv_cmake(sat_adds16b(lv_creal(dotProduct),lv_creal(dotProductVector[i])), | ||||
| 						sat_adds16b(lv_cimag(dotProduct),lv_cimag(dotProductVector[i]))); | ||||
| 					} | ||||
| 				_out[n_vec]=dotProduct; | ||||
| 			} | ||||
| 			free(realcacc); | ||||
| 			free(imagcacc); | ||||
|                     _mm_storeu_si128((__m128i*)dotProductVector, result); // Store the results back into the dot product vector | ||||
|                     dotProduct = lv_cmake(0,0); | ||||
|                     for (int i = 0; i<4; ++i) | ||||
|                         { | ||||
|                             dotProduct = lv_cmake(sat_adds16b(lv_creal(dotProduct), lv_creal(dotProductVector[i])), | ||||
|                                     sat_adds16b(lv_cimag(dotProduct), lv_cimag(dotProductVector[i]))); | ||||
|                         } | ||||
|                     _out[n_vec] = dotProduct; | ||||
|                 } | ||||
|             free(realcacc); | ||||
|             free(imagcacc); | ||||
|         } | ||||
|  | ||||
| 	for (int n_vec=0;n_vec<num_a_vectors;n_vec++) | ||||
| 	{ | ||||
| 		for(unsigned int n  = sse_iters * 4;n < num_points; n++){ | ||||
|     for (int n_vec = 0; n_vec < num_a_vectors; n_vec++) | ||||
|         { | ||||
|             for(unsigned int n  = sse_iters * 4;n < num_points; n++){ | ||||
|  | ||||
| 				lv_16sc_t tmp=in_common[n]*in_a[n_vec][n]; | ||||
|                     lv_16sc_t tmp = in_common[n]*in_a[n_vec][n]; | ||||
|  | ||||
| 				_out[n_vec]=lv_cmake(sat_adds16b(lv_creal(_out[n_vec]),lv_creal(tmp)), | ||||
| 						sat_adds16b(lv_cimag(_out[n_vec]),lv_cimag(tmp))); | ||||
|  | ||||
| 			} | ||||
| 	} | ||||
|                     _out[n_vec] = lv_cmake(sat_adds16b(lv_creal(_out[n_vec]), lv_creal(tmp)), | ||||
|                             sat_adds16b(lv_cimag(_out[n_vec]), lv_cimag(tmp))); | ||||
|             } | ||||
|         } | ||||
|  | ||||
| } | ||||
| #endif /* LV_HAVE_SSE2 */ | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Carles Fernandez
					Carles Fernandez