Moving two kernels to volk_gnsssdr. Still no testing

2025-11-03 08:43:04 +00:00 · 2016-01-13 19:38:07 +01:00
parent f88f222ef6
commit ae2b594c3b
8 changed files with 340 additions and 268 deletions
--- a/src/algorithms/tracking/libs/cpu_multicorrelator_16sc.cc
+++ b/src/algorithms/tracking/libs/cpu_multicorrelator_16sc.cc
@@ -31,18 +31,13 @@
 *
 * -------------------------------------------------------------------------
 */
+
 #include "cpu_multicorrelator_16sc.h"
 #include <cmath>
 #include <iostream>
 #include <gnuradio/fxpt.h>  // fixed point sine and cosine
-
 #include "volk_gnsssdr/volk_gnsssdr.h"

-#define LV_HAVE_GENERIC
-#define LV_HAVE_SSE2
-
-#include "volk_gnsssdr_16ic_xn_resampler_16ic_xn.h"
-#include "volk_gnsssdr_16ic_xn_dot_prod_16ic_xn.h"

 bool cpu_multicorrelator_16sc::init(
        int max_signal_length_samples,
@@ -99,7 +94,7 @@ void cpu_multicorrelator_16sc::update_local_code(int correlator_length_samples,f
 		tmp_code_phases_chips[n] = d_shifts_chips[n] - rem_code_phase_chips;
 	}

-	volk_gnsssdr_16ic_xn_resampler_16ic_xn_sse2(d_local_codes_resampled,
+	volk_gnsssdr_16ic_xn_resampler_16ic_xn(d_local_codes_resampled,
 			d_local_code_in,
 			tmp_code_phases_chips,
 			code_phase_step_chips,
@@ -153,7 +148,7 @@ bool cpu_multicorrelator_16sc::Carrier_wipeoff_multicorrelator_resampler(
    //std::cout<<"d_sig_doppler_wiped 16sc="<<d_sig_doppler_wiped[23]<<std::endl;
    update_local_code(signal_length_samples, rem_code_phase_chips, code_phase_step_chips);

-    volk_gnsssdr_16ic_xn_dot_prod_16ic_xn_a_sse2(d_corr_out, d_sig_doppler_wiped, (const lv_16sc_t**)d_local_codes_resampled, signal_length_samples, d_n_correlators);
+    volk_gnsssdr_16ic_x2_dot_prod_16ic_xn(d_corr_out, d_sig_doppler_wiped, (const lv_16sc_t**)d_local_codes_resampled, signal_length_samples, d_n_correlators);

    //for (int current_correlator_tap = 0; current_correlator_tap < d_n_correlators; current_correlator_tap++)
    //    {
--- a/src/algorithms/tracking/libs/volk_gnsssdr_16ic_xn_dot_prod_16ic_xn.h
+++ b/src/algorithms/tracking/libs/volk_gnsssdr_16ic_xn_dot_prod_16ic_xn.h
@@ -1,163 +0,0 @@
-/*!
- * \file volk_gnsssdr_16ic_xn_dot_prod_16ic_xn.h
- * \brief Volk protokernel: multiplies two 16 bits vectors and accumulates them
- * \authors <ul>
- *          <li> Javier Arribas, 2015. jarribas(at)cttc.es
- *          </ul>
- *
- * Volk protokernel that multiplies one common vector of 16-bit complex samples
- * (16 bits the real part and 16 bits the imaginary part) by each of N vectors and accumulates the products
- *
- * -------------------------------------------------------------------------
- *
- * Copyright (C) 2010-2015  (see AUTHORS file for a list of contributors)
- *
- * GNSS-SDR is a software defined Global Navigation
- *          Satellite Systems receiver
- *
- * This file is part of GNSS-SDR.
- *
- * GNSS-SDR is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * GNSS-SDR is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
- *
- * -------------------------------------------------------------------------
- */
-
-#ifndef INCLUDED_volk_gnsssdr_16ic_xn_dot_prod_16ic_xn_u_H
-#define INCLUDED_volk_gnsssdr_16ic_xn_dot_prod_16ic_xn_u_H
-
-
-#include <volk_gnsssdr/volk_gnsssdr_complex.h>
-#include <volk_gnsssdr/saturated_arithmetic.h>
-
-#ifdef LV_HAVE_GENERIC
-/*!
- \brief Multiplies one common complex vector by each of num_a_vectors complex vectors and accumulates each product series into a single output sample
- \param result Output array with num_a_vectors entries; result[v] receives the accumulated product of in_common and in_a[v]
- \param in_common The vector shared by every dot product
- \param in_a Array of num_a_vectors pointers to the vectors multiplied against in_common
- \param num_points The number of complex values read from in_common and from each in_a[v]
- \param num_a_vectors The number of vectors in in_a, and of entries written to result
- */
-static inline void volk_gnsssdr_16ic_xn_dot_prod_16ic_xn_generic(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_16sc_t** in_a, unsigned int num_points, int num_a_vectors)
-{
-    for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
-        {
-            result[n_vec] = lv_cmake(0,0); // every output starts from zero, so num_points == 0 yields (0,0)
-            for (unsigned int n = 0; n < num_points; n++)
-                {
-                    // complex product layout: r*a.r - i*a.i, i*a.r + r*a.i
-                    // (unsaturated alternative kept for reference) result[n_vec]+=in_common[n]*in_a[n_vec][n];
-                    lv_16sc_t tmp = in_common[n]*in_a[n_vec][n]; // product is formed in lv_16sc_t; NOTE(review): presumably wraps rather than saturates - confirm
-                    result[n_vec] = lv_cmake(sat_adds16b(lv_creal(result[n_vec]), lv_creal(tmp)), sat_adds16b(lv_cimag(result[n_vec]), lv_cimag(tmp))); // real and imaginary parts are accumulated separately through sat_adds16b (from saturated_arithmetic.h; presumably a saturating 16-bit add)
-                }
-        }
-}
-
-#endif /*LV_HAVE_GENERIC*/
-
-
-#ifdef LV_HAVE_SSE2
-#include <emmintrin.h>
-static inline void volk_gnsssdr_16ic_xn_dot_prod_16ic_xn_a_sse2(lv_16sc_t* out, const lv_16sc_t* in_common, const lv_16sc_t** in_a, unsigned int num_points, int num_a_vectors)
-{   /* SSE2 variant: for every v in [0, num_a_vectors) accumulates in_common[n]*in_a[v][n] over n in [0, num_points) into out[v].
-     * Blocks of four samples are handled with 128-bit integer intrinsics; the remaining num_points % 4 samples are handled by the scalar loop at the end.
-     * All loads and stores use the unaligned intrinsics (_mm_loadu_si128 / _mm_storeu_si128).
-     */
-    lv_16sc_t dotProduct = lv_cmake(0,0);
-
-    const unsigned int sse_iters = num_points / 4; // four complex samples fit in one 128-bit register
-
-    const lv_16sc_t** _in_a = in_a;
-    const lv_16sc_t* _in_common = in_common;
-    lv_16sc_t* _out = out;
-
-    if (sse_iters > 0)
-        {
-            __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; // scratch used to reduce the four lanes of an accumulator
-
-            //todo dyn mem reg
-
-            __m128i* realcacc;
-            __m128i* imagcacc;
-
-            realcacc=(__m128i*)calloc(num_a_vectors,sizeof(__m128i)); //one zeroed accumulator per vector (calloc also sets memory to 0); NOTE(review): the result is not checked for NULL
-            imagcacc=(__m128i*)calloc(num_a_vectors,sizeof(__m128i)); //same as above for the imaginary parts; NOTE(review): assumes calloc returns 16-byte aligned memory - TODO confirm
-
-            __m128i a,b,c, c_sr, mask_imag, mask_real, real, imag, imag1,imag2, b_sl, a_sl, result;
-
-            mask_imag = _mm_set_epi8(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0); // keeps the upper 16 bits of every 32-bit lane (the imaginary slot in the layout described below)
-            mask_real = _mm_set_epi8(0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255); // keeps the lower 16 bits of every 32-bit lane (the real slot)
-
-            for(unsigned int number = 0; number < sse_iters; number++)
-                {
-                    //std::complex<T> memory structure: real part -> reinterpret_cast<cv T*>(a)[2*i]
-                    //imaginary part -> reinterpret_cast<cv T*>(a)[2*i + 1]
-                    // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
-
-                    b = _mm_loadu_si128((__m128i*)_in_common); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg; shared by all vectors in the inner loop
-                    for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
-                        {
-                            a = _mm_loadu_si128((__m128i*)&(_in_a[n_vec][number*4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
-
-                            c = _mm_mullo_epi16 (a, b); // a3.i*b3.i, a3.r*b3.r, .... (only the low 16 bits of each product are kept)
-
-                            c_sr = _mm_srli_si128 (c, 2); // shift c right by 2 bytes (zeros shifted in) so each imag*imag product lines up with the real*real product of the same sample
-                            real = _mm_subs_epi16 (c, c_sr); // real slots now hold a.r*b.r - a.i*b.i; the imaginary slots hold values that are masked out later
-
-                            b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
-                            a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....
-
-                            imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
-                            imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
-
-                            imag = _mm_adds_epi16(imag1, imag2); // imaginary slots now hold a.i*b.r + a.r*b.i (saturating add)
-
-                            realcacc[n_vec] = _mm_adds_epi16 (realcacc[n_vec], real); // saturating per-lane accumulation
-                            imagcacc[n_vec] = _mm_adds_epi16 (imagcacc[n_vec], imag);
-
-                        }
-                    _in_common += 4; // advance the common input by the four samples just consumed
-                }
-
-            for (int n_vec=0;n_vec<num_a_vectors;n_vec++)
-                {
-                    realcacc[n_vec] = _mm_and_si128 (realcacc[n_vec], mask_real); // drop the unused imaginary slots of the real accumulator
-                    imagcacc[n_vec] = _mm_and_si128 (imagcacc[n_vec], mask_imag); // drop the unused real slots of the imaginary accumulator
-
-                    result = _mm_or_si128 (realcacc[n_vec], imagcacc[n_vec]); // interleave back into four (real, imag) samples
-
-                    _mm_storeu_si128((__m128i*)dotProductVector, result); // Store the results back into the dot product vector
-                    dotProduct = lv_cmake(0,0);
-                    for (int i = 0; i<4; ++i) // horizontal reduction of the four partial sums
-                        {
-                            dotProduct = lv_cmake(sat_adds16b(lv_creal(dotProduct), lv_creal(dotProductVector[i])),
-                                    sat_adds16b(lv_cimag(dotProduct), lv_cimag(dotProductVector[i])));
-                        }
-                    _out[n_vec] = dotProduct;
-                }
-            free(realcacc);
-            free(imagcacc);
-        }
-
-    for (int n_vec = 0; n_vec < num_a_vectors; n_vec++)
-        {
-            for(unsigned int n  = sse_iters * 4;n < num_points; n++){ // scalar tail: the num_points % 4 samples not covered above
-
-                    lv_16sc_t tmp = in_common[n]*in_a[n_vec][n];
-
-                    _out[n_vec] = lv_cmake(sat_adds16b(lv_creal(_out[n_vec]), lv_creal(tmp)), // NOTE(review): when num_points < 4 the SSE block is skipped, so _out[n_vec] is read here without having been written by this function
-                            sat_adds16b(lv_cimag(_out[n_vec]), lv_cimag(tmp)));
-            }
-        }
-
-}
-#endif /* LV_HAVE_SSE2 */
-
-#endif /*INCLUDED_volk_gnsssdr_16ic_xn_dot_prod_16ic_xn_u_H*/
--- a/src/algorithms/tracking/libs/volk_gnsssdr_16ic_xn_resampler_16ic_xn.h
+++ b/src/algorithms/tracking/libs/volk_gnsssdr_16ic_xn_resampler_16ic_xn.h
@@ -1,172 +0,0 @@
-/*!
- * \file volk_gnsssdr_16ic_xn_resampler_16ic_xn.h
- * \brief Volk protokernel: resample a 16 bits complex vector
- * \authors <ul>
- *          <li> Javier Arribas, 2015. jarribas(at)cttc.es
- *          </ul>
- *
- * Volk protokernel that resamples a local code of 16-bit complex samples
- * (16 bits the real part and 16 bits the imaginary part) into N output vectors, one per code phase offset
- *
- * -------------------------------------------------------------------------
- *
- * Copyright (C) 2010-2015  (see AUTHORS file for a list of contributors)
- *
- * GNSS-SDR is a software defined Global Navigation
- *          Satellite Systems receiver
- *
- * This file is part of GNSS-SDR.
- *
- * GNSS-SDR is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * GNSS-SDR is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
- *
- * -------------------------------------------------------------------------
- */
-
-#ifndef INCLUDED_volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_H
-#define INCLUDED_volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_H
-
-#include <volk_gnsssdr/volk_gnsssdr_common.h>
-#include <volk_gnsssdr/volk_gnsssdr_complex.h>
-#include <cmath>
-//#pragma STDC FENV_ACCESS ON
-
-#ifdef LV_HAVE_GENERIC
-
-//int round_int( float r ) {
-//    return (r > 0.0) ? (r + 0.5) : (r - 0.5);
-//}
-/*!
- \brief Resamples local_code into num_out_vectors output vectors, each one with its own code phase offset
- \param result Array of num_out_vectors output pointers; result[v][n] is written for every n in [0, num_output_samples)
- \param local_code The code to be resampled; indexed with the computed chip index
- \param rem_code_phase_chips Array of num_out_vectors code phase offsets, one per output vector
- \param code_phase_step_chips Code phase increment applied per output sample
- \param num_output_samples The number of samples written to each output vector
- \param code_length_chips Value used to wrap the computed index once when it is negative or past code_length_chips-1
- \param num_out_vectors The number of output vectors in result
- */
-
-static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_generic(lv_16sc_t** result, const lv_16sc_t* local_code, float* rem_code_phase_chips ,float code_phase_step_chips, unsigned int num_output_samples, unsigned int code_length_chips, int num_out_vectors)
-{
-    int local_code_chip_index;
-    //fesetround(FE_TONEAREST);
-    for (int current_vector = 0; current_vector < num_out_vectors; current_vector++)
-    {
-		for (unsigned int n = 0; n < num_output_samples; n++)
-			{
-				// resample code for current tap: index = round(step*n + offset - 0.5)
-				local_code_chip_index = round(code_phase_step_chips*static_cast<float>(n) + rem_code_phase_chips[current_vector]-0.5f);
-				if (local_code_chip_index < 0.0) local_code_chip_index += code_length_chips; // wrap a negative index once by the code length
-				if (local_code_chip_index > (code_length_chips-1)) local_code_chip_index -= code_length_chips; // wrap an index past the last chip once; NOTE(review): signed/unsigned comparison - confirm intended
-				//std::cout<<"g["<<n<<"]="<<code_phase_step_chips*static_cast<float>(n) + rem_code_phase_chips-0.5f<<","<<local_code_chip_index<<" ";
-				result[current_vector][n] = local_code[local_code_chip_index]; // NOTE(review): only a single wrap is applied, so indices more than one code length out of range are not corrected
-			}
-    }
-    //std::cout<<std::endl;
-}
-
-#endif /*LV_HAVE_GENERIC*/
-
-
-#ifdef LV_HAVE_SSE2
-#include <emmintrin.h>
-static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_sse2(lv_16sc_t** result, const lv_16sc_t* local_code, float* rem_code_phase_chips ,float code_phase_step_chips, unsigned int num_output_samples, unsigned int code_length_chips, int num_out_vectors)
-{   /* SSE2 variant: fills result[v][n] = local_code[index(v, n)] for every v in [0, num_out_vectors) and n in [0, num_output_samples).
-     * Four output indices are computed at a time with SSE intrinsics; the remaining num_output_samples % 4 samples are handled by the scalar loop at the end.
-     * The lookups into local_code are done with scalar code in both paths.
-     */
-
-	_MM_SET_ROUNDING_MODE (_MM_ROUND_NEAREST);//round-to-nearest for the _mm_cvtps_epi32 conversion below (other modes: _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO); NOTE(review): the previous mode is not restored before returning
-    unsigned int number;
-    const unsigned int quarterPoints = num_output_samples / 4; // number of four-sample blocks
-
-	lv_16sc_t** _result = result;
-    __attribute__((aligned(16))) int local_code_chip_index[4]; // scratch for the four indices computed per block
-    float tmp_rem_code_phase_chips;
-    __m128 _rem_code_phase,_code_phase_step_chips;
-    __m128i _code_length_chips,_code_length_chips_minus1;
-    __m128 _code_phase_out,_code_phase_out_with_offset;
-
-    _code_phase_step_chips = _mm_load1_ps(&code_phase_step_chips); //load float to all four float values in m128 register
-    __attribute__((aligned(16))) int four_times_code_length_chips_minus1[4]; // code_length_chips-1 replicated four times
-    four_times_code_length_chips_minus1[0]=code_length_chips-1;
-    four_times_code_length_chips_minus1[1]=code_length_chips-1;
-    four_times_code_length_chips_minus1[2]=code_length_chips-1;
-    four_times_code_length_chips_minus1[3]=code_length_chips-1;
-
-    __attribute__((aligned(16))) int four_times_code_length_chips[4]; // code_length_chips replicated four times
-    four_times_code_length_chips[0]=code_length_chips;
-    four_times_code_length_chips[1]=code_length_chips;
-    four_times_code_length_chips[2]=code_length_chips;
-    four_times_code_length_chips[3]=code_length_chips;
-
-    _code_length_chips = _mm_loadu_si128((__m128i*)&four_times_code_length_chips); //load the four int copies of code_length_chips into a 128-bit integer register
-    _code_length_chips_minus1 = _mm_loadu_si128((__m128i*)&four_times_code_length_chips_minus1); //load the four int copies of code_length_chips-1 into a 128-bit integer register
-
-    __m128i negative_indexes, overflow_indexes,_code_phase_out_int, _code_phase_out_int_neg,_code_phase_out_int_over;
-
-    __m128i zero=_mm_setzero_si128();
-
-    __attribute__((aligned(16))) float init_idx_float[4] = { 0.0f, 1.0f, 2.0f, 3.0f };
-    __m128 _4output_index=_mm_load_ps(init_idx_float); // sample indices n, n+1, n+2, n+3 of the current block, kept as floats
-    __attribute__((aligned(16))) float init_4constant_float[4] = { 4.0f, 4.0f, 4.0f, 4.0f };
-    __m128 _4constant_float=_mm_load_ps(init_4constant_float); // per-block increment of the sample indices
-
-    int current_vector=0;
-    int sample_idx=0;
-    for(number=0;number < quarterPoints; number++){
-    	//common to all outputs
-    	_code_phase_out = _mm_mul_ps(_code_phase_step_chips, _4output_index); //compute the code phase point with the phase step
-
-    	//output vector dependent (different code phase offset)
-    	for(current_vector=0;current_vector<num_out_vectors;current_vector++)
-    	{
-    		tmp_rem_code_phase_chips=rem_code_phase_chips[current_vector]-0.5f; // adjust offset to perform correct rounding (chip transition at 0)
-    	    _rem_code_phase =  _mm_load1_ps(&tmp_rem_code_phase_chips); //load float to all four float values in m128 register
-
-			_code_phase_out_with_offset = _mm_add_ps(_code_phase_out,_rem_code_phase); //add the phase offset
-			_code_phase_out_int=_mm_cvtps_epi32(_code_phase_out_with_offset); //convert to integer using the rounding mode set at the top of the function
-
-			negative_indexes=_mm_cmplt_epi32 (_code_phase_out_int, zero); //test for negative values
-			_code_phase_out_int_neg=_mm_add_epi32(_code_phase_out_int,_code_length_chips); //the negative values branch
-			_code_phase_out_int_neg=_mm_xor_si128(_code_phase_out_int,_mm_and_si128( negative_indexes,_mm_xor_si128( _code_phase_out_int_neg, _code_phase_out_int ))); //branchless select: lanes flagged negative take index + code length, the others keep the index
-
-			overflow_indexes=_mm_cmpgt_epi32  (_code_phase_out_int_neg, _code_length_chips_minus1); //test for overflow values
-			_code_phase_out_int_over=_mm_sub_epi32(_code_phase_out_int_neg,_code_length_chips); //the overflow values branch
-			_code_phase_out_int_over=_mm_xor_si128(_code_phase_out_int_neg,_mm_and_si128( overflow_indexes,_mm_xor_si128( _code_phase_out_int_over, _code_phase_out_int_neg ))); //branchless select: lanes flagged overflow take index - code length, the others keep the index
-
-			_mm_storeu_si128((__m128i*)local_code_chip_index,_code_phase_out_int_over); // Store the results back
-
-			//todo: optimize the local code lookup table with intrinsics, if possible
-			_result[current_vector][sample_idx]=local_code[local_code_chip_index[0]];
-			_result[current_vector][sample_idx+1]=local_code[local_code_chip_index[1]];
-			_result[current_vector][sample_idx+2]=local_code[local_code_chip_index[2]];
-			_result[current_vector][sample_idx+3]=local_code[local_code_chip_index[3]];
-
-
-    	}
-    	_4output_index = _mm_add_ps(_4output_index,_4constant_float); // move on to the next four sample indices
-    	sample_idx+=4;
-    }
-
-    for(number = quarterPoints * 4;number < num_output_samples; number++){ // scalar tail: the num_output_samples % 4 samples not covered above
-
-    	for(current_vector=0;current_vector<num_out_vectors;current_vector++)
-    	{
-			local_code_chip_index[0]=static_cast<int>(code_phase_step_chips*static_cast<float>(number) + rem_code_phase_chips[current_vector]); // NOTE(review): truncates and omits the -0.5f offset used by the vector loop above - confirm whether this difference is intended
-			if (local_code_chip_index[0] < 0.0) local_code_chip_index[0] += code_length_chips-1; // NOTE(review): the vector loop adds code_length_chips here, not code_length_chips-1 - confirm
-			if (local_code_chip_index[0]  > (code_length_chips-1)) local_code_chip_index[0] -= code_length_chips;
-			_result[current_vector][number]=local_code[local_code_chip_index[0]];
-    	}
-
-    }
-
-}
-#endif /* LV_HAVE_SSE2 */
-
-#endif /*INCLUDED_volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_H*/