mirror of
				https://github.com/gnss-sdr/gnss-sdr
				synced 2025-10-31 15:23:04 +00:00 
			
		
		
		
	New ultra-fast All-In-One Carrier wipe-off and Early-Prompt-Late correlator using Intel AVX SSE3 intrinsics.
Try it using the GPS_L1_CA_DLL_PLL_Optim_Tracking implementatioin for tracking operation! git-svn-id: https://svn.code.sf.net/p/gnss-sdr/code/trunk@283 64b25241-fba3-4117-9849-534c7e92360d
This commit is contained in:
		| @@ -45,7 +45,7 @@ project | |||||||
|  |  | ||||||
| project : requirements | project : requirements | ||||||
| <define>OMNITHREAD_POSIX | <define>OMNITHREAD_POSIX | ||||||
| <cxxflags>"-msse2 -mfpmath=sse -std=c++0x -fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-free" | <cxxflags>"-msse2 -msse3 -mfpmath=sse -std=c++0x -fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-free" | ||||||
| <linkflags>"-lgnuradio-blocks -lgnuradio-fft -lgnuradio-filter -larmadillo -lboost_system -lboost_filesystem -lboost_thread -lboost_date_time -llapack -lblas -lprofiler -ltcmalloc -lvolk" | <linkflags>"-lgnuradio-blocks -lgnuradio-fft -lgnuradio-filter -larmadillo -lboost_system -lboost_filesystem -lboost_thread -lboost_date_time -llapack -lblas -lprofiler -ltcmalloc -lvolk" | ||||||
| <include>src/algorithms/acquisition/adapters | <include>src/algorithms/acquisition/adapters | ||||||
| <include>src/algorithms/acquisition/gnuradio_blocks | <include>src/algorithms/acquisition/gnuradio_blocks | ||||||
|   | |||||||
| @@ -98,16 +98,52 @@ void fxp_nco(std::complex<float> *dest, int n_samples,float start_phase_rad, flo | |||||||
| 	for(int i = 0; i < n_samples; i++) | 	for(int i = 0; i < n_samples; i++) | ||||||
| 		{ | 		{ | ||||||
| 			//using temp variables | 			//using temp variables | ||||||
| 			gr_fxpt::sincos(phase_rad_i,&sin_f,&cos_f); | 			gr_fxpt::sincos(-phase_rad_i,&sin_f,&cos_f); | ||||||
| 			dest[i] = gr_complex(cos_f, -sin_f); | 			dest[i] = gr_complex(cos_f, sin_f); | ||||||
| 			//using references (may be it can be a problem for c++11 standard |  | ||||||
| 			//gr_fxpt::sincos(phase_rad_i,&d_carr_sign[i].imag(),&d_carr_sign[i].real()); |  | ||||||
|  |  | ||||||
| 			phase_rad_i += phase_step_rad_i; | 			phase_rad_i += phase_step_rad_i; | ||||||
|  |  | ||||||
| 		} | 		} | ||||||
| } | } | ||||||
|  |  | ||||||
|  | void fxp_nco_cpyref(std::complex<float> *dest, int n_samples,float start_phase_rad, float phase_step_rad) | ||||||
|  | { | ||||||
|  | 	int phase_rad_i; | ||||||
|  | 	phase_rad_i=gr_fxpt::float_to_fixed(start_phase_rad); | ||||||
|  | 	int phase_step_rad_i; | ||||||
|  | 	phase_step_rad_i=gr_fxpt::float_to_fixed(phase_step_rad); | ||||||
|  |  | ||||||
|  | 	float* vector_cpx; | ||||||
|  | 	vector_cpx=(float*)dest; | ||||||
|  | 	for(int i = 0; i < n_samples; i++) | ||||||
|  | 		{ | ||||||
|  | 			//using references (may be it can be a problem for c++11 standard | ||||||
|  | 			//gr_fxpt::sincos(phase_rad_i,&d_carr_sign[i].imag(),&d_carr_sign[i].real()); | ||||||
|  | 			gr_fxpt::sincos(-phase_rad_i,&vector_cpx[i*2+1],&vector_cpx[i*2]); | ||||||
|  | 			phase_rad_i += phase_step_rad_i; | ||||||
|  |  | ||||||
|  | 		} | ||||||
|  | } | ||||||
|  |  | ||||||
|  | void fxp_nco_IQ_split(float* I, float* Q , int n_samples,float start_phase_rad, float phase_step_rad) | ||||||
|  | { | ||||||
|  | 	int phase_rad_i; | ||||||
|  | 	phase_rad_i=gr_fxpt::float_to_fixed(start_phase_rad); | ||||||
|  | 	int phase_step_rad_i; | ||||||
|  | 	phase_step_rad_i=gr_fxpt::float_to_fixed(phase_step_rad); | ||||||
|  |  | ||||||
|  | 	float sin_f,cos_f; | ||||||
|  | 	for(int i = 0; i < n_samples; i++) | ||||||
|  | 		{ | ||||||
|  | 			gr_fxpt::sincos(-phase_rad_i,&sin_f,&cos_f); | ||||||
|  | 			I[i]=cos_f; | ||||||
|  | 			Q[i]=sin_f; | ||||||
|  | 			phase_rad_i += phase_step_rad_i; | ||||||
|  |  | ||||||
|  | 		} | ||||||
|  | } | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| void std_nco(std::complex<float> *dest, int n_samples,float start_phase_rad, float phase_step_rad) | void std_nco(std::complex<float> *dest, int n_samples,float start_phase_rad, float phase_step_rad) | ||||||
| { | { | ||||||
| 	float phase_rad; | 	float phase_rad; | ||||||
|   | |||||||
| @@ -32,11 +32,7 @@ | |||||||
|  * ------------------------------------------------------------------------- |  * ------------------------------------------------------------------------- | ||||||
|  */ |  */ | ||||||
|  |  | ||||||
| /*! |  | ||||||
|  * \brief Implements a conjugate complex exponential vector in std::complex<float> *d_carr_sign |  | ||||||
|  * containing int n_samples, with the starting phase . |  | ||||||
|  * |  | ||||||
|  */ |  | ||||||
|  |  | ||||||
| #ifndef GNSS_SDR_NCO_LIB_CC_H_ | #ifndef GNSS_SDR_NCO_LIB_CC_H_ | ||||||
| #define	GNSS_SDR_NCO_LIB_CC_H_ | #define	GNSS_SDR_NCO_LIB_CC_H_ | ||||||
| @@ -46,10 +42,48 @@ | |||||||
| #include <sse_mathfun.h> | #include <sse_mathfun.h> | ||||||
| #include <cmath> | #include <cmath> | ||||||
|  |  | ||||||
|  | /*! | ||||||
|  |  * \brief Implements a conjugate complex exponential vector in std::complex<float> *d_carr_sign | ||||||
|  |  * containing int n_samples, with the starting phase float start_phase_rad and the pase step between vector elements | ||||||
|  |  * float phase_step_rad. This function uses a SSE CORDIC implementation. | ||||||
|  |  * | ||||||
|  |  */ | ||||||
| void sse_nco(std::complex<float> *dest, int n_samples,float start_phase_rad, float phase_step_rad); | void sse_nco(std::complex<float> *dest, int n_samples,float start_phase_rad, float phase_step_rad); | ||||||
|  |  | ||||||
|  | /*! | ||||||
|  |  * \brief Implements a conjugate complex exponential vector in std::complex<float> *d_carr_sign | ||||||
|  |  * containing int n_samples, with the starting phase float start_phase_rad and the pase step between vector elements | ||||||
|  |  * float phase_step_rad. This function uses the GNU Radio fixed point CORDIC implementation. | ||||||
|  |  * | ||||||
|  |  */ | ||||||
| void fxp_nco(std::complex<float> *dest, int n_samples,float start_phase_rad, float phase_step_rad); | void fxp_nco(std::complex<float> *dest, int n_samples,float start_phase_rad, float phase_step_rad); | ||||||
|  |  | ||||||
|  | /*! | ||||||
|  |  * \brief Implements a conjugate complex exponential vector in std::complex<float> *d_carr_sign | ||||||
|  |  * containing int n_samples, with the starting phase float start_phase_rad and the pase step between vector elements | ||||||
|  |  * float phase_step_rad. This function uses the stdlib sin() and cos() implementation. | ||||||
|  |  * | ||||||
|  |  */ | ||||||
|  |  | ||||||
| void std_nco(std::complex<float> *dest, int n_samples,float start_phase_rad, float phase_step_rad); | void std_nco(std::complex<float> *dest, int n_samples,float start_phase_rad, float phase_step_rad); | ||||||
|  |  | ||||||
|  | /*! | ||||||
|  |  * \brief Implements a conjugate complex exponential vector in std::complex<float> *d_carr_sign | ||||||
|  |  * containing int n_samples, with the starting phase float start_phase_rad and the pase step between vector elements | ||||||
|  |  * float phase_step_rad. This function uses the GNU Radio fixed point CORDIC implementation. | ||||||
|  |  * | ||||||
|  |  */ | ||||||
|  |  | ||||||
|  | void fxp_nco_cpyref(std::complex<float> *dest, int n_samples,float start_phase_rad, float phase_step_rad); | ||||||
|  |  | ||||||
|  |  | ||||||
|  | /*! | ||||||
|  |  * \brief Implements a conjugate complex exponential vector in two separated float arrays (In-phase and Quadrature) | ||||||
|  |  * containing int n_samples, with the starting phase float start_phase_rad and the pase step between vector elements | ||||||
|  |  * float phase_step_rad. This function uses the GNU Radio fixed point CORDIC implementation. | ||||||
|  |  * | ||||||
|  |  */ | ||||||
|  |  | ||||||
|  | void fxp_nco_IQ_split(float* I, float* Q, int n_samples,float start_phase_rad, float phase_step_rad); | ||||||
|  |  | ||||||
| #endif //NCO_LIB_CC_H | #endif //NCO_LIB_CC_H | ||||||
|   | |||||||
| @@ -397,7 +397,7 @@ int Gps_L1_Ca_Dll_Pll_Optim_Tracking_cc::general_work (int noutput_items, gr_vec | |||||||
| 			update_local_carrier(); | 			update_local_carrier(); | ||||||
|  |  | ||||||
|             // perform Early, Prompt and Late correlation |             // perform Early, Prompt and Late correlation | ||||||
|             d_correlator.Carrier_wipeoff_and_EPL_volk(d_current_prn_length_samples, |             d_correlator.Carrier_wipeoff_and_EPL_volk_custom(d_current_prn_length_samples, | ||||||
|                     in, |                     in, | ||||||
|                     d_carr_sign, |                     d_carr_sign, | ||||||
|                     d_early_code, |                     d_early_code, | ||||||
|   | |||||||
| @@ -37,6 +37,9 @@ | |||||||
| #include <gnuradio/gr_block.h> | #include <gnuradio/gr_block.h> | ||||||
| #include "correlator.h" | #include "correlator.h" | ||||||
|  |  | ||||||
|  | #define LV_HAVE_SSE3 | ||||||
|  | #include "volk_cw_epl_corr.h" | ||||||
|  |  | ||||||
| unsigned long Correlator::next_power_2(unsigned long v) | unsigned long Correlator::next_power_2(unsigned long v) | ||||||
| { | { | ||||||
|     v--; |     v--; | ||||||
| @@ -109,6 +112,11 @@ void Correlator::Carrier_wipeoff_and_EPL_volk(int signal_length_samples,const gr | |||||||
|     //} |     //} | ||||||
| } | } | ||||||
|  |  | ||||||
|  | void Correlator::Carrier_wipeoff_and_EPL_volk_custom(int signal_length_samples,const gr_complex* input, gr_complex* carrier,gr_complex* E_code, gr_complex* P_code, gr_complex* L_code,gr_complex* E_out, gr_complex* P_out, gr_complex* L_out, bool input_vector_unaligned) | ||||||
|  | { | ||||||
|  |     volk_cw_epl_corr_u(input, carrier, E_code, P_code, L_code, E_out, P_out, L_out, signal_length_samples); | ||||||
|  | } | ||||||
|  |  | ||||||
| void Correlator::Carrier_wipeoff_and_VEPL_volk(int signal_length_samples,const gr_complex* input, gr_complex* carrier,gr_complex* VE_code,gr_complex* E_code, gr_complex* P_code, gr_complex* L_code,gr_complex* VL_code,gr_complex* VE_out,gr_complex* E_out, gr_complex* P_out, gr_complex* L_out,gr_complex* VL_out,bool input_vector_aligned) | void Correlator::Carrier_wipeoff_and_VEPL_volk(int signal_length_samples,const gr_complex* input, gr_complex* carrier,gr_complex* VE_code,gr_complex* E_code, gr_complex* P_code, gr_complex* L_code,gr_complex* VL_code,gr_complex* VE_out,gr_complex* E_out, gr_complex* P_out, gr_complex* L_out,gr_complex* VL_out,bool input_vector_aligned) | ||||||
| { | { | ||||||
|     gr_complex* bb_signal; |     gr_complex* bb_signal; | ||||||
|   | |||||||
| @@ -54,6 +54,7 @@ class Correlator | |||||||
| public: | public: | ||||||
|     void Carrier_wipeoff_and_EPL_generic(int signal_length_samples,const gr_complex* input, gr_complex* carrier,gr_complex* E_code, gr_complex* P_code, gr_complex* L_code,gr_complex* E_out, gr_complex* P_out, gr_complex* L_out); |     void Carrier_wipeoff_and_EPL_generic(int signal_length_samples,const gr_complex* input, gr_complex* carrier,gr_complex* E_code, gr_complex* P_code, gr_complex* L_code,gr_complex* E_out, gr_complex* P_out, gr_complex* L_out); | ||||||
|     void Carrier_wipeoff_and_EPL_volk(int signal_length_samples,const gr_complex* input, gr_complex* carrier,gr_complex* E_code, gr_complex* P_code, gr_complex* L_code,gr_complex* E_out, gr_complex* P_out, gr_complex* L_out,bool input_vector_aligned); |     void Carrier_wipeoff_and_EPL_volk(int signal_length_samples,const gr_complex* input, gr_complex* carrier,gr_complex* E_code, gr_complex* P_code, gr_complex* L_code,gr_complex* E_out, gr_complex* P_out, gr_complex* L_out,bool input_vector_aligned); | ||||||
|  |     void Carrier_wipeoff_and_EPL_volk_custom(int signal_length_samples,const gr_complex* input, gr_complex* carrier,gr_complex* E_code, gr_complex* P_code, gr_complex* L_code,gr_complex* E_out, gr_complex* P_out, gr_complex* L_out,bool input_vector_aligned); | ||||||
|     void Carrier_wipeoff_and_VEPL_volk(int signal_length_samples,const gr_complex* input, gr_complex* carrier,gr_complex* VE_code,gr_complex* E_code, gr_complex* P_code, gr_complex* L_code,gr_complex* VL_code,gr_complex* VE_out,gr_complex* E_out, gr_complex* P_out, gr_complex* L_out,gr_complex* VL_out,bool input_vector_unaligned); |     void Carrier_wipeoff_and_VEPL_volk(int signal_length_samples,const gr_complex* input, gr_complex* carrier,gr_complex* VE_code,gr_complex* E_code, gr_complex* P_code, gr_complex* L_code,gr_complex* VL_code,gr_complex* VE_out,gr_complex* E_out, gr_complex* P_out, gr_complex* L_out,gr_complex* VL_out,bool input_vector_unaligned); | ||||||
|     Correlator(); |     Correlator(); | ||||||
|     ~Correlator(); |     ~Correlator(); | ||||||
|   | |||||||
							
								
								
									
										203
									
								
								src/algorithms/tracking/libs/volk_cw_epl_corr.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										203
									
								
								src/algorithms/tracking/libs/volk_cw_epl_corr.h
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,203 @@ | |||||||
|  | /*! | ||||||
|  |  * \file volk_cw_epl_corr.h | ||||||
|  |  * \brief Implements the carrier wipeoff function and the Early Prompt Late correlators in a single SSE-enabled loop. | ||||||
|  |  * | ||||||
|  |  * \author Javier Arribas 2012, jarribas(at)cttc.es | ||||||
|  |  * | ||||||
|  |  * Detailed description of the file here if needed. | ||||||
|  |  * | ||||||
|  |  * ------------------------------------------------------------------------- | ||||||
|  |  * | ||||||
|  |  * Copyright (C) 2010-2012  (see AUTHORS file for a list of contributors) | ||||||
|  |  * | ||||||
|  |  * GNSS-SDR is a software defined Global Navigation | ||||||
|  |  *          Satellite Systems receiver | ||||||
|  |  * | ||||||
|  |  * This file is part of GNSS-SDR. | ||||||
|  |  * | ||||||
|  |  * GNSS-SDR is free software: you can redistribute it and/or modify | ||||||
|  |  * it under the terms of the GNU General Public License as published by | ||||||
|  |  * the Free Software Foundation, either version 3 of the License, or | ||||||
|  |  * at your option) any later version. | ||||||
|  |  * | ||||||
|  |  * GNSS-SDR is distributed in the hope that it will be useful, | ||||||
|  |  * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||||
|  |  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | ||||||
|  |  * GNU General Public License for more details. | ||||||
|  |  * | ||||||
|  |  * You should have received a copy of the GNU General Public License | ||||||
|  |  * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>. | ||||||
|  |  * | ||||||
|  |  * ------------------------------------------------------------------------- | ||||||
|  |  */ | ||||||
|  |  | ||||||
|  | #ifndef INCLUDED_volk_cw_epl_corr_H | ||||||
|  | #define INCLUDED_volk_cw_epl_corr_H | ||||||
|  |  | ||||||
|  | #include <inttypes.h> | ||||||
|  | #include <stdio.h> | ||||||
|  | #include <volk/volk_complex.h> | ||||||
|  | #include <float.h> | ||||||
|  | #include <string.h> | ||||||
|  |  | ||||||
|  | /*! | ||||||
|  |  * TODO: Code the SSE4 version and benchmark it | ||||||
|  |  */ | ||||||
|  | #ifdef LV_HAVE_SSE3 | ||||||
|  | #include <pmmintrin.h> | ||||||
|  |   /*! | ||||||
|  |     \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation | ||||||
|  |     \param input The input signal input | ||||||
|  |     \param carrier The carrier signal input | ||||||
|  |     \param E_code Early PRN code replica input | ||||||
|  |     \param P_code Early PRN code replica input | ||||||
|  |     \param L_code Early PRN code replica input | ||||||
|  |     \param E_out Early correlation output | ||||||
|  |     \param P_out Early correlation output | ||||||
|  |     \param L_out Early correlation output | ||||||
|  |     \param num_points The number of complex values in vectors | ||||||
|  |   */ | ||||||
|  |  | ||||||
|  | static inline void volk_cw_epl_corr_u(const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out,  unsigned int num_points){ | ||||||
|  | 	unsigned int number = 0; | ||||||
|  |     const unsigned int halfPoints = num_points / 2; | ||||||
|  |  | ||||||
|  |     lv_32fc_t dotProduct_E; | ||||||
|  |     memset(&dotProduct_E, 0x0, 2*sizeof(float)); | ||||||
|  |     lv_32fc_t dotProduct_P; | ||||||
|  |     memset(&dotProduct_P, 0x0, 2*sizeof(float)); | ||||||
|  |     lv_32fc_t dotProduct_L; | ||||||
|  |     memset(&dotProduct_L, 0x0, 2*sizeof(float)); | ||||||
|  |  | ||||||
|  |     // Aux vars | ||||||
|  |     __m128 x, y, yl, yh, z, tmp1, tmp2, z_E, z_P, z_L; | ||||||
|  |  | ||||||
|  |     z_E=_mm_setzero_ps(); | ||||||
|  |     z_P=_mm_setzero_ps(); | ||||||
|  |     z_L=_mm_setzero_ps(); | ||||||
|  |  | ||||||
|  |     //input and output vectors | ||||||
|  |     //lv_32fc_t* _input_BB = input_BB; | ||||||
|  |     const lv_32fc_t* _input = input; | ||||||
|  |     const lv_32fc_t* _carrier = carrier; | ||||||
|  |     const lv_32fc_t* _E_code = E_code; | ||||||
|  |     const lv_32fc_t* _P_code = P_code; | ||||||
|  |     const lv_32fc_t* _L_code = L_code; | ||||||
|  |  | ||||||
|  |     for(;number < halfPoints; number++){ | ||||||
|  |  | ||||||
|  |       // carrier wipe-off (vector point-to-point product) | ||||||
|  |       x = _mm_loadu_ps((float*)_input); // Load the ar + ai, br + bi as ar,ai,br,bi | ||||||
|  |       y = _mm_loadu_ps((float*)_carrier); // Load the cr + ci, dr + di as cr,ci,dr,di | ||||||
|  |  | ||||||
|  |       yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr | ||||||
|  |       yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di | ||||||
|  |  | ||||||
|  |       tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr | ||||||
|  |  | ||||||
|  |       x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br | ||||||
|  |  | ||||||
|  |       tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di | ||||||
|  |  | ||||||
|  |       z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di | ||||||
|  |  | ||||||
|  |       //_mm_storeu_ps((float*)_input_BB,z); // Store the results back into the _input_BB container | ||||||
|  |  | ||||||
|  |       // correlation E,P,L (3x vector scalar product) | ||||||
|  |       // Early | ||||||
|  |       //x = _mm_load_ps((float*)_input_BB); // Load the ar + ai, br + bi as ar,ai,br,bi | ||||||
|  |       x=z; | ||||||
|  |  | ||||||
|  |       y = _mm_load_ps((float*)_E_code); // Load the cr + ci, dr + di as cr,ci,dr,di | ||||||
|  |  | ||||||
|  |       yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr | ||||||
|  |       yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di | ||||||
|  |  | ||||||
|  |       tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr | ||||||
|  |  | ||||||
|  |       x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br | ||||||
|  |  | ||||||
|  |       tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di | ||||||
|  |  | ||||||
|  |       z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di | ||||||
|  |  | ||||||
|  |       z_E = _mm_add_ps(z_E, z); // Add the complex multiplication results together | ||||||
|  |  | ||||||
|  |       // Prompt | ||||||
|  |       //x = _mm_load_ps((float*)_input_BB); // Load the ar + ai, br + bi as ar,ai,br,bi | ||||||
|  |       y = _mm_load_ps((float*)_P_code); // Load the cr + ci, dr + di as cr,ci,dr,di | ||||||
|  |  | ||||||
|  |       yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr | ||||||
|  |       yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di | ||||||
|  |  | ||||||
|  |       x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br | ||||||
|  |  | ||||||
|  |       tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr | ||||||
|  |  | ||||||
|  |       x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br | ||||||
|  |  | ||||||
|  |       tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di | ||||||
|  |  | ||||||
|  |       z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di | ||||||
|  |  | ||||||
|  |       z_P = _mm_add_ps(z_P, z); // Add the complex multiplication results together | ||||||
|  |  | ||||||
|  |       // Late | ||||||
|  |       //x = _mm_load_ps((float*)_input_BB); // Load the ar + ai, br + bi as ar,ai,br,bi | ||||||
|  |       y = _mm_load_ps((float*)_L_code); // Load the cr + ci, dr + di as cr,ci,dr,di | ||||||
|  |  | ||||||
|  |       yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr | ||||||
|  |       yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di | ||||||
|  |  | ||||||
|  |       x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br | ||||||
|  |  | ||||||
|  |       tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr | ||||||
|  |  | ||||||
|  |       x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br | ||||||
|  |  | ||||||
|  |       tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di | ||||||
|  |  | ||||||
|  |       z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di | ||||||
|  |  | ||||||
|  |       z_L = _mm_add_ps(z_L, z); // Add the complex multiplication results together | ||||||
|  |  | ||||||
|  |       /*pointer increment*/ | ||||||
|  |       _carrier += 2; | ||||||
|  |       _input += 2; | ||||||
|  |       //_input_BB += 2; | ||||||
|  |       _E_code += 2; | ||||||
|  |       _P_code += 2; | ||||||
|  |       _L_code +=2; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |  | ||||||
|  |     __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_E[2]; | ||||||
|  |     __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_P[2]; | ||||||
|  |     __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_L[2]; | ||||||
|  |     //__VOLK_ATTR_ALIGNED(16) lv_32fc_t _input_BB; | ||||||
|  |  | ||||||
|  |     _mm_store_ps((float*)dotProductVector_E,z_E); // Store the results back into the dot product vector | ||||||
|  |     _mm_store_ps((float*)dotProductVector_P,z_P); // Store the results back into the dot product vector | ||||||
|  |     _mm_store_ps((float*)dotProductVector_L,z_L); // Store the results back into the dot product vector | ||||||
|  |  | ||||||
|  |     dotProduct_E += ( dotProductVector_E[0] + dotProductVector_E[1] ); | ||||||
|  |     dotProduct_P += ( dotProductVector_P[0] + dotProductVector_P[1] ); | ||||||
|  |     dotProduct_L += ( dotProductVector_L[0] + dotProductVector_L[1] ); | ||||||
|  |  | ||||||
|  |     if((num_points % 2) != 0) { | ||||||
|  |       //_input_BB = (*_input) * (*_carrier); | ||||||
|  |   	  dotProduct_E += (*_input) * (*_E_code)*(*_carrier); | ||||||
|  |   	  dotProduct_P += (*_input) * (*_P_code)*(*_carrier); | ||||||
|  |   	  dotProduct_L += (*_input) * (*_L_code)*(*_carrier); | ||||||
|  |  | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     *E_out = dotProduct_E; | ||||||
|  |     *P_out = dotProduct_P; | ||||||
|  |     *L_out = dotProduct_L; | ||||||
|  | } | ||||||
|  |  | ||||||
|  |  | ||||||
|  | #endif /* LV_HAVE_SSE */ | ||||||
|  |  | ||||||
|  | #endif /* INCLUDED_volk_cw_epl_corr_H */ | ||||||
| @@ -49,6 +49,8 @@ TEST(Cordic_Test, StandardCIsFasterThanCordic) | |||||||
|     cordicPtr = new Cordic(largest_k); |     cordicPtr = new Cordic(largest_k); | ||||||
|  |  | ||||||
|     std::complex<float> *d_carr_sign; |     std::complex<float> *d_carr_sign; | ||||||
|  |     float* d_carr_sign_I; | ||||||
|  |     float* d_carr_sign_Q; | ||||||
|     // carrier parameters |     // carrier parameters | ||||||
|     int d_vector_length=4000; |     int d_vector_length=4000; | ||||||
|     float phase_rad; |     float phase_rad; | ||||||
| @@ -59,6 +61,10 @@ TEST(Cordic_Test, StandardCIsFasterThanCordic) | |||||||
|  |  | ||||||
|     // space for carrier wipeoff and signal baseband vectors |     // space for carrier wipeoff and signal baseband vectors | ||||||
|     if (posix_memalign((void**)&d_carr_sign, 16, d_vector_length * sizeof(std::complex<float>) * 2) == 0){}; |     if (posix_memalign((void**)&d_carr_sign, 16, d_vector_length * sizeof(std::complex<float>) * 2) == 0){}; | ||||||
|  |  | ||||||
|  |     if (posix_memalign((void**)&d_carr_sign_I, 16, d_vector_length * sizeof(float) * 2) == 0){}; | ||||||
|  |     if (posix_memalign((void**)&d_carr_sign_Q, 16, d_vector_length * sizeof(float) * 2) == 0){}; | ||||||
|  |  | ||||||
|     double sin_d,cos_d; |     double sin_d,cos_d; | ||||||
|     double sin_f,cos_f; |     double sin_f,cos_f; | ||||||
|  |  | ||||||
| @@ -123,6 +129,27 @@ TEST(Cordic_Test, StandardCIsFasterThanCordic) | |||||||
|     long long int end4 = tv.tv_sec *1000000 + tv.tv_usec; |     long long int end4 = tv.tv_sec *1000000 + tv.tv_usec; | ||||||
|  |  | ||||||
|  |  | ||||||
|  |     //*** GNU RADIO FIXED POINT ARITHMETIC COPY BY REFERENCE******** | ||||||
|  |     gettimeofday(&tv, NULL); | ||||||
|  |     long long int begin5 = tv.tv_sec * 1000000 + tv.tv_usec; | ||||||
|  |     for(int i=0; i<niter; i++) | ||||||
|  |         { | ||||||
|  |         	fxp_nco_cpyref(d_carr_sign, d_vector_length,0, phase_step_rad); | ||||||
|  |         } | ||||||
|  |     gettimeofday(&tv, NULL); | ||||||
|  |     long long int end5 = tv.tv_sec *1000000 + tv.tv_usec; | ||||||
|  |  | ||||||
|  |  | ||||||
|  |     //*** GNU RADIO FIXED POINT ARITHMETIC COPY BY REFERENCE SPLIT IQ******** | ||||||
|  |     gettimeofday(&tv, NULL); | ||||||
|  |     long long int begin6 = tv.tv_sec * 1000000 + tv.tv_usec; | ||||||
|  |     for(int i=0; i<niter; i++) | ||||||
|  |         { | ||||||
|  |         	fxp_nco_IQ_split(d_carr_sign_I, d_carr_sign_Q, d_vector_length,0, phase_step_rad); | ||||||
|  |         } | ||||||
|  |     gettimeofday(&tv, NULL); | ||||||
|  |     long long int end6 = tv.tv_sec *1000000 + tv.tv_usec; | ||||||
|  |  | ||||||
|     delete cordicPtr; |     delete cordicPtr; | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -130,6 +157,7 @@ TEST(Cordic_Test, StandardCIsFasterThanCordic) | |||||||
|     std::cout << "STD LIB ARITHM computed " << niter << " times in " << (end2-begin2) << " microseconds" << std::endl; |     std::cout << "STD LIB ARITHM computed " << niter << " times in " << (end2-begin2) << " microseconds" << std::endl; | ||||||
|     std::cout << "FXPT CORDIC computed " << niter << " times in " << (end3-begin3) << " microseconds" << std::endl; |     std::cout << "FXPT CORDIC computed " << niter << " times in " << (end3-begin3) << " microseconds" << std::endl; | ||||||
|     std::cout << "SSE CORDIC computed " << niter << " times in " << (end4-begin4) << " microseconds" << std::endl; |     std::cout << "SSE CORDIC computed " << niter << " times in " << (end4-begin4) << " microseconds" << std::endl; | ||||||
|  |     std::cout << "FXPT CORDIC CPY REF computed " << niter << " times in " << (end5-begin5) << " microseconds" << std::endl; | ||||||
|  |     std::cout << "FXPT CORDIC CPY REF SPLIT computed " << niter << " times in " << (end6-begin6) << " microseconds" << std::endl; | ||||||
|     EXPECT_TRUE((end2-begin2) < (end1-begin1)); // if true, standard C++ is faster than the cordic implementation |     EXPECT_TRUE((end2-begin2) < (end1-begin1)); // if true, standard C++ is faster than the cordic implementation | ||||||
| } | } | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	 Javier Arribas
					Javier Arribas