mirror of
				https://github.com/gnss-sdr/gnss-sdr
				synced 2025-10-31 15:23:04 +00:00 
			
		
		
		
	Migrating cuda tracking internal DLL/PLL vars from float to double
This commit is contained in:
		| @@ -17,10 +17,10 @@ ControlThread.wait_for_flowgraph=false | ||||
| SignalSource.implementation=File_Signal_Source | ||||
|  | ||||
| ;#filename: path to file with the captured GNSS signal samples to be processed | ||||
| SignalSource.filename=/media/javier/SISTEMA/signals/New York/4msps.dat | ||||
| SignalSource.filename=/home/javier/ClionProjects/gnss-sim/build/signal_out.bin | ||||
|  | ||||
| ;#item_type: Type and resolution for each of the signal samples. Use only gr_complex in this version. | ||||
| SignalSource.item_type=gr_complex | ||||
| SignalSource.item_type=byte | ||||
|  | ||||
| ;#sampling_frequency: Original Signal sampling frequency in [Hz]  | ||||
| SignalSource.sampling_frequency=4000000 | ||||
| @@ -28,12 +28,6 @@ SignalSource.sampling_frequency=4000000 | ||||
| ;#freq: RF front-end center frequency in [Hz]  | ||||
| SignalSource.freq=1575420000 | ||||
|  | ||||
| ;#gain: Front-end Gain in [dB]  | ||||
| SignalSource.gain=60 | ||||
|  | ||||
| ;#subdevice: UHD subdevice specification (for USRP1 use A:0 or B:0) | ||||
| SignalSource.subdevice=B:0 | ||||
|  | ||||
| ;#samples: Number of samples to be processed. Notice that 0 indicates the entire file. | ||||
| SignalSource.samples=0 | ||||
|  | ||||
| @@ -58,12 +52,12 @@ SignalSource.enable_throttle_control=false | ||||
| ;#[Pass_Through] disables this block and the [DataTypeAdapter], [InputFilter] and [Resampler] blocks | ||||
| ;#[Signal_Conditioner] enables this block. Then you have to configure [DataTypeAdapter], [InputFilter] and [Resampler] blocks | ||||
| ;SignalConditioner.implementation=Signal_Conditioner | ||||
| SignalConditioner.implementation=Pass_Through | ||||
| SignalConditioner.implementation=Signal_Conditioner | ||||
|  | ||||
| ;######### DATA_TYPE_ADAPTER CONFIG ############ | ||||
| ;## Changes the type of input data. Please disable it in this version. | ||||
| ;#implementation: [Pass_Through] disables this block | ||||
| DataTypeAdapter.implementation=Pass_Through | ||||
| DataTypeAdapter.implementation=Ibyte_To_Complex | ||||
|  | ||||
| ;######### INPUT_FILTER CONFIG ############ | ||||
| ;## Filter the input data. Can be combined with frequency translation for IF signals | ||||
| @@ -210,13 +204,13 @@ Acquisition_GPS.sampled_ms=1 | ||||
| ;#implementation: Acquisition algorithm selection for this channel: [GPS_L1_CA_PCPS_Acquisition] or [Galileo_E1_PCPS_Ambiguous_Acquisition] | ||||
| Acquisition_GPS.implementation=GPS_L1_CA_PCPS_Acquisition | ||||
| ;#threshold: Acquisition threshold | ||||
| Acquisition_GPS.threshold=0.005 | ||||
| Acquisition_GPS.threshold=0.06 | ||||
| ;#pfa: Acquisition false alarm probability. This option overrides the threshold option. Only use with implementations: [GPS_L1_CA_PCPS_Acquisition] or [Galileo_E1_PCPS_Ambiguous_Acquisition]  | ||||
| ;Acquisition_GPS.pfa=0.01 | ||||
| ;#doppler_max: Maximum expected Doppler shift [Hz] | ||||
| Acquisition_GPS.doppler_max=10000 | ||||
| Acquisition_GPS.doppler_max=6000 | ||||
| ;#doppler_max: Doppler step in the grid search [Hz] | ||||
| Acquisition_GPS.doppler_step=500 | ||||
| Acquisition_GPS.doppler_step=100 | ||||
|  | ||||
| ;######### TRACKING GLOBAL CONFIG ############ | ||||
|  | ||||
| @@ -235,7 +229,7 @@ Tracking_GPS.dump=true | ||||
| Tracking_GPS.dump_filename=../data/epl_tracking_ch_ | ||||
|  | ||||
| ;#pll_bw_hz: PLL loop filter bandwidth [Hz] | ||||
| Tracking_GPS.pll_bw_hz=55.0; | ||||
| Tracking_GPS.pll_bw_hz=15.0; | ||||
|  | ||||
| ;#dll_bw_hz: DLL loop filter bandwidth [Hz] | ||||
| Tracking_GPS.dll_bw_hz=1.5 | ||||
|   | ||||
| @@ -233,7 +233,7 @@ Acquisition_1B.doppler_step=125 | ||||
| ;######### TRACKING GPS CONFIG ############ | ||||
|  | ||||
| ;#implementation: Selected tracking algorithm: [GPS_L1_CA_DLL_PLL_Tracking] or [GPS_L1_CA_DLL_FLL_PLL_Tracking] or [GPS_L1_CA_TCP_CONNECTOR_Tracking] or [Galileo_E1_DLL_PLL_VEML_Tracking] | ||||
| Tracking_1C.implementation=GPS_L1_CA_DLL_PLL_Artemisa_Tracking | ||||
| Tracking_1C.implementation=GPS_L1_CA_DLL_PLL_Tracking | ||||
| ;#item_type: Type and resolution for each of the signal samples. Use only [gr_complex] in this version. | ||||
| Tracking_1C.item_type=gr_complex | ||||
|  | ||||
|   | ||||
| @@ -195,30 +195,30 @@ void Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::start_tracking() | ||||
|     d_acq_sample_stamp =  d_acquisition_gnss_synchro->Acq_samplestamp_samples; | ||||
|  | ||||
|     long int acq_trk_diff_samples; | ||||
|     float acq_trk_diff_seconds; | ||||
|     double acq_trk_diff_seconds; | ||||
|     acq_trk_diff_samples = static_cast<long int>(d_sample_counter) - static_cast<long int>(d_acq_sample_stamp);//-d_vector_length; | ||||
|     DLOG(INFO) << "Number of samples between Acquisition and Tracking =" << acq_trk_diff_samples; | ||||
|     acq_trk_diff_seconds = static_cast<float>(acq_trk_diff_samples) / static_cast<float>(d_fs_in); | ||||
|     acq_trk_diff_seconds = static_cast<double>(acq_trk_diff_samples) / static_cast<double>(d_fs_in); | ||||
|     //doppler effect | ||||
|     // Fd=(C/(C+Vr))*F | ||||
|     float radial_velocity = (GPS_L1_FREQ_HZ + d_acq_carrier_doppler_hz) / GPS_L1_FREQ_HZ; | ||||
|     double radial_velocity = (GPS_L1_FREQ_HZ + d_acq_carrier_doppler_hz) / GPS_L1_FREQ_HZ; | ||||
|     // new chip and prn sequence periods based on acq Doppler | ||||
|     float T_chip_mod_seconds; | ||||
|     float T_prn_mod_seconds; | ||||
|     float T_prn_mod_samples; | ||||
|     double T_chip_mod_seconds; | ||||
|     double T_prn_mod_seconds; | ||||
|     double T_prn_mod_samples; | ||||
|     d_code_freq_chips = radial_velocity * GPS_L1_CA_CODE_RATE_HZ; | ||||
|     T_chip_mod_seconds = 1/d_code_freq_chips; | ||||
|     T_chip_mod_seconds = 1.0/d_code_freq_chips; | ||||
|     T_prn_mod_seconds = T_chip_mod_seconds * GPS_L1_CA_CODE_LENGTH_CHIPS; | ||||
|     T_prn_mod_samples = T_prn_mod_seconds * static_cast<float>(d_fs_in); | ||||
|     T_prn_mod_samples = T_prn_mod_seconds * static_cast<double>(d_fs_in); | ||||
|  | ||||
|     d_current_prn_length_samples = round(T_prn_mod_samples); | ||||
|  | ||||
|     float T_prn_true_seconds = GPS_L1_CA_CODE_LENGTH_CHIPS / GPS_L1_CA_CODE_RATE_HZ; | ||||
|     float T_prn_true_samples = T_prn_true_seconds * static_cast<float>(d_fs_in); | ||||
|     float T_prn_diff_seconds=  T_prn_true_seconds - T_prn_mod_seconds; | ||||
|     float N_prn_diff = acq_trk_diff_seconds / T_prn_true_seconds; | ||||
|     float corrected_acq_phase_samples, delay_correction_samples; | ||||
|     corrected_acq_phase_samples = fmod((d_acq_code_phase_samples + T_prn_diff_seconds * N_prn_diff * static_cast<float>(d_fs_in)), T_prn_true_samples); | ||||
|     double T_prn_true_seconds = GPS_L1_CA_CODE_LENGTH_CHIPS / GPS_L1_CA_CODE_RATE_HZ; | ||||
|     double T_prn_true_samples = T_prn_true_seconds * static_cast<double>(d_fs_in); | ||||
|     double T_prn_diff_seconds=  T_prn_true_seconds - T_prn_mod_seconds; | ||||
|     double N_prn_diff = acq_trk_diff_seconds / T_prn_true_seconds; | ||||
|     double corrected_acq_phase_samples, delay_correction_samples; | ||||
|     corrected_acq_phase_samples = fmod((d_acq_code_phase_samples + T_prn_diff_seconds * N_prn_diff * static_cast<double>(d_fs_in)), T_prn_true_samples); | ||||
|     if (corrected_acq_phase_samples < 0) | ||||
|         { | ||||
|             corrected_acq_phase_samples = T_prn_mod_samples + corrected_acq_phase_samples; | ||||
| @@ -286,10 +286,10 @@ int Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::general_work (int noutput_items, gr_vecto | ||||
|         gr_vector_const_void_star &input_items, gr_vector_void_star &output_items) | ||||
| { | ||||
|     // process vars | ||||
|     float carr_error_hz=0.0; | ||||
|     float carr_error_filt_hz=0.0; | ||||
|     float code_error_chips=0.0; | ||||
|     float code_error_filt_chips=0.0; | ||||
| 	double carr_error_hz=0.0; | ||||
| 	double carr_error_filt_hz=0.0; | ||||
| 	double code_error_chips=0.0; | ||||
| 	double code_error_filt_chips=0.0; | ||||
|  | ||||
|     // Block input data and block output stream pointers | ||||
|     const gr_complex* in = (gr_complex*) input_items[0]; | ||||
| @@ -320,20 +320,24 @@ int Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::general_work (int noutput_items, gr_vecto | ||||
|             current_synchro_data = *d_acquisition_gnss_synchro; | ||||
|  | ||||
|             // UPDATE NCO COMMAND | ||||
|             float phase_step_rad = static_cast<float>(GPS_TWO_PI) * d_carrier_doppler_hz / static_cast<float>(d_fs_in); | ||||
|             double phase_step_rad = GPS_TWO_PI * d_carrier_doppler_hz / static_cast<double>(d_fs_in); | ||||
|  | ||||
|         	//code resampler on GPU (new) | ||||
|             float code_phase_step_chips = static_cast<float>(d_code_freq_chips) / static_cast<float>(d_fs_in); | ||||
|             float rem_code_phase_chips = d_rem_code_phase_samples * (d_code_freq_chips / d_fs_in); | ||||
|             double code_phase_step_chips = d_code_freq_chips / static_cast<double>(d_fs_in); | ||||
|             double rem_code_phase_chips = d_rem_code_phase_samples * (d_code_freq_chips / d_fs_in); | ||||
|  | ||||
|             memcpy(in_gpu, in, sizeof(gr_complex) * d_current_prn_length_samples); | ||||
|             cudaProfilerStart(); | ||||
|             multicorrelator_gpu->Carrier_wipeoff_multicorrelator_resampler_cuda(d_rem_carr_phase_rad, phase_step_rad, code_phase_step_chips, rem_code_phase_chips, d_current_prn_length_samples, 3); | ||||
|             multicorrelator_gpu->Carrier_wipeoff_multicorrelator_resampler_cuda( static_cast<float>(d_rem_carr_phase_rad), | ||||
|             		static_cast<float>(phase_step_rad), | ||||
|             		static_cast<float>(code_phase_step_chips), | ||||
|             		static_cast<float>(rem_code_phase_chips), | ||||
|             		d_current_prn_length_samples, 3); | ||||
|             cudaProfilerStop(); | ||||
|  | ||||
|             // ################## PLL ########################################################## | ||||
|             // PLL discriminator | ||||
|             carr_error_hz = pll_cloop_two_quadrant_atan(*d_Prompt) / static_cast<float>(GPS_TWO_PI); | ||||
|             carr_error_hz = pll_cloop_two_quadrant_atan(*d_Prompt) / GPS_TWO_PI; | ||||
|             // Carrier discriminator filter | ||||
|             carr_error_filt_hz = d_carrier_loop_filter.get_carrier_nco(carr_error_hz); | ||||
|             // New carrier Doppler frequency estimation | ||||
| @@ -352,7 +356,7 @@ int Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::general_work (int noutput_items, gr_vecto | ||||
|             // Code discriminator filter | ||||
|             code_error_filt_chips = d_code_loop_filter.get_code_nco(code_error_chips); //[chips/second] | ||||
|             //Code phase accumulator | ||||
|             float code_error_filt_secs; | ||||
|             double code_error_filt_secs; | ||||
|             code_error_filt_secs = (GPS_L1_CA_CODE_PERIOD * code_error_filt_chips) / GPS_L1_CA_CODE_RATE_HZ; //[seconds] | ||||
|             d_acc_code_phase_secs = d_acc_code_phase_secs + code_error_filt_secs; | ||||
|  | ||||
| @@ -363,10 +367,10 @@ int Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::general_work (int noutput_items, gr_vecto | ||||
|             double T_prn_samples; | ||||
|             double K_blk_samples; | ||||
|             // Compute the next buffer length based in the new period of the PRN sequence and the code phase error estimation | ||||
|             T_chip_seconds = 1 / static_cast<double>(d_code_freq_chips); | ||||
|             T_chip_seconds = 1.0 / d_code_freq_chips; | ||||
|             T_prn_seconds = T_chip_seconds * GPS_L1_CA_CODE_LENGTH_CHIPS; | ||||
|             T_prn_samples = T_prn_seconds * static_cast<double>(d_fs_in); | ||||
|             K_blk_samples = T_prn_samples + d_rem_code_phase_samples + static_cast<double>(code_error_filt_secs) * static_cast<double>(d_fs_in); | ||||
|             K_blk_samples = T_prn_samples + d_rem_code_phase_samples + code_error_filt_secs * static_cast<double>(d_fs_in); | ||||
|             //d_rem_code_phase_samples = K_blk_samples - d_current_prn_length_samples; //rounding error < 1 sample | ||||
|  | ||||
|             // ####### CN0 ESTIMATION AND LOCK DETECTORS ###### | ||||
| @@ -415,16 +419,16 @@ int Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::general_work (int noutput_items, gr_vecto | ||||
|             //current_synchro_data.Tracking_timestamp_secs = ((double)d_sample_counter + (double)d_current_prn_length_samples + (double)d_rem_code_phase_samples)/static_cast<double>(d_fs_in); | ||||
|  | ||||
|             // Tracking_timestamp_secs is aligned with the CURRENT PRN start sample (Hybridization OK!, but some glitches??) | ||||
|             current_synchro_data.Tracking_timestamp_secs = (static_cast<double>(d_sample_counter) + static_cast<double>(d_rem_code_phase_samples)) / static_cast<double>(d_fs_in); | ||||
|             current_synchro_data.Tracking_timestamp_secs = (static_cast<double>(d_sample_counter) + d_rem_code_phase_samples) / static_cast<double>(d_fs_in); | ||||
|             //compute remnant code phase samples AFTER the Tracking timestamp | ||||
|             d_rem_code_phase_samples = K_blk_samples - d_current_prn_length_samples; //rounding error < 1 sample | ||||
|  | ||||
|             //current_synchro_data.Tracking_timestamp_secs = ((double)d_sample_counter)/static_cast<double>(d_fs_in); | ||||
|             // This tracking block aligns the Tracking_timestamp_secs with the start sample of the PRN, thus, Code_phase_secs=0 | ||||
|             current_synchro_data.Code_phase_secs = 0; | ||||
|             current_synchro_data.Carrier_phase_rads = static_cast<double>(d_acc_carrier_phase_rad); | ||||
|             current_synchro_data.Carrier_Doppler_hz = static_cast<double>(d_carrier_doppler_hz); | ||||
|             current_synchro_data.CN0_dB_hz = static_cast<double>(d_CN0_SNV_dB_Hz); | ||||
|             current_synchro_data.Carrier_phase_rads = d_acc_carrier_phase_rad; | ||||
|             current_synchro_data.Carrier_Doppler_hz = d_carrier_doppler_hz; | ||||
|             current_synchro_data.CN0_dB_hz = d_CN0_SNV_dB_Hz; | ||||
|             current_synchro_data.Flag_valid_pseudorange = false; | ||||
|             *out[0] = current_synchro_data; | ||||
|  | ||||
| @@ -497,41 +501,50 @@ int Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::general_work (int noutput_items, gr_vecto | ||||
|             tmp_L = std::abs<float>(*d_Late); | ||||
|             try | ||||
|             { | ||||
|  | ||||
|                 // EPR | ||||
|                     d_dump_file.write(reinterpret_cast<char*>(&tmp_E), sizeof(float)); | ||||
|                     d_dump_file.write(reinterpret_cast<char*>(&tmp_P), sizeof(float)); | ||||
|                     d_dump_file.write(reinterpret_cast<char*>(&tmp_L), sizeof(float)); | ||||
|                 d_dump_file.write((char*)&tmp_E, sizeof(float)); | ||||
|                 d_dump_file.write((char*)&tmp_P, sizeof(float)); | ||||
|                 d_dump_file.write((char*)&tmp_L, sizeof(float)); | ||||
|                 // PROMPT I and Q (to analyze navigation symbols) | ||||
|                     d_dump_file.write(reinterpret_cast<char*>(&prompt_I), sizeof(float)); | ||||
|                     d_dump_file.write(reinterpret_cast<char*>(&prompt_Q), sizeof(float)); | ||||
|                 d_dump_file.write((char*)&prompt_I, sizeof(float)); | ||||
|                 d_dump_file.write((char*)&prompt_Q, sizeof(float)); | ||||
|                 // PRN start sample stamp | ||||
|                 //tmp_float=(float)d_sample_counter; | ||||
|                     d_dump_file.write(reinterpret_cast<char*>(&d_sample_counter), sizeof(unsigned long int)); | ||||
|                 d_dump_file.write((char*)&d_sample_counter, sizeof(unsigned long int)); | ||||
|                 // accumulated carrier phase | ||||
|                     d_dump_file.write(reinterpret_cast<char*>(&d_acc_carrier_phase_rad), sizeof(float)); | ||||
|                 tmp_float = d_acc_carrier_phase_rad; | ||||
|                 d_dump_file.write((char*)&tmp_float, sizeof(float)); | ||||
|  | ||||
|                 // carrier and code frequency | ||||
|                     d_dump_file.write(reinterpret_cast<char*>(&d_carrier_doppler_hz), sizeof(float)); | ||||
|                     tmp_float=d_code_freq_chips; | ||||
|                     d_dump_file.write(reinterpret_cast<char*>(&tmp_float), sizeof(float)); | ||||
|                 tmp_float = d_carrier_doppler_hz; | ||||
|                 d_dump_file.write((char*)&tmp_float, sizeof(float)); | ||||
|                 tmp_float = d_code_freq_chips; | ||||
|                 d_dump_file.write((char*)&tmp_float, sizeof(float)); | ||||
|  | ||||
|                 //PLL commands | ||||
|                     d_dump_file.write(reinterpret_cast<char*>(&carr_error_hz), sizeof(float)); | ||||
|                     d_dump_file.write(reinterpret_cast<char*>(&carr_error_filt_hz), sizeof(float)); | ||||
|                 tmp_float = carr_error_hz; | ||||
|                 d_dump_file.write((char*)&tmp_float, sizeof(float)); | ||||
|                 tmp_float = carr_error_filt_hz; | ||||
|                 d_dump_file.write((char*)&tmp_float, sizeof(float)); | ||||
|  | ||||
|                 //DLL commands | ||||
|                     d_dump_file.write(reinterpret_cast<char*>(&code_error_chips), sizeof(float)); | ||||
|                     d_dump_file.write(reinterpret_cast<char*>(&code_error_filt_chips), sizeof(float)); | ||||
|                 tmp_float = code_error_chips; | ||||
|                 d_dump_file.write((char*)&tmp_float, sizeof(float)); | ||||
|                 tmp_float = code_error_filt_chips; | ||||
|                 d_dump_file.write((char*)&tmp_float, sizeof(float)); | ||||
|  | ||||
|                 // CN0 and carrier lock test | ||||
|                     d_dump_file.write(reinterpret_cast<char*>(&d_CN0_SNV_dB_Hz), sizeof(float)); | ||||
|                     d_dump_file.write(reinterpret_cast<char*>(&d_carrier_lock_test), sizeof(float)); | ||||
|                 tmp_float = d_CN0_SNV_dB_Hz; | ||||
|                 d_dump_file.write((char*)&tmp_float, sizeof(float)); | ||||
|                 tmp_float = d_carrier_lock_test; | ||||
|                 d_dump_file.write((char*)&tmp_float, sizeof(float)); | ||||
|  | ||||
|                 // AUX vars (for debug purposes) | ||||
|                 tmp_float = d_rem_code_phase_samples; | ||||
|                     d_dump_file.write(reinterpret_cast<char*>(&tmp_float), sizeof(float)); | ||||
|                     tmp_double = static_cast<double>(d_sample_counter + d_current_prn_length_samples); | ||||
|                     d_dump_file.write(reinterpret_cast<char*>(&tmp_double), sizeof(double)); | ||||
|                 d_dump_file.write((char*)&tmp_float, sizeof(float)); | ||||
|                 tmp_double = (double)(d_sample_counter + d_current_prn_length_samples); | ||||
|                 d_dump_file.write((char*)&tmp_double, sizeof(double)); | ||||
|             } | ||||
|             catch (std::ifstream::failure e) | ||||
|             { | ||||
|   | ||||
| @@ -140,22 +140,22 @@ private: | ||||
|  | ||||
|     // remaining code phase and carrier phase between tracking loops | ||||
|     double d_rem_code_phase_samples; | ||||
|     float d_rem_carr_phase_rad; | ||||
|     double d_rem_carr_phase_rad; | ||||
|  | ||||
|     // PLL and DLL filter library | ||||
|     Tracking_2nd_DLL_filter d_code_loop_filter; | ||||
|     Tracking_2nd_PLL_filter d_carrier_loop_filter; | ||||
|  | ||||
|     // acquisition | ||||
|     float d_acq_code_phase_samples; | ||||
|     float d_acq_carrier_doppler_hz; | ||||
|     double d_acq_code_phase_samples; | ||||
|     double d_acq_carrier_doppler_hz; | ||||
|  | ||||
|     // tracking vars | ||||
|     double d_code_freq_chips; | ||||
|     float d_carrier_doppler_hz; | ||||
|     float d_acc_carrier_phase_rad; | ||||
|     float d_code_phase_samples; | ||||
|     float d_acc_code_phase_secs; | ||||
|     double d_carrier_doppler_hz; | ||||
|     double d_acc_carrier_phase_rad; | ||||
|     double d_code_phase_samples; | ||||
|     double d_acc_code_phase_secs; | ||||
|  | ||||
|     //PRN period in samples | ||||
|     int d_current_prn_length_samples; | ||||
| @@ -167,9 +167,9 @@ private: | ||||
|     // CN0 estimation and lock detector | ||||
|     int d_cn0_estimation_counter; | ||||
|     gr_complex* d_Prompt_buffer; | ||||
|     float d_carrier_lock_test; | ||||
|     float d_CN0_SNV_dB_Hz; | ||||
|     float d_carrier_lock_threshold; | ||||
|     double d_carrier_lock_test; | ||||
|     double d_CN0_SNV_dB_Hz; | ||||
|     double d_carrier_lock_threshold; | ||||
|     int d_carrier_lock_fail_counter; | ||||
|  | ||||
|     // control vars | ||||
|   | ||||
| @@ -41,103 +41,6 @@ | ||||
|  | ||||
| #define ACCUM_N 128 | ||||
|  | ||||
| __global__ void scalarProdGPUCPXxN_shifts_chips( | ||||
|     GPU_Complex *d_corr_out, | ||||
|     GPU_Complex *d_sig_in, | ||||
|     GPU_Complex *d_local_code_in, | ||||
|     float *d_shifts_chips, | ||||
|     float code_length_chips, | ||||
|     float code_phase_step_chips, | ||||
|     float rem_code_phase_chips, | ||||
|     int vectorN, | ||||
|     int elementN | ||||
| ) | ||||
| { | ||||
|     //Accumulators cache | ||||
|     __shared__ GPU_Complex accumResult[ACCUM_N]; | ||||
|  | ||||
|     //////////////////////////////////////////////////////////////////////////// | ||||
|     // Cycle through every pair of vectors, | ||||
|     // taking into account that vector counts can be different | ||||
|     // from total number of thread blocks | ||||
|     //////////////////////////////////////////////////////////////////////////// | ||||
|     for (int vec = blockIdx.x; vec < vectorN; vec += gridDim.x) | ||||
|     { | ||||
|         //int vectorBase = IMUL(elementN, vec); | ||||
|         //int vectorEnd  = elementN; | ||||
|  | ||||
|         //////////////////////////////////////////////////////////////////////// | ||||
|         // Each accumulator cycles through vectors with | ||||
|         // stride equal to number of total number of accumulators ACCUM_N | ||||
|         // At this stage ACCUM_N is only preferred be a multiple of warp size | ||||
|         // to meet memory coalescing alignment constraints. | ||||
|         //////////////////////////////////////////////////////////////////////// | ||||
|         for (int iAccum = threadIdx.x; iAccum < ACCUM_N; iAccum += blockDim.x) | ||||
|         { | ||||
|         	GPU_Complex sum = GPU_Complex(0,0); | ||||
|  | ||||
|             for (int pos = iAccum; pos < elementN; pos += ACCUM_N) | ||||
|             { | ||||
|             	//original sample code | ||||
|                 //sum = sum + d_sig_in[pos-vectorBase] * d_nco_in[pos-vectorBase] * d_local_codes_in[pos]; | ||||
|             	//sum = sum + d_sig_in[pos-vectorBase] * d_local_codes_in[pos]; | ||||
|             	//sum.multiply_acc(d_sig_in[pos],d_local_codes_in[pos+d_shifts_samples[vec]]); | ||||
|  | ||||
|             	//custom code for multitap correlator | ||||
|             	// 1.resample local code for the current shift | ||||
|             	float local_code_chip_index= fmod(code_phase_step_chips*(float)pos + d_shifts_chips[vec] - rem_code_phase_chips, code_length_chips); | ||||
|             	//Take into account that in multitap correlators, the shifts can be negative! | ||||
|             	if (local_code_chip_index<0.0) local_code_chip_index+=code_length_chips; | ||||
|             	//printf("vec= %i, pos %i, chip_idx=%i chip_shift=%f \r\n",vec, pos,__float2int_rd(local_code_chip_index),local_code_chip_index); | ||||
|             	// 2.correlate | ||||
|             	sum.multiply_acc(d_sig_in[pos],d_local_code_in[__float2int_rd(local_code_chip_index)]); | ||||
|  | ||||
|             } | ||||
|             accumResult[iAccum] = sum; | ||||
|         } | ||||
|  | ||||
|         //////////////////////////////////////////////////////////////////////// | ||||
|         // Perform tree-like reduction of accumulators' results. | ||||
|         // ACCUM_N has to be power of two at this stage | ||||
|         //////////////////////////////////////////////////////////////////////// | ||||
|         for (int stride = ACCUM_N / 2; stride > 0; stride >>= 1) | ||||
|         { | ||||
|             __syncthreads(); | ||||
|  | ||||
|             for (int iAccum = threadIdx.x; iAccum < stride; iAccum += blockDim.x) | ||||
|             { | ||||
|                 accumResult[iAccum] += accumResult[stride + iAccum]; | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         if (threadIdx.x == 0) | ||||
|         	{ | ||||
|         		d_corr_out[vec] = accumResult[0]; | ||||
|         	} | ||||
|     } | ||||
| } | ||||
|  | ||||
| /** | ||||
|  * CUDA Kernel Device code | ||||
|  * | ||||
|  * Computes the carrier Doppler wipe-off by integrating the NCO in the CUDA kernel | ||||
|  */ | ||||
| __global__ void | ||||
| CUDA_32fc_Doppler_wipeoff(  GPU_Complex *sig_out, GPU_Complex *sig_in, float rem_carrier_phase_in_rad, float phase_step_rad, int numElements) | ||||
| { | ||||
| 	// CUDA version of floating point NCO and vector dot product integrated | ||||
|     float sin; | ||||
|     float cos; | ||||
|     for (int i = blockIdx.x * blockDim.x + threadIdx.x; | ||||
|          i < numElements; | ||||
|          i += blockDim.x * gridDim.x) | ||||
|     { | ||||
|     	__sincosf(rem_carrier_phase_in_rad + i*phase_step_rad, &sin, &cos); | ||||
|     	sig_out[i] =  sig_in[i] * GPU_Complex(cos,-sin); | ||||
|     } | ||||
| } | ||||
|  | ||||
|  | ||||
| __global__ void Doppler_wippe_scalarProdGPUCPXxN_shifts_chips( | ||||
|     GPU_Complex *d_corr_out, | ||||
|     GPU_Complex *d_sig_in, | ||||
| @@ -398,37 +301,15 @@ bool cuda_multicorrelator::Carrier_wipeoff_multicorrelator_resampler_cuda( | ||||
| 		int n_correlators) | ||||
| 	{ | ||||
|  | ||||
|  | ||||
| 	// cudaMemCpy version | ||||
| 	//size_t memSize = signal_length_samples * sizeof(std::complex<float>); | ||||
| 	// input signal CPU -> GPU copy memory | ||||
|     //cudaMemcpyAsync(d_sig_in, d_sig_in_cpu, memSize, | ||||
|     //                               cudaMemcpyHostToDevice, stream2); | ||||
|  | ||||
|     //***** NOTICE: NCO is computed on-the-fly, not need to copy NCO into GPU! **** | ||||
|     //Launch carrier wipe-off kernel here, while local codes are being copied to GPU! | ||||
|     //cudaStreamSynchronize(stream2); | ||||
|  | ||||
|     //CUDA_32fc_Doppler_wipeoff<<<blocksPerGrid, threadsPerBlock,0, stream1>>>(d_sig_doppler_wiped, d_sig_in,rem_carrier_phase_in_rad,phase_step_rad, signal_length_samples); | ||||
|  | ||||
|     //wait for Doppler wipeoff end... | ||||
|     //cudaStreamSynchronize(stream1); | ||||
|     //cudaStreamSynchronize(stream2); | ||||
|  | ||||
|     //launch the multitap correlator with integrated local code resampler! | ||||
|  | ||||
| //    scalarProdGPUCPXxN_shifts_chips<<<blocksPerGrid, threadsPerBlock,0 ,stream1>>>( | ||||
| //			d_corr_out, | ||||
| //			d_sig_doppler_wiped, | ||||
| //			d_local_codes_in, | ||||
| //			d_shifts_chips, | ||||
| //			d_code_length_chips, | ||||
| //	        code_phase_step_chips, | ||||
| //	        rem_code_phase_chips, | ||||
| //			n_correlators, | ||||
| //			signal_length_samples | ||||
| //		); | ||||
|  | ||||
|     Doppler_wippe_scalarProdGPUCPXxN_shifts_chips<<<blocksPerGrid, threadsPerBlock,0 ,stream1>>>( | ||||
| 			d_corr_out, | ||||
| 			d_sig_in, | ||||
| @@ -444,25 +325,15 @@ bool cuda_multicorrelator::Carrier_wipeoff_multicorrelator_resampler_cuda( | ||||
| 			phase_step_rad | ||||
| 			); | ||||
|  | ||||
|     //debug | ||||
| //	std::complex<float>* debug_signal; | ||||
| //	debug_signal=static_cast<std::complex<float>*>(malloc(memSize)); | ||||
| //    cudaMemcpyAsync(debug_signal, d_sig_doppler_wiped, memSize, | ||||
| //            cudaMemcpyDeviceToHost,stream1); | ||||
| //    cudaStreamSynchronize(stream1); | ||||
| //	std::cout<<"d_sig_doppler_wiped GPU="<<debug_signal[456]<<","<<debug_signal[1]<<","<<debug_signal[2]<<","<<debug_signal[3]<<std::endl; | ||||
|  | ||||
|     //cudaGetLastError(); | ||||
|     //wait for correlators end... | ||||
|     //cudaStreamSynchronize(stream1); | ||||
|     cudaStreamSynchronize(stream1); | ||||
| 	// cudaMemCpy version | ||||
|     // Copy the device result vector in device memory to the host result vector | ||||
|     // in host memory. | ||||
|  | ||||
|     //scalar products (correlators outputs) | ||||
|     //cudaMemcpyAsync(corr_out, d_corr_out, sizeof(std::complex<float>)*n_correlators, | ||||
|     //        cudaMemcpyDeviceToHost,stream1); | ||||
|  | ||||
|     cudaStreamSynchronize(stream1); | ||||
|     return true; | ||||
| } | ||||
|  | ||||
| @@ -490,7 +361,6 @@ bool cuda_multicorrelator::free_cuda() | ||||
| 	if (d_corr_out!=NULL) cudaFree(d_corr_out); | ||||
| 	if (d_shifts_samples!=NULL) cudaFree(d_shifts_samples); | ||||
| 	if (d_shifts_chips!=NULL) cudaFree(d_shifts_chips); | ||||
|  | ||||
|     // Reset the device and exit | ||||
|     // cudaDeviceReset causes the driver to clean up all state. While | ||||
|     // not mandatory in normal operation, it is good practice.  It is also | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Javier Arribas
					Javier Arribas