mirror of
				https://github.com/gnss-sdr/gnss-sdr
				synced 2025-10-31 15:23:04 +00:00 
			
		
		
		
	Updated CUDA kernels and several GPU tracking optimizations.
Bug fix in GPS_L1_CA_DLL_PLL binary dump
This commit is contained in:
		| @@ -165,7 +165,7 @@ Resampler.sample_freq_out=4000000 | ||||
|  | ||||
| ;######### CHANNELS GLOBAL CONFIG ############ | ||||
| ;#count: Number of available GPS satellite channels. | ||||
| Channels_GPS.count=8 | ||||
| Channels_GPS.count=1 | ||||
| ;#count: Number of available Galileo satellite channels. | ||||
| Channels_Galileo.count=0 | ||||
| ;#in_acquisition: Number of channels simultaneously acquiring for the whole receiver | ||||
| @@ -229,16 +229,16 @@ Tracking_GPS.item_type=gr_complex | ||||
| Tracking_GPS.if=0 | ||||
|  | ||||
| ;#dump: Enable or disable the Tracking internal binary data file logging [true] or [false]  | ||||
| Tracking_GPS.dump=false | ||||
| Tracking_GPS.dump=true | ||||
|  | ||||
| ;#dump_filename: Log path and filename. Notice that the tracking channel will add "x.dat" where x is the channel number. | ||||
| Tracking_GPS.dump_filename=../data/epl_tracking_ch_ | ||||
|  | ||||
| ;#pll_bw_hz: PLL loop filter bandwidth [Hz] | ||||
| Tracking_GPS.pll_bw_hz=45.0; | ||||
| Tracking_GPS.pll_bw_hz=55.0; | ||||
|  | ||||
| ;#dll_bw_hz: DLL loop filter bandwidth [Hz] | ||||
| Tracking_GPS.dll_bw_hz=2.0; | ||||
| Tracking_GPS.dll_bw_hz=1.5 | ||||
|  | ||||
| ;#fll_bw_hz: FLL loop filter bandwidth [Hz] | ||||
| Tracking_GPS.fll_bw_hz=10.0; | ||||
|   | ||||
| @@ -29,13 +29,13 @@ GNSS-SDR.SUPL_CI=0x31b0 | ||||
| SignalSource.implementation=Flexiband_Signal_Source | ||||
|  | ||||
| SignalSource.flag_read_file=true | ||||
| SignalSource.signal_file=/datalogger/captures/eclipse/eclipse_IIIa_2.bin | ||||
| SignalSource.signal_file=/datalogger/L125_III1b_210s.usb | ||||
|  | ||||
| ;#item_type: Type and resolution for each of the signal samples. Use only gr_complex in this version. | ||||
| SignalSource.item_type=gr_complex | ||||
|  | ||||
| ;# FPGA firmware file | ||||
| SignalSource.firmware_file=flexiband_III-1a.bit | ||||
| SignalSource.firmware_file=flexiband_III-1b.bit | ||||
|  | ||||
| ;#RF_channels: Number of RF channels present in the frontend device, must agree the FPGA firmware file | ||||
| SignalSource.RF_channels=1 | ||||
|   | ||||
| @@ -28,9 +28,9 @@ GNSS-SDR.SUPL_CI=0x31b0 | ||||
| ;#implementation: Use [File_Signal_Source] or [UHD_Signal_Source] or [GN3S_Signal_Source] (experimental) | ||||
| SignalSource.implementation=Flexiband_Signal_Source | ||||
|  | ||||
| SignalSource.flag_read_file=false | ||||
| #SignalSource.signal_file=/datalogger/signals/Fraunhofer/L125_III1b_210s.usb | ||||
| SignalSource.signal_file=/datalogger/captures/flexiband_III_1b_cap1.usb | ||||
| SignalSource.flag_read_file=true | ||||
| SignalSource.signal_file=/datalogger/L125_III1b_210s.usb | ||||
| #SignalSource.signal_file=/datalogger/captures/flexiband_III_1b_cap1.usb | ||||
|  | ||||
| ;#item_type: Type and resolution for each of the signal samples. Use only gr_complex in this version. | ||||
| SignalSource.item_type=gr_complex | ||||
| @@ -136,8 +136,8 @@ InputFilter0.grid_density=16 | ||||
| InputFilter0.sampling_frequency=20000000 | ||||
| ;# IF deviation due to front-end LO inaccuracies [HZ] | ||||
| ;# WARNING: Fraunhofer front-end hardwareconfigurations can difer. Signals available on http://www.iis.fraunhofer.de/de/ff/lok/leist/test/flexiband.html are centered on 0 Hz, ALL BANDS. | ||||
| InputFilter0.IF=-205000 | ||||
| ;#InputFilter0.IF=0 | ||||
| ;#InputFilter0.IF=-205000 | ||||
| InputFilter0.IF=0 | ||||
|  | ||||
| ;# Decimation factor after the frequency tranaslating block | ||||
| InputFilter0.decimation_factor=8 | ||||
| @@ -230,8 +230,8 @@ InputFilter1.grid_density=16 | ||||
| InputFilter1.sampling_frequency=20000000 | ||||
| ;# IF deviation due to front-end LO inaccuracies [HZ] | ||||
| ;# WARNING: Fraunhofer front-end hardwareconfigurations can difer. Signals available on http://www.iis.fraunhofer.de/de/ff/lok/leist/test/flexiband.html are centered on 0 Hz, ALL BANDS. | ||||
| InputFilter1.IF=100000 | ||||
| ;#InputFilter1.IF=0 | ||||
| ;#InputFilter1.IF=100000 | ||||
| InputFilter1.IF=0 | ||||
|  | ||||
| ;# Decimation factor after the frequency tranaslating block | ||||
| InputFilter1.decimation_factor=8 | ||||
| @@ -272,7 +272,7 @@ Resampler2.implementation=Pass_Through | ||||
| ;#count: Number of available GPS satellite channels. | ||||
| Channels_1C.count=8 | ||||
| Channels_1B.count=1 | ||||
| Channels_2S.count=8 | ||||
| Channels_2S.count=1 | ||||
| ;#count: Number of available Galileo satellite channels. | ||||
| ;Channels_Galileo.count=0 | ||||
| ;#in_acquisition: Number of channels simultaneously acquiring for the whole receiver | ||||
| @@ -378,13 +378,13 @@ Acquisition_1C.max_dwells=1 | ||||
|  | ||||
| ;#implementation: Selected tracking algorithm: [GPS_L1_CA_DLL_PLL_Tracking] or [GPS_L1_CA_DLL_FLL_PLL_Tracking] | ||||
|  | ||||
| Tracking_1C.implementation=GPS_L1_CA_DLL_PLL_Tracking | ||||
| Tracking_1C.implementation=GPS_L1_CA_DLL_PLL_Tracking_GPU | ||||
| Tracking_1C.item_type=gr_complex | ||||
| Tracking_1C.if=0 | ||||
| Tracking_1C.dump=true | ||||
| Tracking_1C.dump_filename=./tracking_ch_ | ||||
| Tracking_1C.dump=false | ||||
| Tracking_1C.dump_filename=../data/epl_tracking_ch_ | ||||
| Tracking_1C.pll_bw_hz=40.0; | ||||
| Tracking_1C.dll_bw_hz=3.0; | ||||
| Tracking_1C.dll_bw_hz=1.5; | ||||
| Tracking_1C.fll_bw_hz=10.0; | ||||
| Tracking_1C.order=3; | ||||
| Tracking_1C.early_late_space_chips=0.5; | ||||
| @@ -405,7 +405,7 @@ Acquisition_2S.max_dwells=1 | ||||
| Tracking_2S.implementation=GPS_L2_M_DLL_PLL_Tracking | ||||
| Tracking_2S.item_type=gr_complex | ||||
| Tracking_2S.if=0 | ||||
| Tracking_2S.dump=true | ||||
| Tracking_2S.dump=false | ||||
| Tracking_2S.dump_filename=./tracking_ch_ | ||||
| Tracking_2S.pll_bw_hz=1.5; | ||||
| Tracking_2S.dll_bw_hz=0.3; | ||||
| @@ -447,7 +447,7 @@ Tracking_1B.item_type=gr_complex | ||||
| Tracking_1B.if=0 | ||||
|  | ||||
| ;#dump: Enable or disable the Tracking internal binary data file logging [true] or [false]  | ||||
| Tracking_1B.dump=true | ||||
| Tracking_1B.dump=false | ||||
|  | ||||
| ;#dump_filename: Log path and filename. Notice that the tracking channel will add "x.dat" where x is the channel number. | ||||
| Tracking_1B.dump_filename=./veml_tracking_ch_ | ||||
| @@ -497,7 +497,7 @@ TelemetryDecoder_1B.decimation_factor=5; | ||||
| Observables.implementation=Mixed_Observables | ||||
|  | ||||
| ;#dump: Enable or disable the Observables internal binary data file logging [true] or [false]  | ||||
| Observables.dump=true | ||||
| Observables.dump=false | ||||
|  | ||||
| ;#dump_filename: Log path and filename. | ||||
| Observables.dump_filename=./observables.dat | ||||
|   | ||||
| @@ -135,7 +135,8 @@ InputFilter0.grid_density=16 | ||||
| ; i.e. using front-end-cal as reported here:http://www.cttc.es/publication/turning-a-television-into-a-gnss-receiver/ | ||||
| InputFilter0.sampling_frequency=20000000 | ||||
| ;# IF deviation due to front-end LO inaccuracies [HZ] | ||||
| InputFilter0.IF=-205000 | ||||
| ;#InputFilter0.IF=-205000 | ||||
| InputFilter0.IF=0 | ||||
|  | ||||
| ;# Decimation factor after the frequency tranaslating block | ||||
| InputFilter0.decimation_factor=4 | ||||
|   | ||||
| @@ -58,7 +58,7 @@ if(ENABLE_FLEXIBAND) | ||||
|      if(OS_IS_MACOSX) | ||||
|           set(MACOSX_ARGS "-DCMAKE_CXX_COMPILER=/usr/bin/clang++") | ||||
|      endif(OS_IS_MACOSX) | ||||
|     find_package(teleorbit REQUIRED) | ||||
|     find_package(Teleorbit REQUIRED) | ||||
|     if(NOT TELEORBIT_FOUND) | ||||
|         message(FATAL_ERROR "Teleorbit Flexiband GNURadio driver required to build gnss-sdr with the optional FLEXIBAND adapter") | ||||
|     endif(NOT TELEORBIT_FOUND) | ||||
|   | ||||
| @@ -18,6 +18,7 @@ | ||||
|  | ||||
| if(ENABLE_CUDA) | ||||
| 	FIND_PACKAGE(CUDA REQUIRED)  | ||||
| 	set(OPT_TRACKING_ADAPTERS ${OPT_TRACKING_ADAPTERS} gps_l1_ca_dll_pll_tracking_gpu.cc) | ||||
| endif(ENABLE_CUDA) | ||||
|  | ||||
| set(TRACKING_ADAPTER_SOURCES  | ||||
| @@ -30,7 +31,7 @@ set(TRACKING_ADAPTER_SOURCES | ||||
|      gps_l1_ca_tcp_connector_tracking.cc | ||||
|      galileo_e5a_dll_pll_tracking.cc | ||||
|      gps_l2_m_dll_pll_tracking.cc | ||||
|      gps_l1_ca_dll_pll_tracking_gpu.cc | ||||
|      ${OPT_TRACKING_ADAPTERS} | ||||
| ) | ||||
|  | ||||
| include_directories( | ||||
|   | ||||
| @@ -19,6 +19,7 @@ | ||||
|  | ||||
| if(ENABLE_CUDA) | ||||
| 	FIND_PACKAGE(CUDA REQUIRED) | ||||
| 	set(OPT_TRACKING_BLOCKS ${OPT_TRACKING_BLOCKS} gps_l1_ca_dll_pll_tracking_gpu_cc.cc) | ||||
| endif(ENABLE_CUDA) | ||||
|  | ||||
| set(TRACKING_GR_BLOCKS_SOURCES | ||||
| @@ -31,7 +32,7 @@ set(TRACKING_GR_BLOCKS_SOURCES | ||||
|      gps_l1_ca_tcp_connector_tracking_cc.cc | ||||
|      galileo_e5a_dll_pll_tracking_cc.cc | ||||
|      gps_l2_m_dll_pll_tracking_cc.cc | ||||
|      gps_l1_ca_dll_pll_tracking_gpu_cc.cc | ||||
| 	 ${OPT_TRACKING_BLOCKS}    | ||||
| ) | ||||
|  | ||||
| include_directories( | ||||
|   | ||||
| @@ -594,7 +594,8 @@ int Gps_L1_Ca_Dll_Pll_Tracking_cc::general_work (int noutput_items, gr_vector_in | ||||
|  | ||||
|                     // carrier and code frequency | ||||
|                     d_dump_file.write(reinterpret_cast<char*>(&d_carrier_doppler_hz), sizeof(float)); | ||||
|                     d_dump_file.write(reinterpret_cast<char*>(&d_code_freq_chips), sizeof(float)); | ||||
|                     tmp_float=d_code_freq_chips; | ||||
|                     d_dump_file.write(reinterpret_cast<char*>(&tmp_float), sizeof(float)); | ||||
|  | ||||
|                     //PLL commands | ||||
|                     d_dump_file.write(reinterpret_cast<char*>(&carr_error_hz), sizeof(float)); | ||||
|   | ||||
| @@ -81,7 +81,6 @@ gps_l1_ca_dll_pll_make_tracking_gpu_cc( | ||||
| } | ||||
|  | ||||
|  | ||||
|  | ||||
| void Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::forecast (int noutput_items, | ||||
|         gr_vector_int &ninput_items_required) | ||||
| { | ||||
| @@ -120,14 +119,19 @@ Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc( | ||||
|  | ||||
|     // Initialization of local code replica | ||||
|     // Get space for a vector with the C/A code replica sampled 1x/chip | ||||
|     d_ca_code = static_cast<gr_complex*>(volk_malloc((GPS_L1_CA_CODE_LENGTH_CHIPS + 2) * sizeof(gr_complex), volk_get_alignment())); | ||||
|  | ||||
|     //d_ca_code = static_cast<gr_complex*>(volk_malloc((GPS_L1_CA_CODE_LENGTH_CHIPS + 2) * sizeof(gr_complex), volk_get_alignment())); | ||||
|     d_ca_code = static_cast<gr_complex*>(volk_malloc((GPS_L1_CA_CODE_LENGTH_CHIPS) * sizeof(gr_complex), volk_get_alignment())); | ||||
|  | ||||
|     multicorrelator_gpu = new cuda_multicorrelator(); | ||||
|     int N_CORRELATORS=3; | ||||
|     multicorrelator_gpu->init_cuda(0, NULL, 2 * d_vector_length , 2 * d_vector_length , N_CORRELATORS); | ||||
|     //local code resampler on CPU (old) | ||||
|     //multicorrelator_gpu->init_cuda(0, NULL, 2 * d_vector_length , 2 * d_vector_length , N_CORRELATORS); | ||||
|  | ||||
|     //local code resampler on GPU (new) | ||||
|     multicorrelator_gpu->init_cuda_integrated_resampler(0, NULL, 2 * d_vector_length , GPS_L1_CA_CODE_LENGTH_CHIPS , N_CORRELATORS); | ||||
|  | ||||
|     // Get space for the resampled early / prompt / late local replicas | ||||
| 	checkCudaErrors(cudaHostAlloc((void**)&d_local_code_shift_samples, N_CORRELATORS * sizeof(int),  cudaHostAllocMapped )); | ||||
| 	checkCudaErrors(cudaHostAlloc((void**)&d_local_code_shift_chips, N_CORRELATORS * sizeof(float),  cudaHostAllocMapped )); | ||||
|  | ||||
|  | ||||
|     //allocate host memory | ||||
| @@ -138,7 +142,7 @@ Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc( | ||||
| 	//checkCudaErrors(cudaHostAlloc((void**)&d_local_codes_gpu, (V_LEN * sizeof(gr_complex))*N_CORRELATORS, cudaHostAllocWriteCombined )); | ||||
|  | ||||
| 	//new integrated shifts | ||||
| 	checkCudaErrors(cudaHostAlloc((void**)&d_local_codes_gpu, (2 * d_vector_length * sizeof(gr_complex)), cudaHostAllocWriteCombined )); | ||||
| 	//checkCudaErrors(cudaHostAlloc((void**)&d_local_codes_gpu, (2 * d_vector_length * sizeof(gr_complex)), cudaHostAllocWriteCombined )); | ||||
|  | ||||
| 	// correlator outputs (scalar) | ||||
| 	checkCudaErrors(cudaHostAlloc((void**)&d_corr_outs_gpu ,sizeof(gr_complex)*N_CORRELATORS,  cudaHostAllocWriteCombined )); | ||||
| @@ -242,9 +246,13 @@ void Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::start_tracking() | ||||
|     d_code_loop_filter.initialize();    // initialize the code filter | ||||
|  | ||||
|     // generate local reference ALWAYS starting at chip 1 (1 sample per chip) | ||||
|     gps_l1_ca_code_gen_complex(&d_ca_code[1], d_acquisition_gnss_synchro->PRN, 0); | ||||
|     d_ca_code[0] = d_ca_code[static_cast<int>(GPS_L1_CA_CODE_LENGTH_CHIPS)]; | ||||
|     d_ca_code[static_cast<int>(GPS_L1_CA_CODE_LENGTH_CHIPS) + 1] = d_ca_code[1]; | ||||
|     gps_l1_ca_code_gen_complex(d_ca_code, d_acquisition_gnss_synchro->PRN, 0); | ||||
|  | ||||
|     d_local_code_shift_chips[0]=-d_early_late_spc_chips; | ||||
|     d_local_code_shift_chips[1]=0.0; | ||||
|     d_local_code_shift_chips[2]=d_early_late_spc_chips; | ||||
|  | ||||
|     multicorrelator_gpu->set_local_code_and_taps(GPS_L1_CA_CODE_LENGTH_CHIPS,d_ca_code, d_local_code_shift_chips,3); | ||||
|  | ||||
|     d_carrier_lock_fail_counter = 0; | ||||
|     d_rem_code_phase_samples = 0; | ||||
| @@ -272,40 +280,6 @@ void Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::start_tracking() | ||||
| } | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
|  | ||||
| void Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::update_local_code() | ||||
| { | ||||
|     double tcode_chips; | ||||
|     double rem_code_phase_chips; | ||||
|     int associated_chip_index; | ||||
|     int code_length_chips = static_cast<int>(GPS_L1_CA_CODE_LENGTH_CHIPS); | ||||
|     double code_phase_step_chips; | ||||
|     int epl_loop_length_samples; | ||||
|  | ||||
|     // unified loop for E, P, L code vectors | ||||
|     code_phase_step_chips = static_cast<double>(d_code_freq_chips) / static_cast<double>(d_fs_in); | ||||
|     rem_code_phase_chips = d_rem_code_phase_samples * (d_code_freq_chips / d_fs_in); | ||||
|     tcode_chips = -rem_code_phase_chips; | ||||
|  | ||||
|     // Alternative EPL code generation (40% of speed improvement!) | ||||
|     d_local_code_shift_samples[0]=0; | ||||
|     d_local_code_shift_samples[1]=round(d_early_late_spc_chips / code_phase_step_chips); | ||||
|     d_local_code_shift_samples[2]=round((2*d_early_late_spc_chips) / code_phase_step_chips); | ||||
|  | ||||
|     epl_loop_length_samples = d_current_prn_length_samples + d_local_code_shift_samples[2]; //maximum length | ||||
|  | ||||
|     for (int i = 0; i < epl_loop_length_samples; i++) | ||||
|         { | ||||
|             associated_chip_index = 1 + round(fmod(tcode_chips - d_early_late_spc_chips, code_length_chips)); | ||||
|             d_local_codes_gpu[i] = d_ca_code[associated_chip_index]; | ||||
|             tcode_chips = tcode_chips + code_phase_step_chips; | ||||
|         } | ||||
|  | ||||
| } | ||||
|  | ||||
|  | ||||
| Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::~Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc() | ||||
| { | ||||
|     d_dump_file.close(); | ||||
| @@ -313,7 +287,7 @@ Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::~Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc() | ||||
| 	cudaFreeHost(in_gpu); | ||||
| 	cudaFreeHost(d_carr_sign_gpu); | ||||
| 	cudaFreeHost(d_corr_outs_gpu); | ||||
| 	cudaFreeHost(d_local_codes_gpu); | ||||
| 	cudaFreeHost(d_local_code_shift_chips); | ||||
|  | ||||
| 	multicorrelator_gpu->free_cuda(); | ||||
| 	delete(multicorrelator_gpu); | ||||
| @@ -329,10 +303,10 @@ int Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::general_work (int noutput_items, gr_vecto | ||||
|         gr_vector_const_void_star &input_items, gr_vector_void_star &output_items) | ||||
| { | ||||
|     // process vars | ||||
|     float carr_error_hz; | ||||
|     float carr_error_filt_hz; | ||||
|     float code_error_chips; | ||||
|     float code_error_filt_chips; | ||||
|     float carr_error_hz=0.0; | ||||
|     float carr_error_filt_hz=0.0; | ||||
|     float code_error_chips=0.0; | ||||
|     float code_error_filt_chips=0.0; | ||||
|  | ||||
|     // Block input data and block output stream pointers | ||||
|     const gr_complex* in = (gr_complex*) input_items[0]; | ||||
| @@ -341,23 +315,17 @@ int Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::general_work (int noutput_items, gr_vecto | ||||
|     // GNSS_SYNCHRO OBJECT to interchange data between tracking->telemetry_decoder | ||||
|     Gnss_Synchro current_synchro_data = Gnss_Synchro(); | ||||
|  | ||||
|  | ||||
|     if (d_enable_tracking == true) | ||||
|         { | ||||
|             // Receiver signal alignment | ||||
|             if (d_pull_in == true) | ||||
|                 { | ||||
|                     int samples_offset; | ||||
|                     float acq_trk_shif_correction_samples; | ||||
|                     int acq_to_trk_delay_samples; | ||||
|                     acq_to_trk_delay_samples = d_sample_counter - d_acq_sample_stamp; | ||||
|                     acq_trk_shif_correction_samples = d_current_prn_length_samples - fmod(static_cast<float>(acq_to_trk_delay_samples), static_cast<float>(d_current_prn_length_samples)); | ||||
|                     samples_offset = round(d_acq_code_phase_samples + acq_trk_shif_correction_samples); | ||||
|                     // /todo: Check if the sample counter sent to the next block as a time reference should be incremented AFTER sended or BEFORE | ||||
|                     //d_sample_counter_seconds = d_sample_counter_seconds + (((double)samples_offset) / static_cast<double>(d_fs_in)); | ||||
|                     samples_offset = round(d_acq_code_phase_samples)+d_current_prn_length_samples - acq_to_trk_delay_samples%d_current_prn_length_samples; | ||||
|                     d_sample_counter = d_sample_counter + samples_offset; //count for the processed samples | ||||
|                     d_pull_in = false; | ||||
|                     //std::cout<<" samples_offset="<<samples_offset<<"\r\n"; | ||||
|                     // Fill the acquisition data | ||||
|                     current_synchro_data = *d_acquisition_gnss_synchro; | ||||
|                     *out[0] = current_synchro_data; | ||||
| @@ -368,46 +336,24 @@ int Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::general_work (int noutput_items, gr_vecto | ||||
|             // Fill the acquisition data | ||||
|             current_synchro_data = *d_acquisition_gnss_synchro; | ||||
|  | ||||
|             // Generate local code and carrier replicas (using \hat{f}_d(k-1)) | ||||
|             update_local_code(); | ||||
|  | ||||
|             // UPDATE NCO COMMAND | ||||
|             float phase_step_rad = static_cast<float>(GPS_TWO_PI) * d_carrier_doppler_hz / static_cast<float>(d_fs_in); | ||||
|             //std::cout<<"d_current_prn_length_samples="<<d_current_prn_length_samples<<std::endl; | ||||
|             // perform carrier wipe-off and compute Early, Prompt and Late correlation | ||||
|  | ||||
|         	//code resampler on GPU (new) | ||||
|             float code_phase_step_chips = static_cast<float>(d_code_freq_chips) / static_cast<float>(d_fs_in); | ||||
|             float rem_code_phase_chips = d_rem_code_phase_samples * (d_code_freq_chips / d_fs_in); | ||||
|  | ||||
|             cudaProfilerStart(); | ||||
|             multicorrelator_gpu->Carrier_wipeoff_multicorrelator_cuda( | ||||
|             multicorrelator_gpu->Carrier_wipeoff_multicorrelator_resampler_cuda( | ||||
|     				d_corr_outs_gpu, | ||||
|     				in, | ||||
|     				d_local_codes_gpu, | ||||
|     				d_rem_carr_phase_rad, | ||||
|     				phase_step_rad, | ||||
|     				d_local_code_shift_samples, | ||||
|     				code_phase_step_chips, | ||||
|     				rem_code_phase_chips, | ||||
|     				d_current_prn_length_samples, | ||||
|     				3); | ||||
|         	cudaProfilerStop(); | ||||
|             //std::cout<<"d_Prompt="<<*d_Prompt<<"d_Early="<<*d_Early<<"d_Late="<<*d_Late<<std::endl; | ||||
|             // check for samples consistency (this should be done before in the receiver / here only if the source is a file) | ||||
|             if (std::isnan((*d_Prompt).real()) == true or std::isnan((*d_Prompt).imag()) == true ) // or std::isinf(in[i].real())==true or std::isinf(in[i].imag())==true) | ||||
|                 { | ||||
|                     const int samples_available = ninput_items[0]; | ||||
|                     d_sample_counter = d_sample_counter + samples_available; | ||||
|                     LOG(WARNING) << "Detected NaN samples at sample number " << d_sample_counter; | ||||
|                     consume_each(samples_available); | ||||
|  | ||||
|                     // make an output to not stop the rest of the processing blocks | ||||
|                     current_synchro_data.Prompt_I = 0.0; | ||||
|                     current_synchro_data.Prompt_Q = 0.0; | ||||
|                     current_synchro_data.Tracking_timestamp_secs = static_cast<double>(d_sample_counter) / static_cast<double>(d_fs_in); | ||||
|                     current_synchro_data.Carrier_phase_rads = 0.0; | ||||
|                     current_synchro_data.Code_phase_secs = 0.0; | ||||
|                     current_synchro_data.CN0_dB_hz = 0.0; | ||||
|                     current_synchro_data.Flag_valid_tracking = false; | ||||
|                     current_synchro_data.Flag_valid_pseudorange = false; | ||||
|  | ||||
|                     *out[0] = current_synchro_data; | ||||
|                     return 1; | ||||
|                 } | ||||
|  | ||||
|             // ################## PLL ########################################################## | ||||
|             // PLL discriminator | ||||
| @@ -444,8 +390,7 @@ int Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::general_work (int noutput_items, gr_vecto | ||||
|             T_chip_seconds = 1 / static_cast<double>(d_code_freq_chips); | ||||
|             T_prn_seconds = T_chip_seconds * GPS_L1_CA_CODE_LENGTH_CHIPS; | ||||
|             T_prn_samples = T_prn_seconds * static_cast<double>(d_fs_in); | ||||
|             K_blk_samples = T_prn_samples + d_rem_code_phase_samples + code_error_filt_secs * static_cast<double>(d_fs_in); | ||||
|             d_current_prn_length_samples = round(K_blk_samples); //round to a discrete samples | ||||
|             K_blk_samples = T_prn_samples + d_rem_code_phase_samples + static_cast<double>(code_error_filt_secs) * static_cast<double>(d_fs_in); | ||||
|             //d_rem_code_phase_samples = K_blk_samples - d_current_prn_length_samples; //rounding error < 1 sample | ||||
|  | ||||
|             // ####### CN0 ESTIMATION AND LOCK DETECTORS ###### | ||||
| @@ -591,7 +536,8 @@ int Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::general_work (int noutput_items, gr_vecto | ||||
|  | ||||
|                     // carrier and code frequency | ||||
|                     d_dump_file.write(reinterpret_cast<char*>(&d_carrier_doppler_hz), sizeof(float)); | ||||
|                     d_dump_file.write(reinterpret_cast<char*>(&d_code_freq_chips), sizeof(float)); | ||||
|                     tmp_float=d_code_freq_chips; | ||||
|                     d_dump_file.write(reinterpret_cast<char*>(&tmp_float), sizeof(float)); | ||||
|  | ||||
|                     //PLL commands | ||||
|                     d_dump_file.write(reinterpret_cast<char*>(&carr_error_hz), sizeof(float)); | ||||
|   | ||||
| @@ -130,7 +130,7 @@ private: | ||||
|     gr_complex* in_gpu; | ||||
|     gr_complex* d_carr_sign_gpu; | ||||
|     gr_complex* d_local_codes_gpu; | ||||
| 	int* d_local_code_shift_samples; | ||||
| 	float* d_local_code_shift_chips; | ||||
|     gr_complex* d_corr_outs_gpu; | ||||
|     cuda_multicorrelator *multicorrelator_gpu; | ||||
|  | ||||
|   | ||||
| @@ -33,7 +33,6 @@ if(ENABLE_CUDA) | ||||
|  | ||||
| 	SET(LIB_TYPE STATIC) #set the lib type | ||||
| 	CUDA_ADD_LIBRARY(CUDA_CORRELATOR_LIB ${LIB_TYPE} cuda_multicorrelator.h cuda_multicorrelator.cu) | ||||
|      | ||||
| endif(ENABLE_CUDA) | ||||
|  | ||||
|  | ||||
|   | ||||
| @@ -53,7 +53,83 @@ | ||||
| #include <helper_cuda.h> | ||||
| #include <helper_functions.h> | ||||
|  | ||||
| #define ACCUM_N 1024 | ||||
| #define ACCUM_N 256 | ||||
|  | ||||
|  | ||||
| __global__ void scalarProdGPUCPXxN_shifts_chips( | ||||
|     GPU_Complex *d_corr_out, | ||||
|     GPU_Complex *d_sig_in, | ||||
|     GPU_Complex *d_local_code_in, | ||||
|     float *d_shifts_chips, | ||||
|     float code_length_chips, | ||||
|     float code_phase_step_chips, | ||||
|     float rem_code_phase_chips, | ||||
|     int vectorN, | ||||
|     int elementN | ||||
| ) | ||||
| { | ||||
|     //Accumulators cache | ||||
|     __shared__ GPU_Complex accumResult[ACCUM_N]; | ||||
|  | ||||
|     //////////////////////////////////////////////////////////////////////////// | ||||
|     // Cycle through every pair of vectors, | ||||
|     // taking into account that vector counts can be different | ||||
|     // from total number of thread blocks | ||||
|     //////////////////////////////////////////////////////////////////////////// | ||||
|     for (int vec = blockIdx.x; vec < vectorN; vec += gridDim.x) | ||||
|     { | ||||
|         //int vectorBase = IMUL(elementN, vec); | ||||
|         //int vectorEnd  = elementN; | ||||
|  | ||||
|         //////////////////////////////////////////////////////////////////////// | ||||
|         // Each accumulator cycles through vectors with | ||||
|         // stride equal to number of total number of accumulators ACCUM_N | ||||
|         // At this stage ACCUM_N is only preferred be a multiple of warp size | ||||
|         // to meet memory coalescing alignment constraints. | ||||
|         //////////////////////////////////////////////////////////////////////// | ||||
|         for (int iAccum = threadIdx.x; iAccum < ACCUM_N; iAccum += blockDim.x) | ||||
|         { | ||||
|         	GPU_Complex sum = GPU_Complex(0,0); | ||||
|  | ||||
|             for (int pos = iAccum; pos < elementN; pos += ACCUM_N) | ||||
|             { | ||||
|                 //sum = sum + d_sig_in[pos-vectorBase] * d_nco_in[pos-vectorBase] * d_local_codes_in[pos]; | ||||
|             	//sum = sum + d_sig_in[pos-vectorBase] * d_local_codes_in[pos]; | ||||
|             	//sum.multiply_acc(d_sig_in[pos],d_local_codes_in[pos+d_shifts_samples[vec]]); | ||||
|  | ||||
|             	// 1.resample local code for the current shift | ||||
|             	float local_code_chip_index= fmod(code_phase_step_chips*(float)pos + d_shifts_chips[vec] - rem_code_phase_chips, code_length_chips); | ||||
|             	//TODO: Take into account that in multitap correlators, the shifts can be negative! | ||||
|             	if (local_code_chip_index<0.0) local_code_chip_index+=code_length_chips; | ||||
|  | ||||
|             	// 2.correlate | ||||
|             	sum.multiply_acc(d_sig_in[pos],d_local_code_in[__float2int_rd(local_code_chip_index)]); | ||||
|  | ||||
|             } | ||||
|             accumResult[iAccum] = sum; | ||||
|         } | ||||
|  | ||||
|         //////////////////////////////////////////////////////////////////////// | ||||
|         // Perform tree-like reduction of accumulators' results. | ||||
|         // ACCUM_N has to be power of two at this stage | ||||
|         //////////////////////////////////////////////////////////////////////// | ||||
|         for (int stride = ACCUM_N / 2; stride > 0; stride >>= 1) | ||||
|         { | ||||
|             __syncthreads(); | ||||
|  | ||||
|             for (int iAccum = threadIdx.x; iAccum < stride; iAccum += blockDim.x) | ||||
|             { | ||||
|                 accumResult[iAccum] += accumResult[stride + iAccum]; | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         if (threadIdx.x == 0) | ||||
|         	{ | ||||
|         		d_corr_out[vec] = accumResult[0]; | ||||
|         	} | ||||
|     } | ||||
| } | ||||
|  | ||||
|  | ||||
| /////////////////////////////////////////////////////////////////////////////// | ||||
| // Calculate scalar products of VectorN vectors of ElementN elements on GPU | ||||
| @@ -145,8 +221,9 @@ __global__ void scalarProdGPUCPXxN( | ||||
|     //////////////////////////////////////////////////////////////////////////// | ||||
|     for (int vec = blockIdx.x; vec < vectorN; vec += gridDim.x) | ||||
|     { | ||||
|         int vectorBase = IMUL(elementN, vec); | ||||
|         int vectorEnd  = vectorBase + elementN; | ||||
|         //int vectorBase = IMUL(elementN, vec); | ||||
|         //int vectorEnd  = vectorBase + elementN; | ||||
|  | ||||
|  | ||||
|         //////////////////////////////////////////////////////////////////////// | ||||
|         // Each accumulator cycles through vectors with | ||||
| @@ -158,11 +235,13 @@ __global__ void scalarProdGPUCPXxN( | ||||
|         { | ||||
|         	GPU_Complex sum = GPU_Complex(0,0); | ||||
|  | ||||
|             for (int pos = vectorBase + iAccum; pos < vectorEnd; pos += ACCUM_N) | ||||
|             //for (int pos = vectorBase + iAccum; pos < vectorEnd; pos += ACCUM_N) | ||||
|         	for (int pos = iAccum; pos < elementN; pos += ACCUM_N) | ||||
|             { | ||||
|                 //sum = sum + d_sig_in[pos-vectorBase] * d_nco_in[pos-vectorBase] * d_local_codes_in[pos]; | ||||
|             	//sum = sum + d_sig_in[pos-vectorBase] * d_local_codes_in[pos]; | ||||
|             	sum.multiply_acc(d_sig_in[pos-vectorBase],d_local_codes_in[pos]); | ||||
|             	//sum.multiply_acc(d_sig_in[pos-vectorBase],d_local_codes_in[pos]); | ||||
|         		sum.multiply_acc(d_sig_in[pos],d_local_codes_in[pos]); | ||||
|             } | ||||
|             accumResult[iAccum] = sum; | ||||
|         } | ||||
| @@ -200,9 +279,9 @@ __global__ void scalarProdGPUCPXxN( | ||||
|  */ | ||||
| __global__ void CUDA_32fc_x2_multiply_32fc(  GPU_Complex *A,   GPU_Complex  *B, GPU_Complex  *C, int numElements) | ||||
| { | ||||
|     int i = blockDim.x * blockIdx.x + threadIdx.x; | ||||
|  | ||||
|     if (i < numElements) | ||||
|     for (int i = blockIdx.x * blockDim.x + threadIdx.x; | ||||
|          i < numElements; | ||||
|          i += blockDim.x * gridDim.x) | ||||
|     { | ||||
|         C[i] =  A[i] * B[i]; | ||||
|     } | ||||
| @@ -232,10 +311,11 @@ CUDA_32fc_Doppler_wipeoff(  GPU_Complex *sig_out, GPU_Complex *sig_in, float rem | ||||
|  | ||||
| 	// CUDA version of floating point NCO and vector dot product integrated | ||||
|  | ||||
|     int i = blockDim.x * blockIdx.x + threadIdx.x; | ||||
|     float sin; | ||||
|     float cos; | ||||
|     if (i < numElements) | ||||
|     for (int i = blockIdx.x * blockDim.x + threadIdx.x; | ||||
|          i < numElements; | ||||
|          i += blockDim.x * gridDim.x) | ||||
|     { | ||||
|     	__sincosf(rem_carrier_phase_in_rad + i*phase_step_rad, &sin, &cos); | ||||
|     	sig_out[i] =  sig_in[i] * GPU_Complex(cos,-sin); | ||||
| @@ -252,11 +332,11 @@ CUDA_32fc_Doppler_wipeoff(  GPU_Complex *sig_out, GPU_Complex *sig_in, float rem | ||||
| __global__ void | ||||
| CUDA_32fc_x2_add_32fc(  GPU_Complex *A,   GPU_Complex  *B, GPU_Complex  *C, int numElements) | ||||
| { | ||||
|     int i = blockDim.x * blockIdx.x + threadIdx.x; | ||||
|  | ||||
|     if (i < numElements) | ||||
|     for (int i = blockIdx.x * blockDim.x + threadIdx.x; | ||||
|          i < numElements; | ||||
|          i += blockDim.x * gridDim.x) | ||||
|     { | ||||
|         C[i] =  A[i] * B[i]; | ||||
|         C[i] =  A[i] + B[i]; | ||||
|     } | ||||
| } | ||||
|  | ||||
| @@ -264,23 +344,53 @@ CUDA_32fc_x2_add_32fc(  GPU_Complex *A,   GPU_Complex  *B, GPU_Complex  *C, int | ||||
| bool cuda_multicorrelator::init_cuda(const int argc, const char **argv, int signal_length_samples, int local_codes_length_samples, int n_correlators) | ||||
| { | ||||
| 	// use command-line specified CUDA device, otherwise use device with highest Gflops/s | ||||
| 	findCudaDevice(argc, (const char **)argv); | ||||
|  | ||||
|     cudaDeviceProp  prop; | ||||
|     int whichDevice; | ||||
|     cudaGetDevice( &whichDevice ); | ||||
|     cudaGetDeviceProperties( &prop, whichDevice ); | ||||
|     //debug code | ||||
|     if (prop.canMapHostMemory != 1) { | ||||
|         printf( "Device can not map memory.\n" ); | ||||
|     } | ||||
|  | ||||
|     printf("L2 Cache size= %u \n",prop.l2CacheSize); | ||||
|     printf("maxThreadsPerBlock= %u \n",prop.maxThreadsPerBlock); | ||||
|     printf("maxGridSize= %i \n",prop.maxGridSize[0]); | ||||
|     printf("sharedMemPerBlock= %lu \n",prop.sharedMemPerBlock); | ||||
|     printf("deviceOverlap= %i \n",prop.deviceOverlap); | ||||
|     //end debug code | ||||
| //	findCudaDevice(argc, (const char **)argv); | ||||
| //      cudaDeviceProp  prop; | ||||
| //    int num_devices, device; | ||||
| //    cudaGetDeviceCount(&num_devices); | ||||
| //    if (num_devices > 1) { | ||||
| //          int max_multiprocessors = 0, max_device = 0; | ||||
| //          for (device = 0; device < num_devices; device++) { | ||||
| //                  cudaDeviceProp properties; | ||||
| //                  cudaGetDeviceProperties(&properties, device); | ||||
| //                  if (max_multiprocessors < properties.multiProcessorCount) { | ||||
| //                          max_multiprocessors = properties.multiProcessorCount; | ||||
| //                          max_device = device; | ||||
| //                  } | ||||
| //                  printf("Found GPU device # %i\n",device); | ||||
| //          } | ||||
| //          //cudaSetDevice(max_device); | ||||
| // | ||||
| //          //set random device! | ||||
| //          cudaSetDevice(rand() % num_devices); //generates a random number between 0 and num_devices to split the threads between GPUs | ||||
| // | ||||
| //          cudaGetDeviceProperties( &prop, max_device ); | ||||
| //          //debug code | ||||
| //          if (prop.canMapHostMemory != 1) { | ||||
| //              printf( "Device can not map memory.\n" ); | ||||
| //          } | ||||
| //          printf("L2 Cache size= %u \n",prop.l2CacheSize); | ||||
| //          printf("maxThreadsPerBlock= %u \n",prop.maxThreadsPerBlock); | ||||
| //          printf("maxGridSize= %i \n",prop.maxGridSize[0]); | ||||
| //          printf("sharedMemPerBlock= %lu \n",prop.sharedMemPerBlock); | ||||
| //          printf("deviceOverlap= %i \n",prop.deviceOverlap); | ||||
| //  	    printf("multiProcessorCount= %i \n",prop.multiProcessorCount); | ||||
| //    }else{ | ||||
| //    	    int whichDevice; | ||||
| //    	    cudaGetDevice( &whichDevice ); | ||||
| //    	    cudaGetDeviceProperties( &prop, whichDevice ); | ||||
| //    	    //debug code | ||||
| //    	    if (prop.canMapHostMemory != 1) { | ||||
| //    	        printf( "Device can not map memory.\n" ); | ||||
| //    	    } | ||||
| // | ||||
| //    	    printf("L2 Cache size= %u \n",prop.l2CacheSize); | ||||
| //    	    printf("maxThreadsPerBlock= %u \n",prop.maxThreadsPerBlock); | ||||
| //    	    printf("maxGridSize= %i \n",prop.maxGridSize[0]); | ||||
| //    	    printf("sharedMemPerBlock= %lu \n",prop.sharedMemPerBlock); | ||||
| //    	    printf("deviceOverlap= %i \n",prop.deviceOverlap); | ||||
| //    	    printf("multiProcessorCount= %i \n",prop.multiProcessorCount); | ||||
| //    } | ||||
|  | ||||
| 	//checkCudaErrors(cudaFuncSetCacheConfig(CUDA_32fc_x2_multiply_x2_dot_prod_32fc_, cudaFuncCachePreferShared)); | ||||
|  | ||||
| @@ -300,7 +410,7 @@ bool cuda_multicorrelator::init_cuda(const int argc, const char **argv, int sign | ||||
| 	// Required: The last correlator tap in d_shifts_samples has the largest sample shift | ||||
|     size_t size_local_code_bytes = local_codes_length_samples * sizeof(GPU_Complex); | ||||
| 	checkCudaErrors(cudaMalloc((void **)&d_local_codes_in, size_local_code_bytes)); | ||||
| 	checkCudaErrors(cudaMalloc((void **)&d_shifts_samples, size+sizeof(int)*n_correlators)); | ||||
| 	checkCudaErrors(cudaMalloc((void **)&d_shifts_samples, sizeof(int)*n_correlators)); | ||||
|  | ||||
| 	//scalars | ||||
| 	checkCudaErrors(cudaMalloc((void **)&d_corr_out, sizeof(std::complex<float>)*n_correlators)); | ||||
| @@ -315,6 +425,116 @@ bool cuda_multicorrelator::init_cuda(const int argc, const char **argv, int sign | ||||
| } | ||||
|  | ||||
|  | ||||
| bool cuda_multicorrelator::init_cuda_integrated_resampler( | ||||
| 		const int argc, const char **argv, | ||||
| 		int signal_length_samples, | ||||
| 		int code_length_chips, | ||||
| 		int n_correlators | ||||
| 		) | ||||
| { | ||||
| 	// use command-line specified CUDA device, otherwise use device with highest Gflops/s | ||||
| //	findCudaDevice(argc, (const char **)argv); | ||||
| //      cudaDeviceProp  prop; | ||||
| //    int num_devices, device; | ||||
| //    cudaGetDeviceCount(&num_devices); | ||||
| //    if (num_devices > 1) { | ||||
| //          int max_multiprocessors = 0, max_device = 0; | ||||
| //          for (device = 0; device < num_devices; device++) { | ||||
| //                  cudaDeviceProp properties; | ||||
| //                  cudaGetDeviceProperties(&properties, device); | ||||
| //                  if (max_multiprocessors < properties.multiProcessorCount) { | ||||
| //                          max_multiprocessors = properties.multiProcessorCount; | ||||
| //                          max_device = device; | ||||
| //                  } | ||||
| //                  printf("Found GPU device # %i\n",device); | ||||
| //          } | ||||
| //          //cudaSetDevice(max_device); | ||||
| // | ||||
| //          //set random device! | ||||
| //          cudaSetDevice(rand() % num_devices); //generates a random number between 0 and num_devices to split the threads between GPUs | ||||
| // | ||||
| //          cudaGetDeviceProperties( &prop, max_device ); | ||||
| //          //debug code | ||||
| //          if (prop.canMapHostMemory != 1) { | ||||
| //              printf( "Device can not map memory.\n" ); | ||||
| //          } | ||||
| //          printf("L2 Cache size= %u \n",prop.l2CacheSize); | ||||
| //          printf("maxThreadsPerBlock= %u \n",prop.maxThreadsPerBlock); | ||||
| //          printf("maxGridSize= %i \n",prop.maxGridSize[0]); | ||||
| //          printf("sharedMemPerBlock= %lu \n",prop.sharedMemPerBlock); | ||||
| //          printf("deviceOverlap= %i \n",prop.deviceOverlap); | ||||
| //  	    printf("multiProcessorCount= %i \n",prop.multiProcessorCount); | ||||
| //    }else{ | ||||
| //    	    int whichDevice; | ||||
| //    	    cudaGetDevice( &whichDevice ); | ||||
| //    	    cudaGetDeviceProperties( &prop, whichDevice ); | ||||
| //    	    //debug code | ||||
| //    	    if (prop.canMapHostMemory != 1) { | ||||
| //    	        printf( "Device can not map memory.\n" ); | ||||
| //    	    } | ||||
| // | ||||
| //    	    printf("L2 Cache size= %u \n",prop.l2CacheSize); | ||||
| //    	    printf("maxThreadsPerBlock= %u \n",prop.maxThreadsPerBlock); | ||||
| //    	    printf("maxGridSize= %i \n",prop.maxGridSize[0]); | ||||
| //    	    printf("sharedMemPerBlock= %lu \n",prop.sharedMemPerBlock); | ||||
| //    	    printf("deviceOverlap= %i \n",prop.deviceOverlap); | ||||
| //    	    printf("multiProcessorCount= %i \n",prop.multiProcessorCount); | ||||
| //    } | ||||
|  | ||||
| 	//checkCudaErrors(cudaFuncSetCacheConfig(CUDA_32fc_x2_multiply_x2_dot_prod_32fc_, cudaFuncCachePreferShared)); | ||||
|  | ||||
|     // ALLOCATE GPU MEMORY FOR INPUT/OUTPUT and INTERNAL vectors | ||||
|  | ||||
|     size_t size = signal_length_samples * sizeof(GPU_Complex); | ||||
|  | ||||
| 	checkCudaErrors(cudaMalloc((void **)&d_sig_in, size)); | ||||
| 	checkCudaErrors(cudaMemset(d_sig_in,0,size)); | ||||
|  | ||||
| 	//checkCudaErrors(cudaMalloc((void **)&d_nco_in, size)); | ||||
| 	checkCudaErrors(cudaMalloc((void **)&d_sig_doppler_wiped, size)); | ||||
| 	checkCudaErrors(cudaMemset(d_sig_doppler_wiped,0,size)); | ||||
|  | ||||
| 	checkCudaErrors(cudaMalloc((void **)&d_local_codes_in, sizeof(std::complex<float>)*code_length_chips)); | ||||
| 	checkCudaErrors(cudaMemset(d_local_codes_in,0,sizeof(std::complex<float>)*code_length_chips)); | ||||
|  | ||||
|     d_code_length_chips=code_length_chips; | ||||
|  | ||||
| 	checkCudaErrors(cudaMalloc((void **)&d_shifts_chips, sizeof(float)*n_correlators)); | ||||
| 	checkCudaErrors(cudaMemset(d_shifts_chips,0,sizeof(float)*n_correlators)); | ||||
|  | ||||
| 	//scalars | ||||
| 	checkCudaErrors(cudaMalloc((void **)&d_corr_out, sizeof(std::complex<float>)*n_correlators)); | ||||
| 	checkCudaErrors(cudaMemset(d_corr_out,0,sizeof(std::complex<float>)*n_correlators)); | ||||
|  | ||||
|     // Launch the Vector Add CUDA Kernel | ||||
| 	threadsPerBlock = 256; | ||||
|     blocksPerGrid =(int)(signal_length_samples+threadsPerBlock-1)/threadsPerBlock; | ||||
|  | ||||
| 	cudaStreamCreate (&stream1) ; | ||||
| 	cudaStreamCreate (&stream2) ; | ||||
| 	return true; | ||||
| } | ||||
|  | ||||
| bool cuda_multicorrelator::set_local_code_and_taps( | ||||
| 		int code_length_chips, | ||||
| 		const std::complex<float>* local_codes_in, | ||||
| 		float *shifts_chips, | ||||
| 		int n_correlators | ||||
| 		) | ||||
| { | ||||
|     // local code CPU -> GPU copy memory | ||||
|     checkCudaErrors(cudaMemcpyAsync(d_local_codes_in, local_codes_in, sizeof(GPU_Complex)*code_length_chips, cudaMemcpyHostToDevice,stream1)); | ||||
|     d_code_length_chips=(float)code_length_chips; | ||||
|  | ||||
|     // Correlator shifts vector CPU -> GPU copy memory (fractional chip shifts are allowed!) | ||||
|     checkCudaErrors(cudaMemcpyAsync(d_shifts_chips, shifts_chips, sizeof(float)*n_correlators, | ||||
|                                     cudaMemcpyHostToDevice,stream1)); | ||||
|  | ||||
| 	return true; | ||||
| } | ||||
|  | ||||
|  | ||||
|  | ||||
| bool cuda_multicorrelator::Carrier_wipeoff_multicorrelator_cuda( | ||||
| 		std::complex<float>* corr_out, | ||||
| 		const std::complex<float>* sig_in, | ||||
| @@ -396,13 +616,88 @@ bool cuda_multicorrelator::Carrier_wipeoff_multicorrelator_cuda( | ||||
|     return true; | ||||
| } | ||||
|  | ||||
| bool cuda_multicorrelator::Carrier_wipeoff_multicorrelator_resampler_cuda( | ||||
| 		std::complex<float>* corr_out, | ||||
| 		const std::complex<float>* sig_in, | ||||
| 		float rem_carrier_phase_in_rad, | ||||
| 		float phase_step_rad, | ||||
|         float code_phase_step_chips, | ||||
|         float rem_code_phase_chips, | ||||
| 		int signal_length_samples, | ||||
| 		int n_correlators) | ||||
| 	{ | ||||
|  | ||||
| 	size_t memSize = signal_length_samples * sizeof(std::complex<float>); | ||||
| 	// input signal CPU -> GPU copy memory | ||||
|     checkCudaErrors(cudaMemcpyAsync(d_sig_in, sig_in, memSize, | ||||
|                                     cudaMemcpyHostToDevice, stream2)); | ||||
|  | ||||
|     //***** NOTICE: NCO is computed on-the-fly, not need to copy NCO into GPU! **** | ||||
|  | ||||
|     //Launch carrier wipe-off kernel here, while local codes are being copied to GPU! | ||||
|     checkCudaErrors(cudaStreamSynchronize(stream2)); | ||||
|  | ||||
|     CUDA_32fc_Doppler_wipeoff<<<blocksPerGrid, threadsPerBlock,0, stream2>>>(d_sig_doppler_wiped, d_sig_in,rem_carrier_phase_in_rad,phase_step_rad, signal_length_samples); | ||||
|  | ||||
|     //wait for Doppler wipeoff end... | ||||
|     checkCudaErrors(cudaStreamSynchronize(stream1)); | ||||
|     checkCudaErrors(cudaStreamSynchronize(stream2)); | ||||
|  | ||||
|     //launch the multitap correlator with integrated local code resampler! | ||||
|  | ||||
|     scalarProdGPUCPXxN_shifts_chips<<<blocksPerGrid, threadsPerBlock,0 ,stream1>>>( | ||||
| 			d_corr_out, | ||||
| 			d_sig_doppler_wiped, | ||||
| 			d_local_codes_in, | ||||
| 			d_shifts_chips, | ||||
| 			d_code_length_chips, | ||||
| 	        code_phase_step_chips, | ||||
| 	        rem_code_phase_chips, | ||||
| 			n_correlators, | ||||
| 			signal_length_samples | ||||
| 		); | ||||
|  | ||||
|     checkCudaErrors(cudaGetLastError()); | ||||
|     //wait for correlators end... | ||||
|     checkCudaErrors(cudaStreamSynchronize(stream1)); | ||||
|     // Copy the device result vector in device memory to the host result vector | ||||
|     // in host memory. | ||||
|  | ||||
|     //scalar products (correlators outputs) | ||||
|     checkCudaErrors(cudaMemcpyAsync(corr_out, d_corr_out, sizeof(std::complex<float>)*n_correlators, | ||||
|             cudaMemcpyDeviceToHost,stream1)); | ||||
|     checkCudaErrors(cudaStreamSynchronize(stream1)); | ||||
|     return true; | ||||
| } | ||||
|  | ||||
|  | ||||
| cuda_multicorrelator::cuda_multicorrelator() | ||||
| { | ||||
| 	d_sig_in=NULL; | ||||
| 	d_nco_in=NULL; | ||||
| 	d_sig_doppler_wiped=NULL; | ||||
| 	d_local_codes_in=NULL; | ||||
| 	d_shifts_samples=NULL; | ||||
| 	d_shifts_chips=NULL; | ||||
| 	d_corr_out=NULL; | ||||
| 	threadsPerBlock=0; | ||||
| 	blocksPerGrid=0; | ||||
| 	d_code_length_chips=0; | ||||
| } | ||||
|  | ||||
| bool cuda_multicorrelator::free_cuda() | ||||
| { | ||||
| 	// Free device global memory | ||||
| 	cudaFree(d_sig_in); | ||||
| 	//cudaFree(d_nco_in); | ||||
| 	cudaFree(d_local_codes_in); | ||||
| 	cudaFree(d_corr_out); | ||||
| 	if (d_sig_in!=NULL) cudaFree(d_sig_in); | ||||
| 	if (d_nco_in!=NULL) cudaFree(d_nco_in); | ||||
| 	if (d_sig_doppler_wiped!=NULL) cudaFree(d_sig_doppler_wiped); | ||||
| 	if (d_local_codes_in!=NULL) cudaFree(d_local_codes_in); | ||||
| 	if (d_corr_out!=NULL) cudaFree(d_corr_out); | ||||
|  | ||||
|  | ||||
| 	if (d_shifts_samples!=NULL) cudaFree(d_shifts_samples); | ||||
| 	if (d_shifts_chips!=NULL) cudaFree(d_shifts_chips); | ||||
|  | ||||
|  | ||||
| 	cudaStreamDestroy(stream1) ; | ||||
| 	cudaStreamDestroy(stream2) ; | ||||
|   | ||||
| @@ -113,8 +113,20 @@ struct GPU_Complex_Short { | ||||
| class cuda_multicorrelator | ||||
| { | ||||
| public: | ||||
| 	cuda_multicorrelator(); | ||||
| 	bool init_cuda(const int argc, const char **argv, int signal_length_samples, int local_codes_length_samples, int n_correlators); | ||||
|  | ||||
| 	bool init_cuda_integrated_resampler( | ||||
| 			const int argc, const char **argv, | ||||
| 			int signal_length_samples, | ||||
| 			int code_length_chips, | ||||
| 			int n_correlators | ||||
| 			); | ||||
| 	bool set_local_code_and_taps( | ||||
| 			int code_length_chips, | ||||
| 			const std::complex<float>* local_codes_in, | ||||
| 			float *shifts_chips, | ||||
| 			int n_correlators | ||||
| 			); | ||||
| 	bool free_cuda(); | ||||
| 	bool Carrier_wipeoff_multicorrelator_cuda( | ||||
| 			std::complex<float>* corr_out, | ||||
| @@ -125,6 +137,15 @@ public: | ||||
| 			const int *shifts_samples, | ||||
| 			int signal_length_samples, | ||||
| 			int n_correlators); | ||||
| 	bool Carrier_wipeoff_multicorrelator_resampler_cuda( | ||||
| 			std::complex<float>* corr_out, | ||||
| 			const std::complex<float>* sig_in, | ||||
| 			float rem_carrier_phase_in_rad, | ||||
| 			float phase_step_rad, | ||||
| 	        float code_phase_step_chips, | ||||
| 	        float rem_code_phase_chips, | ||||
| 			int signal_length_samples, | ||||
| 			int n_correlators); | ||||
| private: | ||||
| 	// Allocate the device input vectors | ||||
| 	GPU_Complex *d_sig_in; | ||||
| @@ -133,6 +154,9 @@ private: | ||||
| 	GPU_Complex *d_local_codes_in; | ||||
| 	GPU_Complex *d_corr_out; | ||||
| 	int *d_shifts_samples; | ||||
| 	float *d_shifts_chips; | ||||
| 	float d_code_length_chips; | ||||
|  | ||||
| 	int threadsPerBlock; | ||||
| 	int blocksPerGrid; | ||||
|  | ||||
|   | ||||
| @@ -94,7 +94,7 @@ Tracking_2nd_PLL_filter::Tracking_2nd_PLL_filter () | ||||
| { | ||||
|     //--- PLL variables -------------------------------------------------------- | ||||
|     d_pdi_carr = 0.001;// Summation interval for carrier | ||||
|     d_plldampingratio = 0.65; | ||||
|     d_plldampingratio = 0.7; | ||||
| } | ||||
|  | ||||
|  | ||||
|   | ||||
| @@ -18,6 +18,7 @@ | ||||
|  | ||||
| if(ENABLE_CUDA) | ||||
| 	FIND_PACKAGE(CUDA REQUIRED) | ||||
| 	add_definitions(-DCUDA_GPU_ACCEL=1) | ||||
| endif(ENABLE_CUDA) | ||||
|  | ||||
| set(GNSS_RECEIVER_SOURCES | ||||
|   | ||||
| @@ -1610,12 +1610,14 @@ std::unique_ptr<TrackingInterface> GNSSBlockFactory::GetTrkBlock( | ||||
|                     out_streams, queue)); | ||||
|             block = std::move(block_); | ||||
|         } | ||||
| #if CUDA_GPU_ACCEL | ||||
|     else if (implementation.compare("GPS_L1_CA_DLL_PLL_Tracking_GPU") == 0) | ||||
|         { | ||||
|             std::unique_ptr<TrackingInterface> block_(new GpsL1CaDllPllTrackingGPU(configuration.get(), role, in_streams, | ||||
|                     out_streams, queue)); | ||||
|             block = std::move(block_); | ||||
|         } | ||||
| #endif | ||||
|     else | ||||
|         { | ||||
|             // Log fatal. This causes execution to stop. | ||||
|   | ||||
| @@ -33,6 +33,12 @@ if(ENABLE_UHD) | ||||
|     set(GNSS_SDR_OPTIONAL_HEADERS ${GNSS_SDR_OPTIONAL_HEADERS} ${UHD_INCLUDE_DIRS}) | ||||
| endif(ENABLE_UHD) | ||||
|  | ||||
| if(ENABLE_CUDA) | ||||
| 	FIND_PACKAGE(CUDA REQUIRED) | ||||
|     add_definitions(-DCUDA_GPU_ACCEL=1) | ||||
| endif(ENABLE_CUDA) | ||||
|  | ||||
|  | ||||
| include_directories( | ||||
|      ${CMAKE_SOURCE_DIR}/src/core/system_parameters | ||||
|      ${CMAKE_SOURCE_DIR}/src/core/interfaces | ||||
| @@ -48,6 +54,7 @@ include_directories( | ||||
|      ${GNURADIO_RUNTIME_INCLUDE_DIRS} | ||||
|      ${GNSS_SDR_OPTIONAL_HEADERS} | ||||
|      ${VOLK_GNSSSDR_INCLUDE_DIRS} | ||||
|      ${CUDA_INCLUDE_DIRS} | ||||
| ) | ||||
|  | ||||
| add_definitions( -DGNSS_SDR_VERSION="${VERSION}" ) | ||||
| @@ -79,6 +86,7 @@ target_link_libraries(gnss-sdr ${MAC_LIBRARIES} | ||||
|                                ${GNSS_SDR_OPTIONAL_LIBS} | ||||
|                                gnss_sp_libs | ||||
|                                gnss_rx | ||||
|                                ${CUDA_LIBRARIES} | ||||
|                                ) | ||||
|  | ||||
|  | ||||
|   | ||||
| @@ -68,6 +68,11 @@ | ||||
| #include "sbas_ephemeris.h" | ||||
| #include "sbas_time.h" | ||||
|  | ||||
| #if CUDA_GPU_ACCEL | ||||
| 	// For the CUDA runtime routines (prefixed with "cuda_") | ||||
| 	#include <cuda_runtime.h> | ||||
| #endif | ||||
|  | ||||
|  | ||||
| using google::LogMessage; | ||||
|  | ||||
| @@ -143,6 +148,17 @@ int main(int argc, char** argv) | ||||
|     google::ParseCommandLineFlags(&argc, &argv, true); | ||||
|     std::cout << "Initializing GNSS-SDR v" << gnss_sdr_version << " ... Please wait." << std::endl; | ||||
|  | ||||
| 	#if CUDA_GPU_ACCEL | ||||
| 		// Reset the device | ||||
| 		// cudaDeviceReset causes the driver to clean up all state. While | ||||
| 		// not mandatory in normal operation, it is good practice.  It is also | ||||
| 		// needed to ensure correct operation when the application is being | ||||
| 		// profiled. Calling cudaDeviceReset causes all profile data to be | ||||
| 		// flushed before the application exits | ||||
| 		cudaDeviceReset(); | ||||
| 		 std::cout << "Reset CUDA device done " << std::endl; | ||||
| 	#endif | ||||
|  | ||||
|     if(GOOGLE_STRIP_LOG == 0) | ||||
|         { | ||||
|             google::InitGoogleLogging(argv[0]); | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Javier Arribas
					Javier Arribas