mirror of https://github.com/gnss-sdr/gnss-sdr
	Some CUDA cleaning and documentation
@@ -51,7 +51,6 @@ class ConfigurationInterface;
 class GpsL1CaDllPllTrackingGPU : public TrackingInterface
 {
 public:
-
     GpsL1CaDllPllTrackingGPU(ConfigurationInterface* configuration,
             std::string role,
             unsigned int in_streams,
@@ -65,7 +64,7 @@ public:
         return role_;
     }

-    //! Returns "GPS_L1_CA_DLL_PLL_Tracking"
+    //! Returns "GPS_L1_CA_DLL_PLL_Tracking_GPU"
     std::string implementation()
     {
         return "GPS_L1_CA_DLL_PLL_Tracking_GPU";

@@ -123,7 +123,7 @@ Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc(
     d_ca_code = static_cast<gr_complex*>(volk_malloc((GPS_L1_CA_CODE_LENGTH_CHIPS) * sizeof(gr_complex), volk_get_alignment()));

     multicorrelator_gpu = new cuda_multicorrelator();
-    int N_CORRELATORS=3;
+    int N_CORRELATORS = 3;
     //local code resampler on CPU (old)
     //multicorrelator_gpu->init_cuda(0, NULL, 2 * d_vector_length , 2 * d_vector_length , N_CORRELATORS);

@@ -131,22 +131,22 @@ Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc(
     multicorrelator_gpu->init_cuda_integrated_resampler(0, NULL, 2 * d_vector_length , GPS_L1_CA_CODE_LENGTH_CHIPS , N_CORRELATORS);

     // Get space for the resampled early / prompt / late local replicas
-	checkCudaErrors(cudaHostAlloc((void**)&d_local_code_shift_chips, N_CORRELATORS * sizeof(float),  cudaHostAllocMapped ));
+    checkCudaErrors(cudaHostAlloc((void**)&d_local_code_shift_chips, N_CORRELATORS * sizeof(float),  cudaHostAllocMapped ));


     //allocate host memory
     //pinned memory mode - use special function to get OS-pinned memory
-	checkCudaErrors(cudaHostAlloc((void**)&in_gpu, 2 * d_vector_length  * sizeof(gr_complex),  cudaHostAllocMapped ));
+    checkCudaErrors(cudaHostAlloc((void**)&in_gpu, 2 * d_vector_length  * sizeof(gr_complex),  cudaHostAllocMapped ));

-	//old local codes vector
-	//checkCudaErrors(cudaHostAlloc((void**)&d_local_codes_gpu, (V_LEN * sizeof(gr_complex))*N_CORRELATORS, cudaHostAllocWriteCombined ));
+    //old local codes vector
+    //checkCudaErrors(cudaHostAlloc((void**)&d_local_codes_gpu, (V_LEN * sizeof(gr_complex))*N_CORRELATORS, cudaHostAllocWriteCombined ));

-	//new integrated shifts
-	//checkCudaErrors(cudaHostAlloc((void**)&d_local_codes_gpu, (2 * d_vector_length * sizeof(gr_complex)), cudaHostAllocWriteCombined ));
+    //new integrated shifts
+    //checkCudaErrors(cudaHostAlloc((void**)&d_local_codes_gpu, (2 * d_vector_length * sizeof(gr_complex)), cudaHostAllocWriteCombined ));

-	// correlator outputs (scalar)
-	checkCudaErrors(cudaHostAlloc((void**)&d_corr_outs_gpu ,sizeof(gr_complex)*N_CORRELATORS,  cudaHostAllocWriteCombined ));
-	//map to EPL pointers
+    // correlator outputs (scalar)
+    checkCudaErrors(cudaHostAlloc((void**)&d_corr_outs_gpu ,sizeof(gr_complex)*N_CORRELATORS,  cudaHostAllocWriteCombined ));
+    //map to EPL pointers
     d_Early = &d_corr_outs_gpu[0];
     d_Prompt =  &d_corr_outs_gpu[1];
     d_Late = &d_corr_outs_gpu[2];
@@ -284,13 +284,13 @@ Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::~Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc()
 {
     d_dump_file.close();

-	cudaFreeHost(in_gpu);
-	cudaFreeHost(d_carr_sign_gpu);
-	cudaFreeHost(d_corr_outs_gpu);
-	cudaFreeHost(d_local_code_shift_chips);
+    cudaFreeHost(in_gpu);
+    cudaFreeHost(d_carr_sign_gpu);
+    cudaFreeHost(d_corr_outs_gpu);
+    cudaFreeHost(d_local_code_shift_chips);

-	multicorrelator_gpu->free_cuda();
-	delete(multicorrelator_gpu);
+    multicorrelator_gpu->free_cuda();
+    delete(multicorrelator_gpu);

     volk_free(d_ca_code);

@@ -339,7 +339,7 @@ int Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::general_work (int noutput_items, gr_vecto
             // UPDATE NCO COMMAND
             float phase_step_rad = static_cast<float>(GPS_TWO_PI) * d_carrier_doppler_hz / static_cast<float>(d_fs_in);

-        	//code resampler on GPU (new)
+            //code resampler on GPU (new)
             float code_phase_step_chips = static_cast<float>(d_code_freq_chips) / static_cast<float>(d_fs_in);
             float rem_code_phase_chips = d_rem_code_phase_samples * (d_code_freq_chips / d_fs_in);

@@ -353,7 +353,7 @@ int Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::general_work (int noutput_items, gr_vecto
     				rem_code_phase_chips,
     				d_current_prn_length_samples,
     				3);
-        	cudaProfilerStop();
+            cudaProfilerStop();

             // ################## PLL ##########################################################
             // PLL discriminator

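Aside: the constructor and destructor shown above allocate the signal and correlator buffers as page-locked (pinned) host memory via cudaHostAlloc and release them with cudaFreeHost. The following minimal, self-contained sketch illustrates that pattern only; it is not taken from gnss-sdr. The CHECK macro stands in for the checkCudaErrors() helper from the CUDA samples, and the buffer sizes are placeholders.

#include <cuda_runtime.h>
#include <complex>
#include <cstdio>
#include <cstdlib>

// Minimal stand-in for checkCudaErrors() from the CUDA samples' helper_cuda.h
#define CHECK(call)                                                        \
    do {                                                                   \
        cudaError_t err__ = (call);                                        \
        if (err__ != cudaSuccess) {                                        \
            std::fprintf(stderr, "CUDA error %s at %s:%d\n",               \
                cudaGetErrorString(err__), __FILE__, __LINE__);            \
            std::exit(EXIT_FAILURE);                                       \
        }                                                                  \
    } while (0)

int main()
{
    const int n_correlators = 3;        // Early / Prompt / Late
    const int vector_length = 4000;     // placeholder block length in samples

    std::complex<float>* in_buffer = nullptr;
    std::complex<float>* corr_outs = nullptr;
    float* code_shift_chips = nullptr;

    // Pinned, mapped host memory: the device can address it directly (zero-copy)
    CHECK(cudaHostAlloc((void**)&in_buffer, 2 * vector_length * sizeof(std::complex<float>), cudaHostAllocMapped));
    CHECK(cudaHostAlloc((void**)&code_shift_chips, n_correlators * sizeof(float), cudaHostAllocMapped));
    // Write-combined pinned memory, as used for the correlator outputs above
    CHECK(cudaHostAlloc((void**)&corr_outs, n_correlators * sizeof(std::complex<float>), cudaHostAllocWriteCombined));

    // ... fill the input buffer and launch the correlation kernels here ...

    // Pinned allocations must be released with cudaFreeHost, not free()/delete
    CHECK(cudaFreeHost(in_buffer));
    CHECK(cudaFreeHost(code_shift_chips));
    CHECK(cudaFreeHost(corr_outs));
    return 0;
}
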
@@ -130,7 +130,7 @@ private:
     gr_complex* in_gpu;
     gr_complex* d_carr_sign_gpu;
     gr_complex* d_local_codes_gpu;
-	float* d_local_code_shift_chips;
+    float* d_local_code_shift_chips;
     gr_complex* d_corr_outs_gpu;
     cuda_multicorrelator *multicorrelator_gpu;

@@ -45,67 +45,67 @@
 #endif

 #include <complex>

 #include <cuda.h>
 // CUDA runtime
 #include <cuda_runtime.h>

 // GPU new internal data types for complex numbers

-struct GPU_Complex {
-	 float r;
-	 float i;
-	 CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex() {};
-	 CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex( float a, float b ) : r(a), i(b) {}
-	 CUDA_CALLABLE_MEMBER_DEVICE float magnitude2( void ) {
-		 return r * r + i * i;
-	 }
-	 CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex operator*(const GPU_Complex& a) {
-		#ifdef __CUDACC__
-		 return GPU_Complex(__fmul_rn(r,a.r) - __fmul_rn(i,a.i), __fmul_rn(i,a.r) + __fmul_rn(r,a.i));
-		#else
-		 return GPU_Complex(r*a.r - i*a.i, i*a.r + r*a.i);
-		#endif
-	 }
-	 CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex operator+(const GPU_Complex& a) {
-		 return GPU_Complex(r+a.r, i+a.i);
-	 }
-	 CUDA_CALLABLE_MEMBER_DEVICE void operator+=(const GPU_Complex& a) {
-		 r+=a.r;
-		 i+=a.i;
-	 }
-	 CUDA_CALLABLE_MEMBER_DEVICE void multiply_acc(const GPU_Complex& a, const GPU_Complex& b)
-	 {
-		 //c=a*b+c
-		 //real part
-		 //c.r=(a.r*b.r - a.i*b.i)+c.r
-		#ifdef __CUDACC__
-			 r=__fmaf_rn(a.r,b.r,r);
-			 r=__fmaf_rn(-a.i,b.i,r);
-			 //imag part
-			 i=__fmaf_rn(a.i,b.r,i);
-			 i=__fmaf_rn(a.r,b.i,i);
-		#else
-			 r=(a.r*b.r - a.i*b.i)+r;
-			 i=(a.i*b.r - a.r*b.i)+i;
-		#endif
+struct GPU_Complex
+{
+    float r;
+    float i;
+    CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex() {};
+    CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex( float a, float b ) : r(a), i(b) {}
+    CUDA_CALLABLE_MEMBER_DEVICE float magnitude2( void ) { return r * r + i * i; }
+    CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex operator*(const GPU_Complex& a)
+    {
+#ifdef __CUDACC__
+        return GPU_Complex(__fmul_rn(r, a.r) - __fmul_rn(i, a.i), __fmul_rn(i, a.r) + __fmul_rn(r, a.i));
+#else
+        return GPU_Complex(r*a.r - i*a.i, i*a.r + r*a.i);
+#endif
+    }
+    CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex operator+(const GPU_Complex& a)
+    {
+        return GPU_Complex(r + a.r, i + a.i);
+    }
+    CUDA_CALLABLE_MEMBER_DEVICE void operator+=(const GPU_Complex& a) { r += a.r; i += a.i; }
+    CUDA_CALLABLE_MEMBER_DEVICE void multiply_acc(const GPU_Complex& a, const GPU_Complex& b)
+    {
+        //c=a*b+c
+        //real part
+        //c.r=(a.r*b.r - a.i*b.i)+c.r
+#ifdef __CUDACC__
+        r = __fmaf_rn(a.r, b.r, r);
+        r = __fmaf_rn(-a.i, b.i, r);
+        //imag part
+        i = __fmaf_rn(a.i, b.r, i);
+        i = __fmaf_rn(a.r, b.i, i);
+#else
+        r = (a.r*b.r - a.i*b.i) + r;
+        i = (a.i*b.r - a.r*b.i) + i;
+#endif

-	 }
+    }
 };

-struct GPU_Complex_Short {
-	 float r;
-	 float i;
-	 CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex_Short( short int a, short int b ) : r(a), i(b) {}
-	 CUDA_CALLABLE_MEMBER_DEVICE float magnitude2( void ) {
-		 return r * r + i * i;
-	 }
-	 CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex_Short operator*(const GPU_Complex_Short& a) {
-		 return GPU_Complex_Short(r*a.r - i*a.i, i*a.r + r*a.i);
-	 }
-	 CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex_Short operator+(const GPU_Complex_Short& a) {
-		 return GPU_Complex_Short(r+a.r, i+a.i);
-	 }
+struct GPU_Complex_Short
+{
+    float r;
+    float i;
+    CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex_Short( short int a, short int b ) : r(a), i(b) {}
+    CUDA_CALLABLE_MEMBER_DEVICE float magnitude2( void )
+    {
+        return r * r + i * i;
+    }
+    CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex_Short operator*(const GPU_Complex_Short& a)
+    {
+        return GPU_Complex_Short(r*a.r - i*a.i, i*a.r + r*a.i);
+    }
+    CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex_Short operator+(const GPU_Complex_Short& a)
+    {
+        return GPU_Complex_Short(r+a.r, i+a.i);
+    }
 };
 /*!
  * \brief Class that implements carrier wipe-off and correlators using NVIDIA CUDA GPU accelerators.
@@ -113,58 +113,58 @@ struct GPU_Complex_Short {
 class cuda_multicorrelator
 {
 public:
-	cuda_multicorrelator();
-	bool init_cuda(const int argc, const char **argv, int signal_length_samples, int local_codes_length_samples, int n_correlators);
-	bool init_cuda_integrated_resampler(
-			const int argc, const char **argv,
-			int signal_length_samples,
-			int code_length_chips,
-			int n_correlators
-			);
-	bool set_local_code_and_taps(
-			int code_length_chips,
-			const std::complex<float>* local_codes_in,
-			float *shifts_chips,
-			int n_correlators
-			);
-	bool free_cuda();
-	bool Carrier_wipeoff_multicorrelator_cuda(
-			std::complex<float>* corr_out,
-			const std::complex<float>* sig_in,
-			const std::complex<float>* local_codes_in,
-			float rem_carrier_phase_in_rad,
-			float phase_step_rad,
-			const int *shifts_samples,
-			int signal_length_samples,
-			int n_correlators);
-	bool Carrier_wipeoff_multicorrelator_resampler_cuda(
-			std::complex<float>* corr_out,
-			const std::complex<float>* sig_in,
-			float rem_carrier_phase_in_rad,
-			float phase_step_rad,
-	        float code_phase_step_chips,
-	        float rem_code_phase_chips,
-			int signal_length_samples,
-			int n_correlators);
+    cuda_multicorrelator();
+    bool init_cuda(const int argc, const char **argv, int signal_length_samples, int local_codes_length_samples, int n_correlators);
+    bool init_cuda_integrated_resampler(
+            const int argc, const char **argv,
+            int signal_length_samples,
+            int code_length_chips,
+            int n_correlators
+    );
+    bool set_local_code_and_taps(
+            int code_length_chips,
+            const std::complex<float>* local_codes_in,
+            float *shifts_chips,
+            int n_correlators
+    );
+    bool free_cuda();
+    bool Carrier_wipeoff_multicorrelator_cuda(
+            std::complex<float>* corr_out,
+            const std::complex<float>* sig_in,
+            const std::complex<float>* local_codes_in,
+            float rem_carrier_phase_in_rad,
+            float phase_step_rad,
+            const int *shifts_samples,
+            int signal_length_samples,
+            int n_correlators);
+    bool Carrier_wipeoff_multicorrelator_resampler_cuda(
+            std::complex<float>* corr_out,
+            const std::complex<float>* sig_in,
+            float rem_carrier_phase_in_rad,
+            float phase_step_rad,
+            float code_phase_step_chips,
+            float rem_code_phase_chips,
+            int signal_length_samples,
+            int n_correlators);

 private:
-	// Allocate the device input vectors
-	GPU_Complex *d_sig_in;
-	GPU_Complex *d_nco_in;
-	GPU_Complex *d_sig_doppler_wiped;
-	GPU_Complex *d_local_codes_in;
-	GPU_Complex *d_corr_out;
-	int *d_shifts_samples;
-	float *d_shifts_chips;
-	float d_code_length_chips;
+    // Allocate the device input vectors
+    GPU_Complex *d_sig_in;
+    GPU_Complex *d_nco_in;
+    GPU_Complex *d_sig_doppler_wiped;
+    GPU_Complex *d_local_codes_in;
+    GPU_Complex *d_corr_out;
+    int *d_shifts_samples;
+    float *d_shifts_chips;
+    float d_code_length_chips;

-	int threadsPerBlock;
-	int blocksPerGrid;
-
-	cudaStream_t stream1;
-	cudaStream_t stream2;
-	int num_gpu_devices;
-	int selected_device;
+    int threadsPerBlock;
+    int blocksPerGrid;
+
+    cudaStream_t stream1;
+    cudaStream_t stream2;
+    int num_gpu_devices;
+    int selected_device;
 };

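Aside: to see how the declarations above fit together, here is a hedged sketch of the call sequence a tracking block might use, based only on the signatures shown in this diff. The header name "cuda_multicorrelator.h", the buffer sizes, the Early/Prompt/Late spacing and the phase-step values are illustrative assumptions, not gnss-sdr's actual configuration.

#include <complex>
#include <vector>
#include "cuda_multicorrelator.h"  // assumed header name for the class shown above

int main()
{
    const int n_correlators = 3;             // Early, Prompt, Late
    const int code_length_chips = 1023;      // GPS L1 C/A code length (placeholder for GPS_L1_CA_CODE_LENGTH_CHIPS)
    const int signal_length_samples = 4000;  // placeholder block size in samples

    // Host-side buffers (the real block keeps these in pinned memory, see the allocation sketch earlier)
    std::vector<std::complex<float>> sig_in(2 * signal_length_samples);
    std::vector<std::complex<float>> local_code(code_length_chips);
    std::vector<std::complex<float>> corr_out(n_correlators);
    float shifts_chips[3] = {-0.5f, 0.0f, 0.5f};  // E/P/L spacing in chips (illustrative)

    cuda_multicorrelator correlator;

    // One-time setup: device selection, device buffers and the integrated code resampler
    correlator.init_cuda_integrated_resampler(0, NULL, 2 * signal_length_samples, code_length_chips, n_correlators);
    correlator.set_local_code_and_taps(code_length_chips, local_code.data(), shifts_chips, n_correlators);

    // Per-block processing: carrier wipe-off, code resampling and the three correlations in one call
    float rem_carrier_phase_in_rad = 0.0f;
    float phase_step_rad = 0.001f;        // placeholder NCO phase increment per sample
    float code_phase_step_chips = 0.25f;  // placeholder chips advanced per sample
    float rem_code_phase_chips = 0.0f;
    correlator.Carrier_wipeoff_multicorrelator_resampler_cuda(
            corr_out.data(),
            sig_in.data(),
            rem_carrier_phase_in_rad,
            phase_step_rad,
            code_phase_step_chips,
            rem_code_phase_chips,
            signal_length_samples,
            n_correlators);

    correlator.free_cuda();
    return 0;
}

The setup mirrors the constructor shown earlier (init_cuda_integrated_resampler with twice the vector length and three correlators), and the per-block call matches the resampler-based correlation driven from general_work().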