Some CUDA cleaning and documentation

This commit is contained in:
Carles Fernandez 2015-08-25 19:01:02 +02:00
parent 429e4e8776
commit ef136e5c74
6 changed files with 154 additions and 138 deletions

View File

@ -47,6 +47,7 @@ option(ENABLE_GPROF "Enable the use of the GNU profiler tool 'gprof'" OFF)
# Acceleration
option(ENABLE_OPENCL "Enable building of processing blocks implemented with OpenCL (experimental)" OFF)
option(ENABLE_CUDA "Enable building of processing blocks implemented with CUDA (experimental, requires CUDA SDK)" OFF)
# Building and packaging options
option(ENABLE_GENERIC_ARCH "Builds a portable binary" OFF)
@ -883,6 +884,24 @@ else(ENABLE_OPENCL)
endif(ENABLE_OPENCL)
###############################################################################
# CUDA (OPTIONAL)
###############################################################################
if($ENV{CUDA_GPU_ACCEL})
message(STATUS "CUDA_GPU_ACCEL environment variable found." )
set(ENABLE_CUDA ON)
endif($ENV{CUDA_GPU_ACCEL})
if(ENABLE_CUDA)
FIND_PACKAGE(CUDA REQUIRED)
message(STATUS "NVIDIA CUDA GPU Acceleration will be enabled." )
message(STATUS "You can disable it with 'cmake -DENABLE_CUDA=OFF ../'" )
else(ENABLE_CUDA)
message(STATUS "NVIDIA CUDA GPU Acceleration will be not enabled." )
message(STATUS "Enable it with 'cmake -DENABLE_CUDA=ON ../' to add support for GPU-based acceleration using CUDA." )
endif(ENABLE_CUDA)
################################################################################
@ -955,20 +974,6 @@ else(ENABLE_OSMOSDR)
message(STATUS "Enable it with 'cmake -DENABLE_OSMOSDR=ON ../' to add support for OsmoSDR and other front-ends (HackRF, bladeRF, Realtek's RTL2832U-based USB dongles, etc.)" )
endif(ENABLE_OSMOSDR)
if($ENV{CUDA_GPU_ACCEL})
message(STATUS "CUDA_GPU_ACCEL environment variable found." )
set(ENABLE_CUDA ON)
endif($ENV{CUDA_GPU_ACCEL})
if(ENABLE_CUDA)
FIND_PACKAGE(CUDA REQUIRED)
message(STATUS "NVIDIA CUDA GPU Acceleration will be enabled." )
message(STATUS "You can disable it with 'cmake -DENABLE_CUDA=OFF ../'" )
else(ENABLE_CUDA)
message(STATUS "NVIDIA CUDA GPU Acceleration will is not enabled." )
message(STATUS "Enable it with 'cmake -DENABLE_CUDA=ON ../' to add support for GPU-based acceleration using CUDA." )
endif(ENABLE_CUDA)
if($ENV{FLEXIBAND_DRIVER})
message(STATUS "FLEXIBAND_DRIVER environment variable found." )

View File

@ -345,6 +345,19 @@ $ sudo make install
~~~~~~
###### Build CUDA support (OPTIONAL):
In order to enable the building of blocks that use CUDA, NVIDIA's parallel programming model for graphics processing unit (GPU) acceleration of data-parallel computations, you first need to install the CUDA Toolkit from the [NVIDIA Developers Download page](https://developer.nvidia.com/cuda-downloads "CUDA Downloads"). Then, build GNSS-SDR with:
~~~~~~
$ cmake -DENABLE_CUDA=ON ../
$ make
$ sudo make install
~~~~~~
Of course, you will also need a GPU that [supports CUDA](https://developer.nvidia.com/cuda-gpus "CUDA GPUs").
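If you are unsure whether the CUDA runtime can see your GPU, a minimal device-query program can tell you. This is only an illustrative sketch (not part of GNSS-SDR); it uses standard CUDA runtime calls, and the file name is arbitrary:
~~~~~~
// check_cuda_device.cu -- minimal sketch to list CUDA-capable GPUs (illustrative only)
#include <cstdio>
#include <cuda_runtime.h>

int main()
{
    int n_devices = 0;
    cudaError_t err = cudaGetDeviceCount(&n_devices);
    if (err != cudaSuccess)
        {
            std::printf("CUDA runtime error: %s\n", cudaGetErrorString(err));
            return 1;
        }
    if (n_devices == 0)
        {
            std::printf("No CUDA-capable GPU found.\n");
            return 1;
        }
    for (int i = 0; i < n_devices; i++)
        {
            cudaDeviceProp prop;
            cudaGetDeviceProperties(&prop, i);
            std::printf("Device %d: %s (compute capability %d.%d)\n", i, prop.name, prop.major, prop.minor);
        }
    return 0;
}
~~~~~~
Compile it with `nvcc check_cuda_device.cu -o check_cuda_device` and run it; if no device is listed, the GPU-accelerated tracking blocks will not be usable.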
###### Build a portable binary
In order to build an executable that does not depend on the specific SIMD instruction set present in the processor of the compiling machine, so that other users can run it on machines without those particular instruction sets, use:
@ -841,7 +854,6 @@ Tracking_1C.dump=false ; Enable internal binary data file logging [true] or [fal
Tracking_1C.dump_filename=./tracking_ch_ ; Log path and filename. Notice that the tracking channel will add "x.dat" where x is the channel number.
Tracking_1C.pll_bw_hz=50.0 ; PLL loop filter bandwidth [Hz]
Tracking_1C.dll_bw_hz=2.0 ; DLL loop filter bandwidth [Hz]
Tracking_1C.fll_bw_hz=10.0 ; FLL loop filter bandwidth [Hz]
Tracking_1C.order=3 ; PLL/DLL loop filter order [2] or [3]
Tracking_1C.early_late_space_chips=0.5 ; correlator early-late space [chips].
~~~~~~

View File

@ -51,7 +51,6 @@ class ConfigurationInterface;
class GpsL1CaDllPllTrackingGPU : public TrackingInterface
{
public:
GpsL1CaDllPllTrackingGPU(ConfigurationInterface* configuration,
std::string role,
unsigned int in_streams,
@ -65,7 +64,7 @@ public:
return role_;
}
//! Returns "GPS_L1_CA_DLL_PLL_Tracking"
//! Returns "GPS_L1_CA_DLL_PLL_Tracking_GPU"
std::string implementation()
{
return "GPS_L1_CA_DLL_PLL_Tracking_GPU";

View File

@ -123,7 +123,7 @@ Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc(
d_ca_code = static_cast<gr_complex*>(volk_malloc((GPS_L1_CA_CODE_LENGTH_CHIPS) * sizeof(gr_complex), volk_get_alignment()));
multicorrelator_gpu = new cuda_multicorrelator();
int N_CORRELATORS=3;
int N_CORRELATORS = 3;
//local code resampler on CPU (old)
//multicorrelator_gpu->init_cuda(0, NULL, 2 * d_vector_length , 2 * d_vector_length , N_CORRELATORS);
@ -131,22 +131,22 @@ Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc(
multicorrelator_gpu->init_cuda_integrated_resampler(0, NULL, 2 * d_vector_length , GPS_L1_CA_CODE_LENGTH_CHIPS , N_CORRELATORS);
// Get space for the resampled early / prompt / late local replicas
checkCudaErrors(cudaHostAlloc((void**)&d_local_code_shift_chips, N_CORRELATORS * sizeof(float), cudaHostAllocMapped ));
checkCudaErrors(cudaHostAlloc((void**)&d_local_code_shift_chips, N_CORRELATORS * sizeof(float), cudaHostAllocMapped ));
//allocate host memory
//pinned memory mode - use special function to get OS-pinned memory
checkCudaErrors(cudaHostAlloc((void**)&in_gpu, 2 * d_vector_length * sizeof(gr_complex), cudaHostAllocMapped ));
checkCudaErrors(cudaHostAlloc((void**)&in_gpu, 2 * d_vector_length * sizeof(gr_complex), cudaHostAllocMapped ));
//old local codes vector
//checkCudaErrors(cudaHostAlloc((void**)&d_local_codes_gpu, (V_LEN * sizeof(gr_complex))*N_CORRELATORS, cudaHostAllocWriteCombined ));
//old local codes vector
//checkCudaErrors(cudaHostAlloc((void**)&d_local_codes_gpu, (V_LEN * sizeof(gr_complex))*N_CORRELATORS, cudaHostAllocWriteCombined ));
//new integrated shifts
//checkCudaErrors(cudaHostAlloc((void**)&d_local_codes_gpu, (2 * d_vector_length * sizeof(gr_complex)), cudaHostAllocWriteCombined ));
//new integrated shifts
//checkCudaErrors(cudaHostAlloc((void**)&d_local_codes_gpu, (2 * d_vector_length * sizeof(gr_complex)), cudaHostAllocWriteCombined ));
// correlator outputs (scalar)
checkCudaErrors(cudaHostAlloc((void**)&d_corr_outs_gpu ,sizeof(gr_complex)*N_CORRELATORS, cudaHostAllocWriteCombined ));
//map to EPL pointers
// correlator outputs (scalar)
checkCudaErrors(cudaHostAlloc((void**)&d_corr_outs_gpu ,sizeof(gr_complex)*N_CORRELATORS, cudaHostAllocWriteCombined ));
//map to EPL pointers
d_Early = &d_corr_outs_gpu[0];
d_Prompt = &d_corr_outs_gpu[1];
d_Late = &d_corr_outs_gpu[2];
@ -284,13 +284,13 @@ Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::~Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc()
{
d_dump_file.close();
cudaFreeHost(in_gpu);
cudaFreeHost(d_carr_sign_gpu);
cudaFreeHost(d_corr_outs_gpu);
cudaFreeHost(d_local_code_shift_chips);
cudaFreeHost(in_gpu);
cudaFreeHost(d_carr_sign_gpu);
cudaFreeHost(d_corr_outs_gpu);
cudaFreeHost(d_local_code_shift_chips);
multicorrelator_gpu->free_cuda();
delete(multicorrelator_gpu);
multicorrelator_gpu->free_cuda();
delete(multicorrelator_gpu);
volk_free(d_ca_code);
@ -339,7 +339,7 @@ int Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::general_work (int noutput_items, gr_vecto
// UPDATE NCO COMMAND
float phase_step_rad = static_cast<float>(GPS_TWO_PI) * d_carrier_doppler_hz / static_cast<float>(d_fs_in);
//code resampler on GPU (new)
//code resampler on GPU (new)
float code_phase_step_chips = static_cast<float>(d_code_freq_chips) / static_cast<float>(d_fs_in);
float rem_code_phase_chips = d_rem_code_phase_samples * (d_code_freq_chips / d_fs_in);
@ -353,7 +353,7 @@ int Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::general_work (int noutput_items, gr_vecto
rem_code_phase_chips,
d_current_prn_length_samples,
3);
cudaProfilerStop();
cudaProfilerStop();
// ################## PLL ##########################################################
// PLL discriminator

View File

@ -130,7 +130,7 @@ private:
gr_complex* in_gpu;
gr_complex* d_carr_sign_gpu;
gr_complex* d_local_codes_gpu;
float* d_local_code_shift_chips;
float* d_local_code_shift_chips;
gr_complex* d_corr_outs_gpu;
cuda_multicorrelator *multicorrelator_gpu;

View File

@ -45,67 +45,67 @@
#endif
#include <complex>
#include <cuda.h>
// CUDA runtime
#include <cuda_runtime.h>
// GPU new internal data types for complex numbers
struct GPU_Complex {
float r;
float i;
CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex() {};
CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex( float a, float b ) : r(a), i(b) {}
CUDA_CALLABLE_MEMBER_DEVICE float magnitude2( void ) {
return r * r + i * i;
}
CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex operator*(const GPU_Complex& a) {
#ifdef __CUDACC__
return GPU_Complex(__fmul_rn(r,a.r) - __fmul_rn(i,a.i), __fmul_rn(i,a.r) + __fmul_rn(r,a.i));
#else
return GPU_Complex(r*a.r - i*a.i, i*a.r + r*a.i);
#endif
}
CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex operator+(const GPU_Complex& a) {
return GPU_Complex(r+a.r, i+a.i);
}
CUDA_CALLABLE_MEMBER_DEVICE void operator+=(const GPU_Complex& a) {
r+=a.r;
i+=a.i;
}
CUDA_CALLABLE_MEMBER_DEVICE void multiply_acc(const GPU_Complex& a, const GPU_Complex& b)
{
//c=a*b+c
//real part
//c.r=(a.r*b.r - a.i*b.i)+c.r
#ifdef __CUDACC__
r=__fmaf_rn(a.r,b.r,r);
r=__fmaf_rn(-a.i,b.i,r);
//imag part
i=__fmaf_rn(a.i,b.r,i);
i=__fmaf_rn(a.r,b.i,i);
#else
r=(a.r*b.r - a.i*b.i)+r;
i=(a.i*b.r - a.r*b.i)+i;
#endif
struct GPU_Complex
{
float r;
float i;
CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex() {};
CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex( float a, float b ) : r(a), i(b) {}
CUDA_CALLABLE_MEMBER_DEVICE float magnitude2( void ) { return r * r + i * i; }
CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex operator*(const GPU_Complex& a)
{
#ifdef __CUDACC__
return GPU_Complex(__fmul_rn(r, a.r) - __fmul_rn(i, a.i), __fmul_rn(i, a.r) + __fmul_rn(r, a.i));
#else
return GPU_Complex(r*a.r - i*a.i, i*a.r + r*a.i);
#endif
}
CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex operator+(const GPU_Complex& a)
{
return GPU_Complex(r + a.r, i + a.i);
}
CUDA_CALLABLE_MEMBER_DEVICE void operator+=(const GPU_Complex& a) { r += a.r; i += a.i; }
CUDA_CALLABLE_MEMBER_DEVICE void multiply_acc(const GPU_Complex& a, const GPU_Complex& b)
{
//c=a*b+c
//real part
//c.r=(a.r*b.r - a.i*b.i)+c.r
#ifdef __CUDACC__
r = __fmaf_rn(a.r, b.r, r);
r = __fmaf_rn(-a.i, b.i, r);
//imag part
i = __fmaf_rn(a.i, b.r, i);
i = __fmaf_rn(a.r, b.i, i);
#else
r = (a.r*b.r - a.i*b.i) + r;
i = (a.i*b.r + a.r*b.i) + i;
#endif
}
}
};
struct GPU_Complex_Short {
float r;
float i;
CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex_Short( short int a, short int b ) : r(a), i(b) {}
CUDA_CALLABLE_MEMBER_DEVICE float magnitude2( void ) {
return r * r + i * i;
}
CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex_Short operator*(const GPU_Complex_Short& a) {
return GPU_Complex_Short(r*a.r - i*a.i, i*a.r + r*a.i);
}
CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex_Short operator+(const GPU_Complex_Short& a) {
return GPU_Complex_Short(r+a.r, i+a.i);
}
struct GPU_Complex_Short
{
float r;
float i;
CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex_Short( short int a, short int b ) : r(a), i(b) {}
CUDA_CALLABLE_MEMBER_DEVICE float magnitude2( void )
{
return r * r + i * i;
}
CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex_Short operator*(const GPU_Complex_Short& a)
{
return GPU_Complex_Short(r*a.r - i*a.i, i*a.r + r*a.i);
}
CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex_Short operator+(const GPU_Complex_Short& a)
{
return GPU_Complex_Short(r+a.r, i+a.i);
}
};
/*!
* \brief Class that implements carrier wipe-off and correlators using NVIDIA CUDA GPU accelerators.
@ -113,58 +113,58 @@ struct GPU_Complex_Short {
class cuda_multicorrelator
{
public:
cuda_multicorrelator();
bool init_cuda(const int argc, const char **argv, int signal_length_samples, int local_codes_length_samples, int n_correlators);
bool init_cuda_integrated_resampler(
const int argc, const char **argv,
int signal_length_samples,
int code_length_chips,
int n_correlators
);
bool set_local_code_and_taps(
int code_length_chips,
const std::complex<float>* local_codes_in,
float *shifts_chips,
int n_correlators
);
bool free_cuda();
bool Carrier_wipeoff_multicorrelator_cuda(
std::complex<float>* corr_out,
const std::complex<float>* sig_in,
const std::complex<float>* local_codes_in,
float rem_carrier_phase_in_rad,
float phase_step_rad,
const int *shifts_samples,
int signal_length_samples,
int n_correlators);
bool Carrier_wipeoff_multicorrelator_resampler_cuda(
std::complex<float>* corr_out,
const std::complex<float>* sig_in,
float rem_carrier_phase_in_rad,
float phase_step_rad,
float code_phase_step_chips,
float rem_code_phase_chips,
int signal_length_samples,
int n_correlators);
cuda_multicorrelator();
bool init_cuda(const int argc, const char **argv, int signal_length_samples, int local_codes_length_samples, int n_correlators);
bool init_cuda_integrated_resampler(
const int argc, const char **argv,
int signal_length_samples,
int code_length_chips,
int n_correlators
);
bool set_local_code_and_taps(
int code_length_chips,
const std::complex<float>* local_codes_in,
float *shifts_chips,
int n_correlators
);
bool free_cuda();
bool Carrier_wipeoff_multicorrelator_cuda(
std::complex<float>* corr_out,
const std::complex<float>* sig_in,
const std::complex<float>* local_codes_in,
float rem_carrier_phase_in_rad,
float phase_step_rad,
const int *shifts_samples,
int signal_length_samples,
int n_correlators);
bool Carrier_wipeoff_multicorrelator_resampler_cuda(
std::complex<float>* corr_out,
const std::complex<float>* sig_in,
float rem_carrier_phase_in_rad,
float phase_step_rad,
float code_phase_step_chips,
float rem_code_phase_chips,
int signal_length_samples,
int n_correlators);
private:
// Allocate the device input vectors
GPU_Complex *d_sig_in;
GPU_Complex *d_nco_in;
GPU_Complex *d_sig_doppler_wiped;
GPU_Complex *d_local_codes_in;
GPU_Complex *d_corr_out;
int *d_shifts_samples;
float *d_shifts_chips;
float d_code_length_chips;
// Allocate the device input vectors
GPU_Complex *d_sig_in;
GPU_Complex *d_nco_in;
GPU_Complex *d_sig_doppler_wiped;
GPU_Complex *d_local_codes_in;
GPU_Complex *d_corr_out;
int *d_shifts_samples;
float *d_shifts_chips;
float d_code_length_chips;
int threadsPerBlock;
int blocksPerGrid;
cudaStream_t stream1;
cudaStream_t stream2;
int num_gpu_devices;
int selected_device;
int threadsPerBlock;
int blocksPerGrid;
cudaStream_t stream1;
cudaStream_t stream2;
int num_gpu_devices;
int selected_device;
};
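For context, the tracking block shown earlier drives this class through a short sequence: initialize the integrated resampler once, load the local code and tap shifts, call the carrier wipe-off / resampling correlator once per integration period, and release the GPU resources at the end. The sketch below condenses that sequence; the header name is assumed from the class name, and all numeric values are illustrative:
~~~~~~
// Condensed usage sketch of cuda_multicorrelator (illustrative values, no error handling).
// The included header name is assumed; adapt it to the actual file layout.
#include <complex>
#include <cstddef>
#include "cuda_multicorrelator.h"

void example_epl_correlation(const std::complex<float>* signal_samples,
        const std::complex<float>* local_code,
        int n_samples, int code_length_chips)
{
    const int n_correlators = 3;                    // Early, Prompt, Late
    float shifts_chips[3] = {-0.25f, 0.0f, 0.25f};  // illustrative early-late spacing [chips]
    std::complex<float> corr_out[3];

    cuda_multicorrelator correlator;
    correlator.init_cuda_integrated_resampler(0, NULL, 2 * n_samples, code_length_chips, n_correlators);
    correlator.set_local_code_and_taps(code_length_chips, local_code, shifts_chips, n_correlators);

    // One integration period: carrier wipe-off, code resampling and correlation on the GPU.
    correlator.Carrier_wipeoff_multicorrelator_resampler_cuda(
        corr_out,        // Early / Prompt / Late outputs
        signal_samples,  // input baseband samples
        0.0f,            // remnant carrier phase [rad]
        0.001f,          // carrier phase step per sample [rad], illustrative
        0.5f,            // code phase step per sample [chips], illustrative
        0.0f,            // remnant code phase [chips]
        n_samples,
        n_correlators);

    correlator.free_cuda();
}
~~~~~~
In the actual tracking block the correlator output buffer lives in mapped pinned memory, so d_Early, d_Prompt and d_Late simply point into it, as shown in the constructor diff above.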