Some CUDA cleaning and documentation

2025-10-30 23:03:05 +00:00 · 2015-08-25 19:01:02 +02:00
parent 429e4e8776
commit ef136e5c74
6 changed files with 154 additions and 138 deletions
--- a/src/algorithms/tracking/adapters/gps_l1_ca_dll_pll_tracking_gpu.h
+++ b/src/algorithms/tracking/adapters/gps_l1_ca_dll_pll_tracking_gpu.h
@@ -51,7 +51,6 @@ class ConfigurationInterface;
 class GpsL1CaDllPllTrackingGPU : public TrackingInterface
 {
 public:
-
  GpsL1CaDllPllTrackingGPU(ConfigurationInterface* configuration,
            std::string role,
            unsigned int in_streams,
@@ -65,7 +64,7 @@ public:
        return role_;
    }

-    //! Returns "GPS_L1_CA_DLL_PLL_Tracking"
+    //! Returns "GPS_L1_CA_DLL_PLL_Tracking_GPU"
    std::string implementation()
    {
        return "GPS_L1_CA_DLL_PLL_Tracking_GPU";
--- a/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_gpu_cc.cc
+++ b/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_gpu_cc.cc
@@ -123,7 +123,7 @@ Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc(
    d_ca_code = static_cast<gr_complex*>(volk_malloc((GPS_L1_CA_CODE_LENGTH_CHIPS) * sizeof(gr_complex), volk_get_alignment()));

    multicorrelator_gpu = new cuda_multicorrelator();
-    int N_CORRELATORS=3;
+    int N_CORRELATORS = 3;
    //local code resampler on CPU (old)
    //multicorrelator_gpu->init_cuda(0, NULL, 2 * d_vector_length , 2 * d_vector_length , N_CORRELATORS);

@@ -131,22 +131,22 @@ Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc(
    multicorrelator_gpu->init_cuda_integrated_resampler(0, NULL, 2 * d_vector_length , GPS_L1_CA_CODE_LENGTH_CHIPS , N_CORRELATORS);

    // Get space for the resampled early / prompt / late local replicas
-	checkCudaErrors(cudaHostAlloc((void**)&d_local_code_shift_chips, N_CORRELATORS * sizeof(float),  cudaHostAllocMapped ));
+    checkCudaErrors(cudaHostAlloc((void**)&d_local_code_shift_chips, N_CORRELATORS * sizeof(float),  cudaHostAllocMapped ));


    //allocate host memory
    //pinned memory mode - use special function to get OS-pinned memory
-	checkCudaErrors(cudaHostAlloc((void**)&in_gpu, 2 * d_vector_length  * sizeof(gr_complex),  cudaHostAllocMapped ));
+    checkCudaErrors(cudaHostAlloc((void**)&in_gpu, 2 * d_vector_length  * sizeof(gr_complex),  cudaHostAllocMapped ));

-	//old local codes vector
-	//checkCudaErrors(cudaHostAlloc((void**)&d_local_codes_gpu, (V_LEN * sizeof(gr_complex))*N_CORRELATORS, cudaHostAllocWriteCombined ));
+    //old local codes vector
+    //checkCudaErrors(cudaHostAlloc((void**)&d_local_codes_gpu, (V_LEN * sizeof(gr_complex))*N_CORRELATORS, cudaHostAllocWriteCombined ));

-	//new integrated shifts
-	//checkCudaErrors(cudaHostAlloc((void**)&d_local_codes_gpu, (2 * d_vector_length * sizeof(gr_complex)), cudaHostAllocWriteCombined ));
+    //new integrated shifts
+    //checkCudaErrors(cudaHostAlloc((void**)&d_local_codes_gpu, (2 * d_vector_length * sizeof(gr_complex)), cudaHostAllocWriteCombined ));

-	// correlator outputs (scalar)
-	checkCudaErrors(cudaHostAlloc((void**)&d_corr_outs_gpu ,sizeof(gr_complex)*N_CORRELATORS,  cudaHostAllocWriteCombined ));
-	//map to EPL pointers
+    // correlator outputs (scalar)
+    checkCudaErrors(cudaHostAlloc((void**)&d_corr_outs_gpu ,sizeof(gr_complex)*N_CORRELATORS,  cudaHostAllocWriteCombined ));
+    //map to EPL pointers
    d_Early = &d_corr_outs_gpu[0];
    d_Prompt =  &d_corr_outs_gpu[1];
    d_Late = &d_corr_outs_gpu[2];
@@ -284,13 +284,13 @@ Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::~Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc()
 {
    d_dump_file.close();

-	cudaFreeHost(in_gpu);
-	cudaFreeHost(d_carr_sign_gpu);
-	cudaFreeHost(d_corr_outs_gpu);
-	cudaFreeHost(d_local_code_shift_chips);
+    cudaFreeHost(in_gpu);
+    cudaFreeHost(d_carr_sign_gpu);
+    cudaFreeHost(d_corr_outs_gpu);
+    cudaFreeHost(d_local_code_shift_chips);

-	multicorrelator_gpu->free_cuda();
-	delete(multicorrelator_gpu);
+    multicorrelator_gpu->free_cuda();
+    delete(multicorrelator_gpu);

    volk_free(d_ca_code);

@@ -339,7 +339,7 @@ int Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::general_work (int noutput_items, gr_vecto
            // UPDATE NCO COMMAND
            float phase_step_rad = static_cast<float>(GPS_TWO_PI) * d_carrier_doppler_hz / static_cast<float>(d_fs_in);

-        	//code resampler on GPU (new)
+            //code resampler on GPU (new)
            float code_phase_step_chips = static_cast<float>(d_code_freq_chips) / static_cast<float>(d_fs_in);
            float rem_code_phase_chips = d_rem_code_phase_samples * (d_code_freq_chips / d_fs_in);

@@ -353,7 +353,7 @@ int Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::general_work (int noutput_items, gr_vecto
    				rem_code_phase_chips,
    				d_current_prn_length_samples,
    				3);
-        	cudaProfilerStop();
+            cudaProfilerStop();

            // ################## PLL ##########################################################
            // PLL discriminator
--- a/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_gpu_cc.h
+++ b/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_gpu_cc.h
@@ -130,7 +130,7 @@ private:
    gr_complex* in_gpu;
    gr_complex* d_carr_sign_gpu;
    gr_complex* d_local_codes_gpu;
-	float* d_local_code_shift_chips;
+    float* d_local_code_shift_chips;
    gr_complex* d_corr_outs_gpu;
    cuda_multicorrelator *multicorrelator_gpu;

--- a/src/algorithms/tracking/libs/cuda_multicorrelator.h
+++ b/src/algorithms/tracking/libs/cuda_multicorrelator.h
@@ -45,67 +45,67 @@
 #endif

 #include <complex>
-
 #include <cuda.h>
-// CUDA runtime
 #include <cuda_runtime.h>

 // GPU new internal data types for complex numbers

-struct GPU_Complex {
-	 float r;
-	 float i;
-	 CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex() {};
-	 CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex( float a, float b ) : r(a), i(b) {}
-	 CUDA_CALLABLE_MEMBER_DEVICE float magnitude2( void ) {
-		 return r * r + i * i;
-	 }
-	 CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex operator*(const GPU_Complex& a) {
-		#ifdef __CUDACC__
-		 return GPU_Complex(__fmul_rn(r,a.r) - __fmul_rn(i,a.i), __fmul_rn(i,a.r) + __fmul_rn(r,a.i));
-		#else
-		 return GPU_Complex(r*a.r - i*a.i, i*a.r + r*a.i);
-		#endif
-	 }
-	 CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex operator+(const GPU_Complex& a) {
-		 return GPU_Complex(r+a.r, i+a.i);
-	 }
-	 CUDA_CALLABLE_MEMBER_DEVICE void operator+=(const GPU_Complex& a) {
-		 r+=a.r;
-		 i+=a.i;
-	 }
-	 CUDA_CALLABLE_MEMBER_DEVICE void multiply_acc(const GPU_Complex& a, const GPU_Complex& b)
-	 {
-		 //c=a*b+c
-		 //real part
-		 //c.r=(a.r*b.r - a.i*b.i)+c.r
-		#ifdef __CUDACC__
-			 r=__fmaf_rn(a.r,b.r,r);
-			 r=__fmaf_rn(-a.i,b.i,r);
-			 //imag part
-			 i=__fmaf_rn(a.i,b.r,i);
-			 i=__fmaf_rn(a.r,b.i,i);
-		#else
-			 r=(a.r*b.r - a.i*b.i)+r;
-			 i=(a.i*b.r - a.r*b.i)+i;
-		#endif
+struct GPU_Complex
+{
+    float r;
+    float i;
+    CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex() {};
+    CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex( float a, float b ) : r(a), i(b) {}
+    CUDA_CALLABLE_MEMBER_DEVICE float magnitude2( void ) { return r * r + i * i; }
+    CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex operator*(const GPU_Complex& a)
+    {
+#ifdef __CUDACC__
+        return GPU_Complex(__fmul_rn(r, a.r) - __fmul_rn(i, a.i), __fmul_rn(i, a.r) + __fmul_rn(r, a.i));
+#else
+        return GPU_Complex(r*a.r - i*a.i, i*a.r + r*a.i);
+#endif
+    }
+    CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex operator+(const GPU_Complex& a)
+    {
+        return GPU_Complex(r + a.r, i + a.i);
+    }
+    CUDA_CALLABLE_MEMBER_DEVICE void operator+=(const GPU_Complex& a) { r += a.r; i += a.i; }
+    CUDA_CALLABLE_MEMBER_DEVICE void multiply_acc(const GPU_Complex& a, const GPU_Complex& b)
+    {
+        // Complex multiply-accumulate into this object: (*this) += a * b
+        //   r += a.r*b.r - a.i*b.i
+        //   i += a.i*b.r + a.r*b.i
+#ifdef __CUDACC__
+        r = __fmaf_rn(a.r, b.r, r);
+        r = __fmaf_rn(-a.i, b.i, r);
+        // imaginary part: both cross terms are added
+        i = __fmaf_rn(a.i, b.r, i);
+        i = __fmaf_rn(a.r, b.i, i);
+#else
+        r = (a.r*b.r - a.i*b.i) + r;
+        i = (a.i*b.r + a.r*b.i) + i; // '+' cross term, same result as the __CUDACC__ branch
+#endif

-	 }
+    }
 };

-struct GPU_Complex_Short {
-	 float r;
-	 float i;
-	 CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex_Short( short int a, short int b ) : r(a), i(b) {}
-	 CUDA_CALLABLE_MEMBER_DEVICE float magnitude2( void ) {
-		 return r * r + i * i;
-	 }
-	 CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex_Short operator*(const GPU_Complex_Short& a) {
-		 return GPU_Complex_Short(r*a.r - i*a.i, i*a.r + r*a.i);
-	 }
-	 CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex_Short operator+(const GPU_Complex_Short& a) {
-		 return GPU_Complex_Short(r+a.r, i+a.i);
-	 }
+struct GPU_Complex_Short
+{
+    float r;
+    float i;
+    CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex_Short( short int a, short int b ) : r(a), i(b) {}
+    CUDA_CALLABLE_MEMBER_DEVICE float magnitude2( void )
+    {
+        return r * r + i * i;
+    }
+    CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex_Short operator*(const GPU_Complex_Short& a)
+    {
+        return GPU_Complex_Short(r*a.r - i*a.i, i*a.r + r*a.i);
+    }
+    CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex_Short operator+(const GPU_Complex_Short& a)
+    {
+        return GPU_Complex_Short(r+a.r, i+a.i);
+    }
 };
 /*!
 * \brief Class that implements carrier wipe-off and correlators using NVIDIA CUDA GPU accelerators.
@@ -113,58 +113,58 @@ struct GPU_Complex_Short {
 class cuda_multicorrelator
 {
 public:
-	cuda_multicorrelator();
-	bool init_cuda(const int argc, const char **argv, int signal_length_samples, int local_codes_length_samples, int n_correlators);
-	bool init_cuda_integrated_resampler(
-			const int argc, const char **argv,
-			int signal_length_samples,
-			int code_length_chips,
-			int n_correlators
-			);
-	bool set_local_code_and_taps(
-			int code_length_chips,
-			const std::complex<float>* local_codes_in,
-			float *shifts_chips,
-			int n_correlators
-			);
-	bool free_cuda();
-	bool Carrier_wipeoff_multicorrelator_cuda(
-			std::complex<float>* corr_out,
-			const std::complex<float>* sig_in,
-			const std::complex<float>* local_codes_in,
-			float rem_carrier_phase_in_rad,
-			float phase_step_rad,
-			const int *shifts_samples,
-			int signal_length_samples,
-			int n_correlators);
-	bool Carrier_wipeoff_multicorrelator_resampler_cuda(
-			std::complex<float>* corr_out,
-			const std::complex<float>* sig_in,
-			float rem_carrier_phase_in_rad,
-			float phase_step_rad,
-	        float code_phase_step_chips,
-	        float rem_code_phase_chips,
-			int signal_length_samples,
-			int n_correlators);
+    cuda_multicorrelator();
+    bool init_cuda(const int argc, const char **argv, int signal_length_samples, int local_codes_length_samples, int n_correlators);
+    bool init_cuda_integrated_resampler(
+            const int argc, const char **argv,
+            int signal_length_samples,
+            int code_length_chips,
+            int n_correlators
+    );
+    bool set_local_code_and_taps(
+            int code_length_chips,
+            const std::complex<float>* local_codes_in,
+            float *shifts_chips,
+            int n_correlators
+    );
+    bool free_cuda();
+    bool Carrier_wipeoff_multicorrelator_cuda(
+            std::complex<float>* corr_out,
+            const std::complex<float>* sig_in,
+            const std::complex<float>* local_codes_in,
+            float rem_carrier_phase_in_rad,
+            float phase_step_rad,
+            const int *shifts_samples,
+            int signal_length_samples,
+            int n_correlators);
+    bool Carrier_wipeoff_multicorrelator_resampler_cuda(
+            std::complex<float>* corr_out,
+            const std::complex<float>* sig_in,
+            float rem_carrier_phase_in_rad,
+            float phase_step_rad,
+            float code_phase_step_chips,
+            float rem_code_phase_chips,
+            int signal_length_samples,
+            int n_correlators);
+
 private:
-	// Allocate the device input vectors
-	GPU_Complex *d_sig_in;
-	GPU_Complex *d_nco_in;
-	GPU_Complex *d_sig_doppler_wiped;
-	GPU_Complex *d_local_codes_in;
-	GPU_Complex *d_corr_out;
-	int *d_shifts_samples;
-	float *d_shifts_chips;
-	float d_code_length_chips;
+    // Allocate the device input vectors
+    GPU_Complex *d_sig_in;
+    GPU_Complex *d_nco_in;
+    GPU_Complex *d_sig_doppler_wiped;
+    GPU_Complex *d_local_codes_in;
+    GPU_Complex *d_corr_out;
+    int *d_shifts_samples;
+    float *d_shifts_chips;
+    float d_code_length_chips;

-	int threadsPerBlock;
-	int blocksPerGrid;
-
-	cudaStream_t stream1;
-	cudaStream_t stream2;
-	int num_gpu_devices;
-	int selected_device;
+    int threadsPerBlock;
+    int blocksPerGrid;

+    cudaStream_t stream1;
+    cudaStream_t stream2;
+    int num_gpu_devices;
+    int selected_device;
 };