Some CUDA cleaning and documentation

2015-08-25 19:01:02 +02:00 · 2015-08-25 19:01:02 +02:00 · ef136e5c74
parent 429e4e8776
commit ef136e5c74
6 changed files with 154 additions and 138 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -47,6 +47,7 @@ option(ENABLE_GPROF "Enable the use of the GNU profiler tool 'gprof'" OFF)

 # Acceleration
 option(ENABLE_OPENCL "Enable building of processing blocks implemented with OpenCL (experimental)" OFF)
+option(ENABLE_CUDA "Enable building of processing blocks implemented with CUDA (experimental, requires CUDA SDK)" OFF)

 # Building and packaging options
 option(ENABLE_GENERIC_ARCH "Builds a portable binary" OFF)
@ -883,6 +884,24 @@ else(ENABLE_OPENCL)
 endif(ENABLE_OPENCL)


+###############################################################################
+# CUDA (OPTIONAL)
+###############################################################################
+if($ENV{CUDA_GPU_ACCEL})
+    message(STATUS "CUDA_GPU_ACCEL environment variable found." )
+    set(ENABLE_CUDA ON)
+endif($ENV{CUDA_GPU_ACCEL})
+
+if(ENABLE_CUDA)
+    FIND_PACKAGE(CUDA REQUIRED)  # REQUIRED: configuration stops with an error if CUDA is not found
+    message(STATUS "NVIDIA CUDA GPU Acceleration will be enabled." )
+    message(STATUS "You can disable it with 'cmake -DENABLE_CUDA=OFF ../'" )
+else(ENABLE_CUDA)
+    message(STATUS "NVIDIA CUDA GPU Acceleration will not be enabled." )
+    message(STATUS "Enable it with 'cmake -DENABLE_CUDA=ON ../' to add support for GPU-based acceleration using CUDA." )
+endif(ENABLE_CUDA)
+
+


 ################################################################################
@ -955,20 +974,6 @@ else(ENABLE_OSMOSDR)
    message(STATUS "Enable it with 'cmake -DENABLE_OSMOSDR=ON ../' to add support for OsmoSDR and other front-ends (HackRF, bladeRF, Realtek's RTL2832U-based USB dongles, etc.)" )
 endif(ENABLE_OSMOSDR)

-if($ENV{CUDA_GPU_ACCEL})
-    message(STATUS "CUDA_GPU_ACCEL environment variable found." )
-    set(ENABLE_CUDA ON)
-endif($ENV{CUDA_GPU_ACCEL})
-
-if(ENABLE_CUDA)
-    FIND_PACKAGE(CUDA REQUIRED)
-    message(STATUS "NVIDIA CUDA GPU Acceleration will be enabled." )
-    message(STATUS "You can disable it with 'cmake -DENABLE_CUDA=OFF ../'" )
-else(ENABLE_CUDA)
-    message(STATUS "NVIDIA CUDA GPU Acceleration will is not enabled." )
-    message(STATUS "Enable it with 'cmake -DENABLE_CUDA=ON ../' to add support for GPU-based acceleration using CUDA." )
-endif(ENABLE_CUDA)
-

 if($ENV{FLEXIBAND_DRIVER})
    message(STATUS "FLEXIBAND_DRIVER environment variable found." )
--- a/README.md
+++ b/README.md
@ -345,6 +345,19 @@ $ sudo make install
 ~~~~~~ 


+###### Build CUDA support (OPTIONAL):
+
+In order to enable the building of blocks that use CUDA, NVIDIA's parallel programming model that enables graphics processing unit (GPU) acceleration for data-parallel computations, you first need to install the CUDA Toolkit from the [NVIDIA Developers Download page](https://developer.nvidia.com/cuda-downloads "CUDA Downloads"). Then, build GNSS-SDR by doing:
+
+~~~~~~ 
+$ cmake -DENABLE_CUDA=ON ../
+$ make
+$ sudo make install
+~~~~~~ 
+
+Of course, you will also need a GPU that [supports CUDA](https://developer.nvidia.com/cuda-gpus "CUDA GPUs").
+
+
 ###### Build a portable binary

 In order to build an executable that not depends on the specific SIMD instruction set that is present in the processor of the compiling machine, so other users can execute it in other machines without those particular sets, use:
@ -841,7 +854,6 @@ Tracking_1C.dump=false ; Enable internal binary data file logging [true] or [fal
 Tracking_1C.dump_filename=./tracking_ch_ ; Log path and filename. Notice that the tracking channel will add "x.dat" where x is the channel number.
 Tracking_1C.pll_bw_hz=50.0 ; PLL loop filter bandwidth [Hz]
 Tracking_1C.dll_bw_hz=2.0 ; DLL loop filter bandwidth [Hz]
-Tracking_1C.fll_bw_hz=10.0 ; FLL loop filter bandwidth [Hz]
 Tracking_1C.order=3 ; PLL/DLL loop filter order [2] or [3]
 Tracking_1C.early_late_space_chips=0.5 ; correlator early-late space [chips]. 
 ~~~~~~ 
--- a/src/algorithms/tracking/adapters/gps_l1_ca_dll_pll_tracking_gpu.h
+++ b/src/algorithms/tracking/adapters/gps_l1_ca_dll_pll_tracking_gpu.h
@ -51,7 +51,6 @@ class ConfigurationInterface;
 class GpsL1CaDllPllTrackingGPU : public TrackingInterface
 {
 public:
-
  GpsL1CaDllPllTrackingGPU(ConfigurationInterface* configuration,
            std::string role,
            unsigned int in_streams,
@ -65,7 +64,7 @@ public:
        return role_;
    }

-    //! Returns "GPS_L1_CA_DLL_PLL_Tracking"
+    //! Returns "GPS_L1_CA_DLL_PLL_Tracking_GPU"
    std::string implementation()
    {
        return "GPS_L1_CA_DLL_PLL_Tracking_GPU";
--- a/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_gpu_cc.cc
+++ b/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_gpu_cc.cc
@ -123,7 +123,7 @@ Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc(
    d_ca_code = static_cast<gr_complex*>(volk_malloc((GPS_L1_CA_CODE_LENGTH_CHIPS) * sizeof(gr_complex), volk_get_alignment()));

    multicorrelator_gpu = new cuda_multicorrelator();
-    int N_CORRELATORS=3;
+    int N_CORRELATORS = 3;
    //local code resampler on CPU (old)
    //multicorrelator_gpu->init_cuda(0, NULL, 2 * d_vector_length , 2 * d_vector_length , N_CORRELATORS);

--- a/src/algorithms/tracking/libs/cuda_multicorrelator.h
+++ b/src/algorithms/tracking/libs/cuda_multicorrelator.h
@ -45,65 +45,65 @@
 #endif

 #include <complex>
-
 #include <cuda.h>
-// CUDA runtime
 #include <cuda_runtime.h>

 // GPU new internal data types for complex numbers

-struct GPU_Complex {
+struct GPU_Complex
+{
    float r;
    float i;
    CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex() {};
    CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex( float a, float b ) : r(a), i(b) {}
-	 CUDA_CALLABLE_MEMBER_DEVICE float magnitude2( void ) {
-		 return r * r + i * i;
-	 }
-	 CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex operator*(const GPU_Complex& a) {
-		#ifdef __CUDACC__
-		 return GPU_Complex(__fmul_rn(r,a.r) - __fmul_rn(i,a.i), __fmul_rn(i,a.r) + __fmul_rn(r,a.i));
-		#else
+    CUDA_CALLABLE_MEMBER_DEVICE float magnitude2( void ) { return r * r + i * i; }
+    CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex operator*(const GPU_Complex& a)
+    {
+#ifdef __CUDACC__
+        return GPU_Complex(__fmul_rn(r, a.r) - __fmul_rn(i, a.i), __fmul_rn(i, a.r) + __fmul_rn(r, a.i));
+#else
        return GPU_Complex(r*a.r - i*a.i, i*a.r + r*a.i);
-		#endif
+#endif
    }
-	 CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex operator+(const GPU_Complex& a) {
-		 return GPU_Complex(r+a.r, i+a.i);
-	 }
-	 CUDA_CALLABLE_MEMBER_DEVICE void operator+=(const GPU_Complex& a) {
-		 r+=a.r;
-		 i+=a.i;
+    CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex operator+(const GPU_Complex& a)
+    {
+        return GPU_Complex(r + a.r, i + a.i);
    }
+    CUDA_CALLABLE_MEMBER_DEVICE void operator+=(const GPU_Complex& a) { r += a.r; i += a.i; }
    CUDA_CALLABLE_MEMBER_DEVICE void multiply_acc(const GPU_Complex& a, const GPU_Complex& b)
    {
        //c=a*b+c
        //real part
        //c.r=(a.r*b.r - a.i*b.i)+c.r
-		#ifdef __CUDACC__
-			 r=__fmaf_rn(a.r,b.r,r);
-			 r=__fmaf_rn(-a.i,b.i,r);
+#ifdef __CUDACC__
+        r = __fmaf_rn(a.r, b.r, r);
+        r = __fmaf_rn(-a.i, b.i, r);
        //imag part
-			 i=__fmaf_rn(a.i,b.r,i);
-			 i=__fmaf_rn(a.r,b.i,i);
-		#else
-			 r=(a.r*b.r - a.i*b.i)+r;
-			 i=(a.i*b.r - a.r*b.i)+i;
-		#endif
+        i = __fmaf_rn(a.i, b.r, i);
+        i = __fmaf_rn(a.r, b.i, i);
+#else
+        r = (a.r*b.r - a.i*b.i) + r;
+        i = (a.i*b.r + a.r*b.i) + i;  // imag(a*b) = a.i*b.r + a.r*b.i, same sign as the __fmaf_rn branch
+#endif

    }
 };

-struct GPU_Complex_Short {
+struct GPU_Complex_Short
+{
    float r;
    float i;
    CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex_Short( short int a, short int b ) : r(a), i(b) {}
-	 CUDA_CALLABLE_MEMBER_DEVICE float magnitude2( void ) {
+    CUDA_CALLABLE_MEMBER_DEVICE float magnitude2( void )
+    {
        return r * r + i * i;
    }
-	 CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex_Short operator*(const GPU_Complex_Short& a) {
+    CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex_Short operator*(const GPU_Complex_Short& a)
+    {
        return GPU_Complex_Short(r*a.r - i*a.i, i*a.r + r*a.i);
    }
-	 CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex_Short operator+(const GPU_Complex_Short& a) {
+    CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex_Short operator+(const GPU_Complex_Short& a)
+    {
        return GPU_Complex_Short(r+a.r, i+a.i);
    }
 };
@ -146,6 +146,7 @@ public:
            float rem_code_phase_chips,
            int signal_length_samples,
            int n_correlators);
+
 private:
    // Allocate the device input vectors
    GPU_Complex *d_sig_in;
@ -164,7 +165,6 @@ private:
    cudaStream_t stream2;
    int num_gpu_devices;
    int selected_device;
-
 };