Fix typo

2025-07-15 08:23:03 +00:00 · 2019-03-19 20:39:23 +01:00 · 2019-03-19 20:39:23 +01:00 · 10d73da839
commit 10d73da839
parent c8d27eb97c
9 changed files with 257 additions and 251 deletions
--- a/src/algorithms/tracking/libs/cpu_multicorrelator.cc
+++ b/src/algorithms/tracking/libs/cpu_multicorrelator.cc
@ -1,11 +1,11 @@
 /*!
 * \file cpu_multicorrelator.cc
- * \brief High optimized CPU vector multiTAP correlator class
+ * \brief Highly optimized CPU vector multiTAP correlator class
 * \authors <ul>
 *          <li> Javier Arribas, 2015. jarribas(at)cttc.es
 *          </ul>
 *
- * Class that implements a high optimized vector multiTAP correlator class for CPUs
+ * Class that implements a highly optimized vector multiTAP correlator class for CPUs
 *
 * -------------------------------------------------------------------------
 *
--- a/src/algorithms/tracking/libs/cpu_multicorrelator.h
+++ b/src/algorithms/tracking/libs/cpu_multicorrelator.h
@ -65,4 +65,4 @@ private:
 };
-#endif /* CPU_MULTICORRELATOR_H_ */
+#endif /* GNSS_SDR_CPU_MULTICORRELATOR_H_ */
--- a/src/algorithms/tracking/libs/cpu_multicorrelator_16sc.cc
+++ b/src/algorithms/tracking/libs/cpu_multicorrelator_16sc.cc
@ -1,11 +1,11 @@
 /*!
 * \file cpu_multicorrelator_16sc.cc
- * \brief High optimized CPU vector multiTAP correlator class
+ * \brief Highly optimized CPU vector multiTAP correlator class
 * \authors <ul>
 *          <li> Javier Arribas, 2015. jarribas(at)cttc.es
 *          </ul>
 *
- * Class that implements a high optimized vector multiTAP correlator class for CPUs
+ * Class that implements a highly optimized vector multiTAP correlator class for CPUs
 *
 * -------------------------------------------------------------------------
 *
--- a/src/algorithms/tracking/libs/cpu_multicorrelator_16sc.h
+++ b/src/algorithms/tracking/libs/cpu_multicorrelator_16sc.h
@ -1,11 +1,11 @@
 /*!
 * \file cpu_multicorrelator_16sc.h
- * \brief High optimized CPU vector multiTAP correlator class for lv_16sc_t (short int complex)
+ * \brief Highly optimized CPU vector multiTAP correlator class for lv_16sc_t (short int complex)
 * \authors <ul>
 *          <li> Javier Arribas, 2016. jarribas(at)cttc.es
 *          </ul>
 *
- * Class that implements a high optimized vector multiTAP correlator class for CPUs
+ * Class that implements a highly optimized vector multiTAP correlator class for CPUs
 *
 * -------------------------------------------------------------------------
 *
--- a/src/algorithms/tracking/libs/cpu_multicorrelator_real_codes.cc
+++ b/src/algorithms/tracking/libs/cpu_multicorrelator_real_codes.cc
@ -6,7 +6,7 @@
 *          <li> Cillian O'Driscoll, 2017. cillian.odriscoll(at)gmail.com
 *          </ul>
 *
- * Class that implements a high optimized vector multiTAP correlator class for CPUs
+ * Class that implements a highly optimized vector multiTAP correlator class for CPUs
 *
 * -------------------------------------------------------------------------
 *
--- a/src/algorithms/tracking/libs/cpu_multicorrelator_real_codes.h
+++ b/src/algorithms/tracking/libs/cpu_multicorrelator_real_codes.h
@ -6,7 +6,7 @@
 *          <li> Cillian O'Driscoll, 2017, cillian.odriscoll(at)gmail.com
 *          </ul>
 *
- * Class that implements a high optimized vector multiTAP correlator class for CPUs
+ * Class that implements a highly optimized vector multiTAP correlator class for CPUs
 *
 * -------------------------------------------------------------------------
 *
--- a/src/algorithms/tracking/libs/cuda_multicorrelator.cu
+++ b/src/algorithms/tracking/libs/cuda_multicorrelator.cu
@ -1,11 +1,11 @@
 /*!
 * \file cuda_multicorrelator.cu
- * \brief High optimized CUDA GPU vector multiTAP correlator class
+ * \brief Highly optimized CUDA GPU vector multiTAP correlator class
 * \authors <ul>
 *          <li> Javier Arribas, 2015. jarribas(at)cttc.es
 *          </ul>
 *
- * Class that implements a high optimized vector multiTAP correlator class for NVIDIA CUDA GPUs
+ * Class that implements a highly optimized vector multiTAP correlator class for NVIDIA CUDA GPUs
 *
 * -------------------------------------------------------------------------
 *
@ -33,9 +33,8 @@
 */
 #include "cuda_multicorrelator.h"
 #include <stdio.h>
 #include <iostream>
 #include <stdio.h>
 // For the CUDA runtime routines (prefixed with "cuda_")
 #include <cuda_runtime.h>
@ -53,22 +52,21 @@ __global__ void Doppler_wippe_scalarProdGPUCPXxN_shifts_chips(
    int vectorN,
    int elementN,
    float rem_carrier_phase_in_rad,
-    float phase_step_rad
+    float phase_step_rad)
 )
 {
    //Accumulators cache
    __shared__ GPU_Complex accumResult[ACCUM_N];
-	// CUDA version of floating point NCO and vector dot product integrated
+    // CUDA version of floating point NCO and vector dot product integrated
    float sin;
    float cos;
    for (int i = blockIdx.x * blockDim.x + threadIdx.x;
         i < elementN;
         i += blockDim.x * gridDim.x)
-    {
+        {
-    	__sincosf(rem_carrier_phase_in_rad + i*phase_step_rad, &sin, &cos);
+            __sincosf(rem_carrier_phase_in_rad + i * phase_step_rad, &sin, &cos);
-    	d_sig_wiped[i] =  d_sig_in[i] * GPU_Complex(cos,-sin);
+            d_sig_wiped[i] = d_sig_in[i] * GPU_Complex(cos, -sin);
-    }
+        }
    __syncthreads();
    ////////////////////////////////////////////////////////////////////////////
@ -77,273 +75,279 @@ __global__ void Doppler_wippe_scalarProdGPUCPXxN_shifts_chips(
    // from total number of thread blocks
    ////////////////////////////////////////////////////////////////////////////
    for (int vec = blockIdx.x; vec < vectorN; vec += gridDim.x)
    {
        //int vectorBase = IMUL(elementN, vec);
        //int vectorEnd  = elementN;
        ////////////////////////////////////////////////////////////////////////
        // Each accumulator cycles through vectors with
        // stride equal to number of total number of accumulators ACCUM_N
        // At this stage ACCUM_N is only preferred to be a multiple of warp size
        // to meet memory coalescing alignment constraints.
        ////////////////////////////////////////////////////////////////////////
        for (int iAccum = threadIdx.x; iAccum < ACCUM_N; iAccum += blockDim.x)
        {
-        	GPU_Complex sum = GPU_Complex(0,0);
+            //int vectorBase = IMUL(elementN, vec);
-            float local_code_chip_index=0.0;;
+            //int vectorEnd  = elementN;
            //float code_phase;
            for (int pos = iAccum; pos < elementN; pos += ACCUM_N)
            {
            	//original sample code
                //sum = sum + d_sig_in[pos-vectorBase] * d_nco_in[pos-vectorBase] * d_local_codes_in[pos];
            	//sum = sum + d_sig_in[pos-vectorBase] * d_local_codes_in[pos];
            	//sum.multiply_acc(d_sig_in[pos],d_local_codes_in[pos+d_shifts_samples[vec]]);
-            	//custom code for multitap correlator
+            ////////////////////////////////////////////////////////////////////////
-            	// 1.resample local code for the current shift
+            // Each accumulator cycles through vectors with
            // stride equal to number of total number of accumulators ACCUM_N
            // At this stage ACCUM_N is only preferred to be a multiple of warp size
            // to meet memory coalescing alignment constraints.
            ////////////////////////////////////////////////////////////////////////
            for (int iAccum = threadIdx.x; iAccum < ACCUM_N; iAccum += blockDim.x)
                {
                    GPU_Complex sum = GPU_Complex(0, 0);
                    float local_code_chip_index = 0.0;
                    ;
                    //float code_phase;
                    for (int pos = iAccum; pos < elementN; pos += ACCUM_N)
                        {
                            //original sample code
                            //sum = sum + d_sig_in[pos-vectorBase] * d_nco_in[pos-vectorBase] * d_local_codes_in[pos];
                            //sum = sum + d_sig_in[pos-vectorBase] * d_local_codes_in[pos];
                            //sum.multiply_acc(d_sig_in[pos],d_local_codes_in[pos+d_shifts_samples[vec]]);
-            	local_code_chip_index= fmodf(code_phase_step_chips*__int2float_rd(pos)+ d_shifts_chips[vec] - rem_code_phase_chips, code_length_chips);
+                            //custom code for multitap correlator
                            // 1.resample local code for the current shift
-            	//Take into account that in multitap correlators, the shifts can be negative!
+                            local_code_chip_index = fmodf(code_phase_step_chips * __int2float_rd(pos) + d_shifts_chips[vec] - rem_code_phase_chips, code_length_chips);
            	if (local_code_chip_index<0.0) local_code_chip_index+=(code_length_chips-1);
            	//printf("vec= %i, pos %i, chip_idx=%i chip_shift=%f \r\n",vec, pos,__float2int_rd(local_code_chip_index),local_code_chip_index);
            	// 2.correlate
            	sum.multiply_acc(d_sig_wiped[pos],d_local_code_in[__float2int_rd(local_code_chip_index)]);
-            }
+                            //Take into account that in multitap correlators, the shifts can be negative!
-            accumResult[iAccum] = sum;
+                            if (local_code_chip_index < 0.0) local_code_chip_index += (code_length_chips - 1);
                            //printf("vec= %i, pos %i, chip_idx=%i chip_shift=%f \r\n",vec, pos,__float2int_rd(local_code_chip_index),local_code_chip_index);
                            // 2.correlate
                            sum.multiply_acc(d_sig_wiped[pos], d_local_code_in[__float2int_rd(local_code_chip_index)]);
                        }
                    accumResult[iAccum] = sum;
                }
            ////////////////////////////////////////////////////////////////////////
            // Perform tree-like reduction of accumulators' results.
            // ACCUM_N has to be power of two at this stage
            ////////////////////////////////////////////////////////////////////////
            for (int stride = ACCUM_N / 2; stride > 0; stride >>= 1)
                {
                    __syncthreads();
                    for (int iAccum = threadIdx.x; iAccum < stride; iAccum += blockDim.x)
                        {
                            accumResult[iAccum] += accumResult[stride + iAccum];
                        }
                }
            if (threadIdx.x == 0)
                {
                    d_corr_out[vec] = accumResult[0];
                }
        }
        ////////////////////////////////////////////////////////////////////////
        // Perform tree-like reduction of accumulators' results.
        // ACCUM_N has to be power of two at this stage
        ////////////////////////////////////////////////////////////////////////
        for (int stride = ACCUM_N / 2; stride > 0; stride >>= 1)
        {
            __syncthreads();
            for (int iAccum = threadIdx.x; iAccum < stride; iAccum += blockDim.x)
            {
                accumResult[iAccum] += accumResult[stride + iAccum];
            }
        }
        if (threadIdx.x == 0)
        	{
        		d_corr_out[vec] = accumResult[0];
        	}
    }
 }
 bool cuda_multicorrelator::init_cuda_integrated_resampler(
-		int signal_length_samples,
+    int signal_length_samples,
-		int code_length_chips,
+    int code_length_chips,
-		int n_correlators
+    int n_correlators)
 		)
 {
-	// use command-line specified CUDA device, otherwise use device with highest Gflops/s
+    // use command-line specified CUDA device, otherwise use device with highest Gflops/s
-//	findCudaDevice(argc, (const char **)argv);
+    //	findCudaDevice(argc, (const char **)argv);
-      cudaDeviceProp  prop;
+    cudaDeviceProp prop;
    int num_devices, device;
    cudaGetDeviceCount(&num_devices);
-    if (num_devices > 1) {
+    if (num_devices > 1)
-          int max_multiprocessors = 0, max_device = 0;
+        {
-          for (device = 0; device < num_devices; device++) {
+            int max_multiprocessors = 0, max_device = 0;
-                  cudaDeviceProp properties;
+            for (device = 0; device < num_devices; device++)
-                  cudaGetDeviceProperties(&properties, device);
+                {
-                  if (max_multiprocessors < properties.multiProcessorCount) {
+                    cudaDeviceProp properties;
-                          max_multiprocessors = properties.multiProcessorCount;
+                    cudaGetDeviceProperties(&properties, device);
-                          max_device = device;
+                    if (max_multiprocessors < properties.multiProcessorCount)
-                  }
+                        {
-                  printf("Found GPU device # %i\n",device);
+                            max_multiprocessors = properties.multiProcessorCount;
-          }
+                            max_device = device;
-          //cudaSetDevice(max_device);
+                        }
                    printf("Found GPU device # %i\n", device);
                }
            //cudaSetDevice(max_device);
-          //set random device!
+            //set random device!
-	  selected_gps_device=rand() % num_devices;//generates a random number between 0 and num_devices to split the threads between GPUs
+            selected_gps_device = rand() % num_devices;  //generates a random number between 0 and num_devices to split the threads between GPUs
-          cudaSetDevice(selected_gps_device); 
+            cudaSetDevice(selected_gps_device);
-          cudaGetDeviceProperties( &prop, max_device );
+            cudaGetDeviceProperties(&prop, max_device);
-          //debug code
+            //debug code
-          if (prop.canMapHostMemory != 1) {
+            if (prop.canMapHostMemory != 1)
-              printf( "Device can not map memory.\n" );
+                {
-          }
+                    printf("Device can not map memory.\n");
-          printf("L2 Cache size= %u \n",prop.l2CacheSize);
+                }
-          printf("maxThreadsPerBlock= %u \n",prop.maxThreadsPerBlock);
+            printf("L2 Cache size= %u \n", prop.l2CacheSize);
-          printf("maxGridSize= %i \n",prop.maxGridSize[0]);
+            printf("maxThreadsPerBlock= %u \n", prop.maxThreadsPerBlock);
-          printf("sharedMemPerBlock= %lu \n",prop.sharedMemPerBlock);
+            printf("maxGridSize= %i \n", prop.maxGridSize[0]);
-          printf("deviceOverlap= %i \n",prop.deviceOverlap);
+            printf("sharedMemPerBlock= %lu \n", prop.sharedMemPerBlock);
-  	    printf("multiProcessorCount= %i \n",prop.multiProcessorCount);
+            printf("deviceOverlap= %i \n", prop.deviceOverlap);
-    }else{
+            printf("multiProcessorCount= %i \n", prop.multiProcessorCount);
-    	    cudaGetDevice( &selected_gps_device);
+        }
-    	    cudaGetDeviceProperties( &prop, selected_gps_device );
+    else
-    	    //debug code
+        {
-    	    if (prop.canMapHostMemory != 1) {
+            cudaGetDevice(&selected_gps_device);
-    	        printf( "Device can not map memory.\n" );
+            cudaGetDeviceProperties(&prop, selected_gps_device);
-    	    }
+            //debug code
            if (prop.canMapHostMemory != 1)
                {
                    printf("Device can not map memory.\n");
                }
-    	    printf("L2 Cache size= %u \n",prop.l2CacheSize);
+            printf("L2 Cache size= %u \n", prop.l2CacheSize);
-    	    printf("maxThreadsPerBlock= %u \n",prop.maxThreadsPerBlock);
+            printf("maxThreadsPerBlock= %u \n", prop.maxThreadsPerBlock);
-    	    printf("maxGridSize= %i \n",prop.maxGridSize[0]);
+            printf("maxGridSize= %i \n", prop.maxGridSize[0]);
-    	    printf("sharedMemPerBlock= %lu \n",prop.sharedMemPerBlock);
+            printf("sharedMemPerBlock= %lu \n", prop.sharedMemPerBlock);
-    	    printf("deviceOverlap= %i \n",prop.deviceOverlap);
+            printf("deviceOverlap= %i \n", prop.deviceOverlap);
-    	    printf("multiProcessorCount= %i \n",prop.multiProcessorCount);
+            printf("multiProcessorCount= %i \n", prop.multiProcessorCount);
-    }
+        }
-	// (cudaFuncSetCacheConfig(CUDA_32fc_x2_multiply_x2_dot_prod_32fc_, cudaFuncCachePreferShared));
+    // (cudaFuncSetCacheConfig(CUDA_32fc_x2_multiply_x2_dot_prod_32fc_, cudaFuncCachePreferShared));
    // ALLOCATE GPU MEMORY FOR INPUT/OUTPUT and INTERNAL vectors
    size_t size = signal_length_samples * sizeof(GPU_Complex);
-	//********* ZERO COPY VERSION ************
+    //********* ZERO COPY VERSION ************
-	// Set flag to enable zero copy access
+    // Set flag to enable zero copy access
    // Optimal in shared memory devices (like Jetson K1)
-	//cudaSetDeviceFlags(cudaDeviceMapHost);
+    //cudaSetDeviceFlags(cudaDeviceMapHost);
-	//******** CudaMalloc version ***********
+    //******** CudaMalloc version ***********
-	// input signal GPU memory (can be mapped to CPU memory in shared memory devices!)
+    // input signal GPU memory (can be mapped to CPU memory in shared memory devices!)
-	//	cudaMalloc((void **)&d_sig_in, size);
+    //	cudaMalloc((void **)&d_sig_in, size);
-	//	cudaMemset(d_sig_in,0,size);
+    //	cudaMemset(d_sig_in,0,size);
-	// Doppler-free signal (internal GPU memory)
+    // Doppler-free signal (internal GPU memory)
-	cudaMalloc((void **)&d_sig_doppler_wiped, size);
+    cudaMalloc((void **)&d_sig_doppler_wiped, size);
-	cudaMemset(d_sig_doppler_wiped,0,size);
+    cudaMemset(d_sig_doppler_wiped, 0, size);
-	// Local code GPU memory (can be mapped to CPU memory in shared memory devices!)
+    // Local code GPU memory (can be mapped to CPU memory in shared memory devices!)
-	cudaMalloc((void **)&d_local_codes_in, sizeof(std::complex<float>)*code_length_chips);
+    cudaMalloc((void **)&d_local_codes_in, sizeof(std::complex<float>) * code_length_chips);
-	cudaMemset(d_local_codes_in,0,sizeof(std::complex<float>)*code_length_chips);
+    cudaMemset(d_local_codes_in, 0, sizeof(std::complex<float>) * code_length_chips);
-    d_code_length_chips=code_length_chips;
+    d_code_length_chips = code_length_chips;
-	// Vector with the chip shifts for each correlator tap
+    // Vector with the chip shifts for each correlator tap
    //GPU memory (can be mapped to CPU memory in shared memory devices!)
-	cudaMalloc((void **)&d_shifts_chips, sizeof(float)*n_correlators);
+    cudaMalloc((void **)&d_shifts_chips, sizeof(float) * n_correlators);
-	cudaMemset(d_shifts_chips,0,sizeof(float)*n_correlators);
+    cudaMemset(d_shifts_chips, 0, sizeof(float) * n_correlators);
-	//scalars
+    //scalars
-	//cudaMalloc((void **)&d_corr_out, sizeof(std::complex<float>)*n_correlators);
+    //cudaMalloc((void **)&d_corr_out, sizeof(std::complex<float>)*n_correlators);
-	//cudaMemset(d_corr_out,0,sizeof(std::complex<float>)*n_correlators);
+    //cudaMemset(d_corr_out,0,sizeof(std::complex<float>)*n_correlators);
    // Launch the Vector Add CUDA Kernel
    // TODO: write a smart load balance using device info!
-	threadsPerBlock = 64;
+    threadsPerBlock = 64;
-    blocksPerGrid = 128;//(int)(signal_length_samples+threadsPerBlock-1)/threadsPerBlock;
+    blocksPerGrid = 128;  //(int)(signal_length_samples+threadsPerBlock-1)/threadsPerBlock;
-	cudaStreamCreate (&stream1) ;
+    cudaStreamCreate(&stream1);
-	//cudaStreamCreate (&stream2) ;
+    //cudaStreamCreate (&stream2) ;
-	return true;
+    return true;
 }
 bool cuda_multicorrelator::set_local_code_and_taps(
-		int code_length_chips,
+    int code_length_chips,
-		const std::complex<float>* local_codes_in,
+    const std::complex<float> *local_codes_in,
-		float *shifts_chips,
+    float *shifts_chips,
-		int n_correlators
+    int n_correlators)
 		)
 {
    cudaSetDevice(selected_gps_device);
    //********* ZERO COPY VERSION ************
    //	// Get device pointer from host memory. No allocation or memcpy
    //	cudaError_t code;
    //	// local code CPU -> GPU copy memory
    //	code=cudaHostGetDevicePointer((void **)&d_local_codes_in,  (void *) local_codes_in, 0);
    //	if (code!=cudaSuccess)
    //	{
    //		printf("cuda cudaHostGetDevicePointer error in set_local_code_and_taps \r\n");
    //	}
    //	// Correlator shifts vector CPU -> GPU copy memory (fractional chip shifts are allowed!)
    //	code=cudaHostGetDevicePointer((void **)&d_shifts_chips,  (void *) shifts_chips, 0);
    //	if (code!=cudaSuccess)
    //	{
    //		printf("cuda cudaHostGetDevicePointer error in set_local_code_and_taps \r\n");
    //	}
-          cudaSetDevice(selected_gps_device);
+    //******** CudaMalloc version ***********
 	//********* ZERO COPY VERSION ************
 //	// Get device pointer from host memory. No allocation or memcpy
 //	cudaError_t code;
 //	// local code CPU -> GPU copy memory
 //	code=cudaHostGetDevicePointer((void **)&d_local_codes_in,  (void *) local_codes_in, 0);
 //	if (code!=cudaSuccess)
 //	{
 //		printf("cuda cudaHostGetDevicePointer error in set_local_code_and_taps \r\n");
 //	}
 //	// Correlator shifts vector CPU -> GPU copy memory (fractional chip shifts are allowed!)
 //	code=cudaHostGetDevicePointer((void **)&d_shifts_chips,  (void *) shifts_chips, 0);
 //	if (code!=cudaSuccess)
 //	{
 //		printf("cuda cudaHostGetDevicePointer error in set_local_code_and_taps \r\n");
 //	}
 	//******** CudaMalloc version ***********
    //local code CPU -> GPU copy memory
-    cudaMemcpyAsync(d_local_codes_in, local_codes_in, sizeof(GPU_Complex)*code_length_chips, cudaMemcpyHostToDevice,stream1);
+    cudaMemcpyAsync(d_local_codes_in, local_codes_in, sizeof(GPU_Complex) * code_length_chips, cudaMemcpyHostToDevice, stream1);
-    d_code_length_chips=code_length_chips;
+    d_code_length_chips = code_length_chips;
    //Correlator shifts vector CPU -> GPU copy memory (fractional chip shifts are allowed!)
-    cudaMemcpyAsync(d_shifts_chips, shifts_chips, sizeof(float)*n_correlators,
+    cudaMemcpyAsync(d_shifts_chips, shifts_chips, sizeof(float) * n_correlators,
-                                    cudaMemcpyHostToDevice,stream1);
+        cudaMemcpyHostToDevice, stream1);
-	return true;
+    return true;
 }
 bool cuda_multicorrelator::set_input_output_vectors(
-		std::complex<float>* corr_out,
+    std::complex<float> *corr_out,
-		std::complex<float>* sig_in
+    std::complex<float> *sig_in)
 		)
 {
    cudaSetDevice(selected_gps_device);
    // Save CPU pointers
    d_sig_in_cpu = sig_in;
    d_corr_out_cpu = corr_out;
-         cudaSetDevice(selected_gps_device);
+    // Zero Copy version
-	// Save CPU pointers
+    // Get device pointer from host memory. No allocation or memcpy
-	d_sig_in_cpu =sig_in;
+    cudaError_t code;
-	d_corr_out_cpu = corr_out;
+    code = cudaHostGetDevicePointer((void **)&d_sig_in, (void *)sig_in, 0);
-
+    code = cudaHostGetDevicePointer((void **)&d_corr_out, (void *)corr_out, 0);
-	// Zero Copy version
+    if (code != cudaSuccess)
-	// Get device pointer from host memory. No allocation or memcpy
+        {
-	cudaError_t code;
+            printf("cuda cudaHostGetDevicePointer error \r\n");
-	code=cudaHostGetDevicePointer((void **)&d_sig_in,  (void *) sig_in, 0);
+        }
-	code=cudaHostGetDevicePointer((void **)&d_corr_out,  (void *) corr_out, 0);
+    return true;
 	if (code!=cudaSuccess)
 	{
 		printf("cuda cudaHostGetDevicePointer error \r\n");
 	}
 	return true;
 }
-#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
+#define gpuErrchk(ans)                        \
-inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
+    {                                         \
        gpuAssert((ans), __FILE__, __LINE__); \
    }
 inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
 {
-   if (code != cudaSuccess)
+    if (code != cudaSuccess)
-   {
+        {
-      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
+            fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
-      if (abort) exit(code);
+            if (abort) exit(code);
-   }
+        }
 }
 bool cuda_multicorrelator::Carrier_wipeoff_multicorrelator_resampler_cuda(
-		float rem_carrier_phase_in_rad,
+    float rem_carrier_phase_in_rad,
-		float phase_step_rad,
+    float phase_step_rad,
-        float code_phase_step_chips,
+    float code_phase_step_chips,
-        float rem_code_phase_chips,
+    float rem_code_phase_chips,
-		int signal_length_samples,
+    int signal_length_samples,
-		int n_correlators)
+    int n_correlators)
-	{
+{
-
+    cudaSetDevice(selected_gps_device);
-    cudaSetDevice(selected_gps_device); 
+    // cudaMemCpy version
-	// cudaMemCpy version
+    //size_t memSize = signal_length_samples * sizeof(std::complex<float>);
-	//size_t memSize = signal_length_samples * sizeof(std::complex<float>);
+    // input signal CPU -> GPU copy memory
 	// input signal CPU -> GPU copy memory
    //cudaMemcpyAsync(d_sig_in, d_sig_in_cpu, memSize,
    //                               cudaMemcpyHostToDevice, stream2);
    //***** NOTICE: NCO is computed on-the-fly, no need to copy NCO into GPU! ****
    //launch the multitap correlator with integrated local code resampler!
-    Doppler_wippe_scalarProdGPUCPXxN_shifts_chips<<<blocksPerGrid, threadsPerBlock,0 ,stream1>>>(
+    Doppler_wippe_scalarProdGPUCPXxN_shifts_chips<<<blocksPerGrid, threadsPerBlock, 0, stream1>>>(
-			d_corr_out,
+        d_corr_out,
-			d_sig_in,
+        d_sig_in,
-			d_sig_doppler_wiped,
+        d_sig_doppler_wiped,
-			d_local_codes_in,
+        d_local_codes_in,
-			d_shifts_chips,
+        d_shifts_chips,
-			d_code_length_chips,
+        d_code_length_chips,
-	        code_phase_step_chips,
+        code_phase_step_chips,
-	        rem_code_phase_chips,
+        rem_code_phase_chips,
-			n_correlators,
+        n_correlators,
-			signal_length_samples,
+        signal_length_samples,
-			rem_carrier_phase_in_rad,
+        rem_carrier_phase_in_rad,
-			phase_step_rad
+        phase_step_rad);
 			);
-    gpuErrchk( cudaPeekAtLastError() );
+    gpuErrchk(cudaPeekAtLastError());
-    gpuErrchk( cudaStreamSynchronize(stream1));
+    gpuErrchk(cudaStreamSynchronize(stream1));
-	// cudaMemCpy version
+    // cudaMemCpy version
    // Copy the device result vector in device memory to the host result vector
    // in host memory.
    //scalar products (correlators outputs)
@ -352,37 +356,38 @@ bool cuda_multicorrelator::Carrier_wipeoff_multicorrelator_resampler_cuda(
    return true;
 }
 cuda_multicorrelator::cuda_multicorrelator()
 {
-	d_sig_in=NULL;
+    d_sig_in = NULL;
-	d_nco_in=NULL;
+    d_nco_in = NULL;
-	d_sig_doppler_wiped=NULL;
+    d_sig_doppler_wiped = NULL;
-	d_local_codes_in=NULL;
+    d_local_codes_in = NULL;
-	d_shifts_samples=NULL;
+    d_shifts_samples = NULL;
-	d_shifts_chips=NULL;
+    d_shifts_chips = NULL;
-	d_corr_out=NULL;
+    d_corr_out = NULL;
-	threadsPerBlock=0;
+    threadsPerBlock = 0;
-	blocksPerGrid=0;
+    blocksPerGrid = 0;
-	d_code_length_chips=0;
+    d_code_length_chips = 0;
 }
 bool cuda_multicorrelator::free_cuda()
 {
-	// Free device global memory
+    // Free device global memory
-	if (d_sig_in!=NULL) cudaFree(d_sig_in);
+    if (d_sig_in != NULL) cudaFree(d_sig_in);
-	if (d_nco_in!=NULL) cudaFree(d_nco_in);
+    if (d_nco_in != NULL) cudaFree(d_nco_in);
-	if (d_sig_doppler_wiped!=NULL) cudaFree(d_sig_doppler_wiped);
+    if (d_sig_doppler_wiped != NULL) cudaFree(d_sig_doppler_wiped);
-	if (d_local_codes_in!=NULL) cudaFree(d_local_codes_in);
+    if (d_local_codes_in != NULL) cudaFree(d_local_codes_in);
-	if (d_corr_out!=NULL) cudaFree(d_corr_out);
+    if (d_corr_out != NULL) cudaFree(d_corr_out);
-	if (d_shifts_samples!=NULL) cudaFree(d_shifts_samples);
+    if (d_shifts_samples != NULL) cudaFree(d_shifts_samples);
-	if (d_shifts_chips!=NULL) cudaFree(d_shifts_chips);
+    if (d_shifts_chips != NULL) cudaFree(d_shifts_chips);
    // Reset the device and exit
    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice.  It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits
-	cudaDeviceReset();
+    cudaDeviceReset();
-	return true;
+    return true;
 }
--- a/src/algorithms/tracking/libs/cuda_multicorrelator.h
+++ b/src/algorithms/tracking/libs/cuda_multicorrelator.h
@ -1,11 +1,11 @@
 /*!
 * \file cuda_multicorrelator.h
- * \brief High optimized CUDA GPU vector multiTAP correlator class
+ * \brief Highly optimized CUDA GPU vector multiTAP correlator class
 * \authors <ul>
 *          <li> Javier Arribas, 2015. jarribas(at)cttc.es
 *          </ul>
 *
- * Class that implements a high optimized vector multiTAP correlator class for NVIDIA CUDA GPUs
+ * Class that implements a highly optimized vector multiTAP correlator class for NVIDIA CUDA GPUs
 *
 * -------------------------------------------------------------------------
 *
@ -92,6 +92,7 @@ struct GPU_Complex
    }
 };
 struct GPU_Complex_Short
 {
    float r;
@ -149,7 +150,6 @@ private:
    GPU_Complex* d_local_codes_in;
    GPU_Complex* d_corr_out;
    //
    std::complex<float>* d_sig_in_cpu;
    std::complex<float>* d_corr_out_cpu;
--- a/src/algorithms/tracking/libs/tracking_discriminators.cc
+++ b/src/algorithms/tracking/libs/tracking_discriminators.cc
@ -36,6 +36,7 @@
 #include <cmath>
 //  All the outputs are in RADIANS
 /*
 * FLL four quadrant arctan discriminator:
 * \f{equation}
@ -45,7 +46,6 @@
 * \f$I_{PS1},Q_{PS1}\f$ are the inphase and quadrature prompt correlator outputs respectively at sample time \f$t_1\f$, and
 * \f$I_{PS2},Q_{PS2}\f$ are the inphase and quadrature prompt correlator outputs respectively at sample time \f$t_2\f$. The output is in [radians/second].
 */
 double fll_four_quadrant_atan(gr_complex prompt_s1, gr_complex prompt_s2, double t1, double t2)
 {
    double cross, dot;
@ -105,6 +105,7 @@ double dll_nc_e_minus_l_normalized(gr_complex early_s1, gr_complex late_s1)
    return 0.5 * (P_early - P_late) / (P_early + P_late);
 }
 /*
 * DLL Noncoherent Very Early Minus Late Power (VEMLP) normalized discriminator, using the outputs
 * of four correlators, Very Early (VE), Early (E), Late (L) and Very Late (VL):
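--- a/src/algorithms/tracking/libs/cpu_multicorrelator.h
+++ b/src/algorithms/tracking/libs/cpu_multicorrelator.h
@ -65,4 +65,4 @@ private:
 };
-#endif /* CPU_MULTICORRELATOR_H_ */
+#endif /* GNSS_SDR_CPU_MULTICORRELATOR_H_ */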