Mirror of https://github.com/gnss-sdr/gnss-sdr (synced 2025-10-31 15:23:04 +00:00)

Pinned memory (zero copy): huge improvement for GPU tracking.
Author: Javier Arribas
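
The change replaces explicit cudaMemcpy transfers of the tracking input samples and correlator outputs with zero-copy access to pinned (page-locked) host memory: the device flag cudaDeviceMapHost is set at init time, and set_input_output_vectors() obtains device-side aliases of the host buffers with cudaHostGetDevicePointer(), so the kernels read and write host memory directly. Below is a minimal, self-contained sketch of that pattern, not the gnss-sdr code itself; the scale() kernel and buffer names are illustrative, and it assumes a GPU that supports host-mapped memory (the shared-memory devices this commit targets).

    // Zero-copy pinned memory sketch (hypothetical example, not part of this commit).
    #include <cuda_runtime.h>
    #include <cstdio>

    __global__ void scale(const float *in, float *out, float gain, int n)
    {
        for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x)
            out[i] = gain * in[i];    // reads/writes go straight to the mapped host buffers
    }

    int main()
    {
        const int n = 1 << 16;

        // Must be set before any mapped pinned allocation
        // (the commit does this in init_cuda_integrated_resampler).
        cudaSetDeviceFlags(cudaDeviceMapHost);

        // Page-locked host buffers, mapped into the GPU address space.
        float *h_in = NULL, *h_out = NULL;
        cudaHostAlloc((void **)&h_in,  n * sizeof(float), cudaHostAllocMapped);
        cudaHostAlloc((void **)&h_out, n * sizeof(float), cudaHostAllocMapped);
        for (int i = 0; i < n; i++) h_in[i] = (float)i;

        // Device-side aliases of the host buffers: no cudaMalloc, no cudaMemcpy
        // (this is what set_input_output_vectors() does for sig_in and corr_out).
        float *d_in = NULL, *d_out = NULL;
        cudaHostGetDevicePointer((void **)&d_in,  (void *)h_in,  0);
        cudaHostGetDevicePointer((void **)&d_out, (void *)h_out, 0);

        scale<<<(n + 255) / 256, 256>>>(d_in, d_out, 2.0f, n);
        cudaDeviceSynchronize();          // results are then already visible in h_out
        printf("h_out[10] = %f\n", h_out[10]);

        cudaFreeHost(h_in);
        cudaFreeHost(h_out);
        return 0;
    }

On a device where CPU and GPU share the same physical memory this removes the per-iteration host-to-device and device-to-host copies entirely; on a discrete GPU the same calls work, but each access then crosses the PCIe bus, so the benefit is hardware dependent.
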
		| @@ -128,13 +128,9 @@ private: | ||||
|  | ||||
|     //GPU HOST PINNED MEMORY IN/OUT VECTORS | ||||
|     gr_complex* in_gpu; | ||||
|     gr_complex* d_carr_sign_gpu; | ||||
|     gr_complex* d_local_codes_gpu; | ||||
|     float* d_local_code_shift_chips; | ||||
|     gr_complex* d_corr_outs_gpu; | ||||
|     cuda_multicorrelator *multicorrelator_gpu; | ||||
|  | ||||
|  | ||||
|     gr_complex* d_ca_code; | ||||
|  | ||||
|     gr_complex *d_Early; | ||||
|   | ||||
| @@ -32,26 +32,14 @@ | ||||
|  * ------------------------------------------------------------------------- | ||||
|  */ | ||||
|  | ||||
| /////////////////////////////////////////////////////////////////////////////// | ||||
| // On G80-class hardware 24-bit multiplication takes 4 clocks per warp | ||||
| // (the same as for floating point  multiplication and addition), | ||||
| // whereas full 32-bit multiplication takes 16 clocks per warp. | ||||
| // So if integer multiplication operands are  guaranteed to fit into 24 bits | ||||
| // (always lie within [-8M, 8M - 1] range in signed case), | ||||
| // explicit 24-bit multiplication is preferred for performance. | ||||
| /////////////////////////////////////////////////////////////////////////////// | ||||
| #define IMUL(a, b) __mul24(a, b) | ||||
|  | ||||
| #include "cuda_multicorrelator.h" | ||||
|  | ||||
| #include <stdio.h> | ||||
|  | ||||
| #include <iostream> | ||||
| // For the CUDA runtime routines (prefixed with "cuda_") | ||||
| #include <cuda_runtime.h> | ||||
|  | ||||
|  | ||||
| #define ACCUM_N 256 | ||||
|  | ||||
| #define ACCUM_N 128 | ||||
|  | ||||
| __global__ void scalarProdGPUCPXxN_shifts_chips( | ||||
|     GPU_Complex *d_corr_out, | ||||
| @@ -90,15 +78,17 @@ __global__ void scalarProdGPUCPXxN_shifts_chips( | ||||
|  | ||||
|             for (int pos = iAccum; pos < elementN; pos += ACCUM_N) | ||||
|             { | ||||
|             	//original sample code | ||||
|                 //sum = sum + d_sig_in[pos-vectorBase] * d_nco_in[pos-vectorBase] * d_local_codes_in[pos]; | ||||
|             	//sum = sum + d_sig_in[pos-vectorBase] * d_local_codes_in[pos]; | ||||
|             	//sum.multiply_acc(d_sig_in[pos],d_local_codes_in[pos+d_shifts_samples[vec]]); | ||||
|  | ||||
|             	//custom code for multitap correlator | ||||
|             	// 1.resample local code for the current shift | ||||
|             	float local_code_chip_index= fmod(code_phase_step_chips*(float)pos + d_shifts_chips[vec] - rem_code_phase_chips, code_length_chips); | ||||
|             	//TODO: Take into account that in multitap correlators, the shifts can be negative! | ||||
|             	//Take into account that in multitap correlators, the shifts can be negative! | ||||
|             	if (local_code_chip_index<0.0) local_code_chip_index+=code_length_chips; | ||||
|  | ||||
|             	//printf("vec= %i, pos %i, chip_idx=%i chip_shift=%f \r\n",vec, pos,__float2int_rd(local_code_chip_index),local_code_chip_index); | ||||
|             	// 2.correlate | ||||
|             	sum.multiply_acc(d_sig_in[pos],d_local_code_in[__float2int_rd(local_code_chip_index)]); | ||||
|  | ||||
| @@ -127,163 +117,6 @@ __global__ void scalarProdGPUCPXxN_shifts_chips( | ||||
|     } | ||||
| } | ||||
|  | ||||
|  | ||||
| /////////////////////////////////////////////////////////////////////////////// | ||||
| // Calculate scalar products of VectorN vectors of ElementN elements on GPU | ||||
| // Parameters restrictions: | ||||
| // 1) ElementN is strongly preferred to be a multiple of warp size to | ||||
| //    meet alignment constraints of memory coalescing. | ||||
| // 2) ACCUM_N must be a power of two. | ||||
| /////////////////////////////////////////////////////////////////////////////// | ||||
|  | ||||
|  | ||||
| __global__ void scalarProdGPUCPXxN_shifts( | ||||
|     GPU_Complex *d_corr_out, | ||||
|     GPU_Complex *d_sig_in, | ||||
|     GPU_Complex *d_local_codes_in, | ||||
|     int *d_shifts_samples, | ||||
|     int vectorN, | ||||
|     int elementN | ||||
| ) | ||||
| { | ||||
|     //Accumulators cache | ||||
|     __shared__ GPU_Complex accumResult[ACCUM_N]; | ||||
|  | ||||
|     //////////////////////////////////////////////////////////////////////////// | ||||
|     // Cycle through every pair of vectors, | ||||
|     // taking into account that vector counts can be different | ||||
|     // from total number of thread blocks | ||||
|     //////////////////////////////////////////////////////////////////////////// | ||||
|     for (int vec = blockIdx.x; vec < vectorN; vec += gridDim.x) | ||||
|     { | ||||
|         int vectorBase = IMUL(elementN, vec); | ||||
|         int vectorEnd  = vectorBase + elementN; | ||||
|  | ||||
|         //////////////////////////////////////////////////////////////////////// | ||||
|         // Each accumulator cycles through vectors with | ||||
|         // stride equal to the total number of accumulators (ACCUM_N). | ||||
|         // At this stage ACCUM_N is only preferred to be a multiple of the warp size | ||||
|         // to meet memory coalescing alignment constraints. | ||||
|         //////////////////////////////////////////////////////////////////////// | ||||
|         for (int iAccum = threadIdx.x; iAccum < ACCUM_N; iAccum += blockDim.x) | ||||
|         { | ||||
|         	GPU_Complex sum = GPU_Complex(0,0); | ||||
|  | ||||
|             for (int pos = vectorBase + iAccum; pos < vectorEnd; pos += ACCUM_N) | ||||
|             { | ||||
|                 //sum = sum + d_sig_in[pos-vectorBase] * d_nco_in[pos-vectorBase] * d_local_codes_in[pos]; | ||||
|             	//sum = sum + d_sig_in[pos-vectorBase] * d_local_codes_in[pos]; | ||||
|             	sum.multiply_acc(d_sig_in[pos-vectorBase],d_local_codes_in[pos-vectorBase+d_shifts_samples[vec]]); | ||||
|             } | ||||
|             accumResult[iAccum] = sum; | ||||
|         } | ||||
|  | ||||
|         //////////////////////////////////////////////////////////////////////// | ||||
|         // Perform tree-like reduction of accumulators' results. | ||||
|         // ACCUM_N has to be power of two at this stage | ||||
|         //////////////////////////////////////////////////////////////////////// | ||||
|         for (int stride = ACCUM_N / 2; stride > 0; stride >>= 1) | ||||
|         { | ||||
|             __syncthreads(); | ||||
|  | ||||
|             for (int iAccum = threadIdx.x; iAccum < stride; iAccum += blockDim.x) | ||||
|             { | ||||
|                 accumResult[iAccum] += accumResult[stride + iAccum]; | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         if (threadIdx.x == 0) | ||||
|         	{ | ||||
|         		d_corr_out[vec] = accumResult[0]; | ||||
|         	} | ||||
|     } | ||||
| } | ||||
|  | ||||
|  | ||||
| __global__ void scalarProdGPUCPXxN( | ||||
|     GPU_Complex *d_corr_out, | ||||
|     GPU_Complex *d_sig_in, | ||||
|     GPU_Complex *d_local_codes_in, | ||||
|     int vectorN, | ||||
|     int elementN | ||||
| ) | ||||
| { | ||||
|     //Accumulators cache | ||||
|     __shared__ GPU_Complex accumResult[ACCUM_N]; | ||||
|  | ||||
|     //////////////////////////////////////////////////////////////////////////// | ||||
|     // Cycle through every pair of vectors, | ||||
|     // taking into account that vector counts can be different | ||||
|     // from total number of thread blocks | ||||
|     //////////////////////////////////////////////////////////////////////////// | ||||
|     for (int vec = blockIdx.x; vec < vectorN; vec += gridDim.x) | ||||
|     { | ||||
|         //int vectorBase = IMUL(elementN, vec); | ||||
|         //int vectorEnd  = vectorBase + elementN; | ||||
|  | ||||
|         //////////////////////////////////////////////////////////////////////// | ||||
|         // Each accumulator cycles through vectors with | ||||
|         // stride equal to the total number of accumulators (ACCUM_N). | ||||
|         // At this stage ACCUM_N is only preferred to be a multiple of the warp size | ||||
|         // to meet memory coalescing alignment constraints. | ||||
|         //////////////////////////////////////////////////////////////////////// | ||||
|         for (int iAccum = threadIdx.x; iAccum < ACCUM_N; iAccum += blockDim.x) | ||||
|         { | ||||
|         	GPU_Complex sum = GPU_Complex(0,0); | ||||
|  | ||||
|             //for (int pos = vectorBase + iAccum; pos < vectorEnd; pos += ACCUM_N) | ||||
|         	for (int pos = iAccum; pos < elementN; pos += ACCUM_N) | ||||
|             { | ||||
|                 //sum = sum + d_sig_in[pos-vectorBase] * d_nco_in[pos-vectorBase] * d_local_codes_in[pos]; | ||||
|             	//sum = sum + d_sig_in[pos-vectorBase] * d_local_codes_in[pos]; | ||||
|             	//sum.multiply_acc(d_sig_in[pos-vectorBase],d_local_codes_in[pos]); | ||||
|         		sum.multiply_acc(d_sig_in[pos],d_local_codes_in[pos]); | ||||
|             } | ||||
|             accumResult[iAccum] = sum; | ||||
|         } | ||||
|  | ||||
|         //////////////////////////////////////////////////////////////////////// | ||||
|         // Perform tree-like reduction of accumulators' results. | ||||
|         // ACCUM_N has to be power of two at this stage | ||||
|         //////////////////////////////////////////////////////////////////////// | ||||
|         for (int stride = ACCUM_N / 2; stride > 0; stride >>= 1) | ||||
|         { | ||||
|             __syncthreads(); | ||||
|  | ||||
|             for (int iAccum = threadIdx.x; iAccum < stride; iAccum += blockDim.x) | ||||
|             { | ||||
|                 accumResult[iAccum] += accumResult[stride + iAccum]; | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         if (threadIdx.x == 0) | ||||
|         	{ | ||||
|         		d_corr_out[vec] = accumResult[0]; | ||||
|         	} | ||||
|     } | ||||
| } | ||||
|  | ||||
|  | ||||
| //*********** CUDA processing ************** | ||||
| // Threads: the minimal unit of parallel execution on the GPU | ||||
| // Blocks: a set of N threads | ||||
| /** | ||||
|  * CUDA Kernel Device code | ||||
|  * | ||||
|  * Computes the element-wise product of A and B into C. The 3 vectors have the same | ||||
|  * number of elements numElements. | ||||
|  */ | ||||
| __global__ void CUDA_32fc_x2_multiply_32fc(  GPU_Complex *A,   GPU_Complex  *B, GPU_Complex  *C, int numElements) | ||||
| { | ||||
|     for (int i = blockIdx.x * blockDim.x + threadIdx.x; | ||||
|          i < numElements; | ||||
|          i += blockDim.x * gridDim.x) | ||||
|     { | ||||
|         C[i] =  A[i] * B[i]; | ||||
|     } | ||||
| } | ||||
|  | ||||
|  | ||||
| /** | ||||
|  * CUDA Kernel Device code | ||||
|  * | ||||
| @@ -292,21 +125,7 @@ __global__ void CUDA_32fc_x2_multiply_32fc(  GPU_Complex *A,   GPU_Complex  *B, | ||||
| __global__ void | ||||
| CUDA_32fc_Doppler_wipeoff(  GPU_Complex *sig_out, GPU_Complex *sig_in, float rem_carrier_phase_in_rad, float phase_step_rad, int numElements) | ||||
| { | ||||
| 	//*** NCO CPU code (GNURadio FXP NCO) | ||||
| 	//float sin_f, cos_f; | ||||
| 	//float phase_step_rad = static_cast<float>(2 * GALILEO_PI) * d_carrier_doppler_hz / static_cast<float>(d_fs_in); | ||||
| 	//int phase_step_rad_i = gr::fxpt::float_to_fixed(phase_step_rad); | ||||
| 	//int phase_rad_i = gr::fxpt::float_to_fixed(d_rem_carr_phase_rad); | ||||
| 	// | ||||
| 	//for(int i = 0; i < d_current_prn_length_samples; i++) | ||||
| 	//    { | ||||
| 	//        gr::fxpt::sincos(phase_rad_i, &sin_f, &cos_f); | ||||
| 	//        d_carr_sign[i] = std::complex<float>(cos_f, -sin_f); | ||||
| 	//        phase_rad_i += phase_step_rad_i; | ||||
| 	//    } | ||||
|  | ||||
| 	// CUDA version of floating point NCO and vector dot product integrated | ||||
|  | ||||
|     float sin; | ||||
|     float cos; | ||||
|     for (int i = blockIdx.x * blockDim.x + threadIdx.x; | ||||
| @@ -319,110 +138,101 @@ CUDA_32fc_Doppler_wipeoff(  GPU_Complex *sig_out, GPU_Complex *sig_in, float rem | ||||
| } | ||||
|  | ||||
|  | ||||
| /** | ||||
|  * CUDA Kernel Device code | ||||
|  * | ||||
|  * Computes the vectorial product of A and B into C. The 3 vectors have the same | ||||
|  * number of elements numElements. | ||||
|  */ | ||||
| __global__ void | ||||
| CUDA_32fc_x2_add_32fc(  GPU_Complex *A,   GPU_Complex  *B, GPU_Complex  *C, int numElements) | ||||
| __global__ void Doppler_wippe_scalarProdGPUCPXxN_shifts_chips( | ||||
|     GPU_Complex *d_corr_out, | ||||
|     GPU_Complex *d_sig_in, | ||||
|     GPU_Complex *d_sig_wiped, | ||||
|     GPU_Complex *d_local_code_in, | ||||
|     float *d_shifts_chips, | ||||
|     float code_length_chips, | ||||
|     float code_phase_step_chips, | ||||
|     float rem_code_phase_chips, | ||||
|     int vectorN, | ||||
|     int elementN, | ||||
|     float rem_carrier_phase_in_rad, | ||||
|     float phase_step_rad | ||||
| ) | ||||
| { | ||||
|     //Accumulators cache | ||||
|     __shared__ GPU_Complex accumResult[ACCUM_N]; | ||||
|  | ||||
| 	// CUDA version of floating point NCO and vector dot product integrated | ||||
|     float sin; | ||||
|     float cos; | ||||
|     for (int i = blockIdx.x * blockDim.x + threadIdx.x; | ||||
|          i < numElements; | ||||
|          i < elementN; | ||||
|          i += blockDim.x * gridDim.x) | ||||
|     { | ||||
|         C[i] =  A[i] + B[i]; | ||||
|     	__sincosf(rem_carrier_phase_in_rad + i*phase_step_rad, &sin, &cos); | ||||
|     	d_sig_wiped[i] =  d_sig_in[i] * GPU_Complex(cos,-sin); | ||||
|     } | ||||
|  | ||||
|     __syncthreads(); | ||||
|     //////////////////////////////////////////////////////////////////////////// | ||||
|     // Cycle through every pair of vectors, | ||||
|     // taking into account that vector counts can be different | ||||
|     // from total number of thread blocks | ||||
|     //////////////////////////////////////////////////////////////////////////// | ||||
|     for (int vec = blockIdx.x; vec < vectorN; vec += gridDim.x) | ||||
|     { | ||||
|         //int vectorBase = IMUL(elementN, vec); | ||||
|         //int vectorEnd  = elementN; | ||||
|  | ||||
|         //////////////////////////////////////////////////////////////////////// | ||||
|         // Each accumulator cycles through vectors with | ||||
|         // stride equal to the total number of accumulators (ACCUM_N). | ||||
|         // At this stage ACCUM_N is only preferred to be a multiple of the warp size | ||||
|         // to meet memory coalescing alignment constraints. | ||||
|         //////////////////////////////////////////////////////////////////////// | ||||
|         for (int iAccum = threadIdx.x; iAccum < ACCUM_N; iAccum += blockDim.x) | ||||
|         { | ||||
|         	GPU_Complex sum = GPU_Complex(0,0); | ||||
|             float local_code_chip_index; | ||||
|             //float code_phase; | ||||
|             for (int pos = iAccum; pos < elementN; pos += ACCUM_N) | ||||
|             { | ||||
|             	//original sample code | ||||
|                 //sum = sum + d_sig_in[pos-vectorBase] * d_nco_in[pos-vectorBase] * d_local_codes_in[pos]; | ||||
|             	//sum = sum + d_sig_in[pos-vectorBase] * d_local_codes_in[pos]; | ||||
|             	//sum.multiply_acc(d_sig_in[pos],d_local_codes_in[pos+d_shifts_samples[vec]]); | ||||
|  | ||||
|             	//custom code for multitap correlator | ||||
|             	// 1.resample local code for the current shift | ||||
|  | ||||
|             	local_code_chip_index= fmodf(code_phase_step_chips*__int2float_rd(pos)+ d_shifts_chips[vec] - rem_code_phase_chips, code_length_chips); | ||||
|  | ||||
|             	//Take into account that in multitap correlators, the shifts can be negative! | ||||
|             	if (local_code_chip_index<0.0) local_code_chip_index+=code_length_chips; | ||||
|             	//printf("vec= %i, pos %i, chip_idx=%i chip_shift=%f \r\n",vec, pos,__float2int_rd(local_code_chip_index),local_code_chip_index); | ||||
|             	// 2.correlate | ||||
|             	sum.multiply_acc(d_sig_wiped[pos],d_local_code_in[__float2int_rd(local_code_chip_index)]); | ||||
|  | ||||
|             } | ||||
|             accumResult[iAccum] = sum; | ||||
|         } | ||||
|  | ||||
|         //////////////////////////////////////////////////////////////////////// | ||||
|         // Perform tree-like reduction of accumulators' results. | ||||
|         // ACCUM_N has to be power of two at this stage | ||||
|         //////////////////////////////////////////////////////////////////////// | ||||
|         for (int stride = ACCUM_N / 2; stride > 0; stride >>= 1) | ||||
|         { | ||||
|             __syncthreads(); | ||||
|  | ||||
|             for (int iAccum = threadIdx.x; iAccum < stride; iAccum += blockDim.x) | ||||
|             { | ||||
|                 accumResult[iAccum] += accumResult[stride + iAccum]; | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         if (threadIdx.x == 0) | ||||
|         	{ | ||||
|         		d_corr_out[vec] = accumResult[0]; | ||||
|         	} | ||||
|     } | ||||
| } | ||||
|  | ||||
|  | ||||
| bool cuda_multicorrelator::init_cuda(const int argc, const char **argv, int signal_length_samples, int local_codes_length_samples, int n_correlators) | ||||
| { | ||||
| 	// use command-line specified CUDA device, otherwise use device with highest Gflops/s | ||||
| //	findCudaDevice(argc, (const char **)argv); | ||||
| //      cudaDeviceProp  prop; | ||||
| //    int num_devices, device; | ||||
| //    cudaGetDeviceCount(&num_devices); | ||||
| //    if (num_devices > 1) { | ||||
| //          int max_multiprocessors = 0, max_device = 0; | ||||
| //          for (device = 0; device < num_devices; device++) { | ||||
| //                  cudaDeviceProp properties; | ||||
| //                  cudaGetDeviceProperties(&properties, device); | ||||
| //                  if (max_multiprocessors < properties.multiProcessorCount) { | ||||
| //                          max_multiprocessors = properties.multiProcessorCount; | ||||
| //                          max_device = device; | ||||
| //                  } | ||||
| //                  printf("Found GPU device # %i\n",device); | ||||
| //          } | ||||
| //          //cudaSetDevice(max_device); | ||||
| // | ||||
| //          //set random device! | ||||
| //          cudaSetDevice(rand() % num_devices); //generates a random number between 0 and num_devices to split the threads between GPUs | ||||
| // | ||||
| //          cudaGetDeviceProperties( &prop, max_device ); | ||||
| //          //debug code | ||||
| //          if (prop.canMapHostMemory != 1) { | ||||
| //              printf( "Device can not map memory.\n" ); | ||||
| //          } | ||||
| //          printf("L2 Cache size= %u \n",prop.l2CacheSize); | ||||
| //          printf("maxThreadsPerBlock= %u \n",prop.maxThreadsPerBlock); | ||||
| //          printf("maxGridSize= %i \n",prop.maxGridSize[0]); | ||||
| //          printf("sharedMemPerBlock= %lu \n",prop.sharedMemPerBlock); | ||||
| //          printf("deviceOverlap= %i \n",prop.deviceOverlap); | ||||
| //  	    printf("multiProcessorCount= %i \n",prop.multiProcessorCount); | ||||
| //    }else{ | ||||
| //    	    int whichDevice; | ||||
| //    	    cudaGetDevice( &whichDevice ); | ||||
| //    	    cudaGetDeviceProperties( &prop, whichDevice ); | ||||
| //    	    //debug code | ||||
| //    	    if (prop.canMapHostMemory != 1) { | ||||
| //    	        printf( "Device can not map memory.\n" ); | ||||
| //    	    } | ||||
| // | ||||
| //    	    printf("L2 Cache size= %u \n",prop.l2CacheSize); | ||||
| //    	    printf("maxThreadsPerBlock= %u \n",prop.maxThreadsPerBlock); | ||||
| //    	    printf("maxGridSize= %i \n",prop.maxGridSize[0]); | ||||
| //    	    printf("sharedMemPerBlock= %lu \n",prop.sharedMemPerBlock); | ||||
| //    	    printf("deviceOverlap= %i \n",prop.deviceOverlap); | ||||
| //    	    printf("multiProcessorCount= %i \n",prop.multiProcessorCount); | ||||
| //    } | ||||
|  | ||||
| 	// (cudaFuncSetCacheConfig(CUDA_32fc_x2_multiply_x2_dot_prod_32fc_, cudaFuncCachePreferShared)); | ||||
|  | ||||
|  | ||||
|     // ALLOCATE GPU MEMORY FOR INPUT/OUTPUT and INTERNAL vectors | ||||
|  | ||||
|     size_t size = signal_length_samples * sizeof(GPU_Complex); | ||||
|  | ||||
| 	cudaMalloc((void **)&d_sig_in, size); | ||||
| 	// (cudaMalloc((void **)&d_nco_in, size)); | ||||
| 	cudaMalloc((void **)&d_sig_doppler_wiped, size); | ||||
|  | ||||
| 	// old version: all local codes are independent vectors | ||||
| 	// (cudaMalloc((void **)&d_local_codes_in, size*n_correlators)); | ||||
|  | ||||
| 	// new version: only one vector with extra samples to shift the local code for the correlator set | ||||
| 	// Required: The last correlator tap in d_shifts_samples has the largest sample shift | ||||
|     size_t size_local_code_bytes = local_codes_length_samples * sizeof(GPU_Complex); | ||||
| 	cudaMalloc((void **)&d_local_codes_in, size_local_code_bytes); | ||||
| 	cudaMalloc((void **)&d_shifts_samples, sizeof(int)*n_correlators); | ||||
|  | ||||
| 	//scalars | ||||
| 	cudaMalloc((void **)&d_corr_out, sizeof(std::complex<float>)*n_correlators); | ||||
|  | ||||
|     // Launch the Vector Add CUDA Kernel | ||||
| 	threadsPerBlock = 256; | ||||
|     blocksPerGrid =(int)(signal_length_samples+threadsPerBlock-1)/threadsPerBlock; | ||||
|  | ||||
| 	cudaStreamCreate (&stream1) ; | ||||
| 	cudaStreamCreate (&stream2) ; | ||||
| 	return true; | ||||
| } | ||||
|  | ||||
|  | ||||
| bool cuda_multicorrelator::init_cuda_integrated_resampler( | ||||
| 		const int argc, const char **argv, | ||||
| 		int signal_length_samples, | ||||
| 		int code_length_chips, | ||||
| 		int n_correlators | ||||
| @@ -480,34 +290,45 @@ bool cuda_multicorrelator::init_cuda_integrated_resampler( | ||||
| 	// (cudaFuncSetCacheConfig(CUDA_32fc_x2_multiply_x2_dot_prod_32fc_, cudaFuncCachePreferShared)); | ||||
|  | ||||
|     // ALLOCATE GPU MEMORY FOR INPUT/OUTPUT and INTERNAL vectors | ||||
|  | ||||
|     size_t size = signal_length_samples * sizeof(GPU_Complex); | ||||
|  | ||||
| 	cudaMalloc((void **)&d_sig_in, size); | ||||
| 	cudaMemset(d_sig_in,0,size); | ||||
| 	//********* ZERO COPY VERSION ************ | ||||
| 	// Set flag to enable zero copy access | ||||
|     // Optimal in shared memory devices (like Jetson K1) | ||||
| 	cudaSetDeviceFlags(cudaDeviceMapHost); | ||||
|  | ||||
| 	// (cudaMalloc((void **)&d_nco_in, size)); | ||||
| 	//******** CudaMalloc version *********** | ||||
|  | ||||
| 	// input signal GPU memory (can be mapped to CPU memory in shared memory devices!) | ||||
| 	//	cudaMalloc((void **)&d_sig_in, size); | ||||
| 	//	cudaMemset(d_sig_in,0,size); | ||||
|  | ||||
| 	// Doppler-free signal (internal GPU memory) | ||||
| 	cudaMalloc((void **)&d_sig_doppler_wiped, size); | ||||
| 	cudaMemset(d_sig_doppler_wiped,0,size); | ||||
|  | ||||
| 	// Local code GPU memory (can be mapped to CPU memory in shared memory devices!) | ||||
| 	cudaMalloc((void **)&d_local_codes_in, sizeof(std::complex<float>)*code_length_chips); | ||||
| 	cudaMemset(d_local_codes_in,0,sizeof(std::complex<float>)*code_length_chips); | ||||
|  | ||||
|     d_code_length_chips=code_length_chips; | ||||
|  | ||||
| 	// Vector with the chip shifts for each correlator tap | ||||
|     //GPU memory (can be mapped to CPU memory in shared memory devices!) | ||||
| 	cudaMalloc((void **)&d_shifts_chips, sizeof(float)*n_correlators); | ||||
| 	cudaMemset(d_shifts_chips,0,sizeof(float)*n_correlators); | ||||
|  | ||||
| 	//scalars | ||||
| 	cudaMalloc((void **)&d_corr_out, sizeof(std::complex<float>)*n_correlators); | ||||
| 	cudaMemset(d_corr_out,0,sizeof(std::complex<float>)*n_correlators); | ||||
| 	//cudaMalloc((void **)&d_corr_out, sizeof(std::complex<float>)*n_correlators); | ||||
| 	//cudaMemset(d_corr_out,0,sizeof(std::complex<float>)*n_correlators); | ||||
|  | ||||
|     // Launch the Vector Add CUDA Kernel | ||||
| 	threadsPerBlock = 256; | ||||
|     // TODO: write a smart load balance using device info! | ||||
| 	threadsPerBlock = 64; | ||||
|     blocksPerGrid =(int)(signal_length_samples+threadsPerBlock-1)/threadsPerBlock; | ||||
|  | ||||
| 	cudaStreamCreate (&stream1) ; | ||||
| 	cudaStreamCreate (&stream2) ; | ||||
| 	//cudaStreamCreate (&stream2) ; | ||||
| 	return true; | ||||
| } | ||||
|  | ||||
| @@ -518,103 +339,57 @@ bool cuda_multicorrelator::set_local_code_and_taps( | ||||
| 		int n_correlators | ||||
| 		) | ||||
| { | ||||
|     // local code CPU -> GPU copy memory | ||||
| 	//********* ZERO COPY VERSION ************ | ||||
| //	// Get device pointer from host memory. No allocation or memcpy | ||||
| //	cudaError_t code; | ||||
| //	// local code CPU -> GPU copy memory | ||||
| //	code=cudaHostGetDevicePointer((void **)&d_local_codes_in,  (void *) local_codes_in, 0); | ||||
| //	if (code!=cudaSuccess) | ||||
| //	{ | ||||
| //		printf("cuda cudaHostGetDevicePointer error in set_local_code_and_taps \r\n"); | ||||
| //	} | ||||
| //	// Correlator shifts vector CPU -> GPU copy memory (fractional chip shifts are allowed!) | ||||
| //	code=cudaHostGetDevicePointer((void **)&d_shifts_chips,  (void *) shifts_chips, 0); | ||||
| //	if (code!=cudaSuccess) | ||||
| //	{ | ||||
| //		printf("cuda cudaHostGetDevicePointer error in set_local_code_and_taps \r\n"); | ||||
| //	} | ||||
|  | ||||
| 	//******** CudaMalloc version *********** | ||||
|     //local code CPU -> GPU copy memory | ||||
|     cudaMemcpyAsync(d_local_codes_in, local_codes_in, sizeof(GPU_Complex)*code_length_chips, cudaMemcpyHostToDevice,stream1); | ||||
|     d_code_length_chips=(float)code_length_chips; | ||||
|  | ||||
|     // Correlator shifts vector CPU -> GPU copy memory (fractional chip shifts are allowed!) | ||||
|     //Correlator shifts vector CPU -> GPU copy memory (fractional chip shifts are allowed!) | ||||
|     cudaMemcpyAsync(d_shifts_chips, shifts_chips, sizeof(float)*n_correlators, | ||||
|                                     cudaMemcpyHostToDevice,stream1); | ||||
|  | ||||
| 	return true; | ||||
| } | ||||
|  | ||||
|  | ||||
|  | ||||
| bool cuda_multicorrelator::Carrier_wipeoff_multicorrelator_cuda( | ||||
| bool cuda_multicorrelator::set_input_output_vectors( | ||||
| 		std::complex<float>* corr_out, | ||||
| 		const std::complex<float>* sig_in, | ||||
| 		const std::complex<float>* local_codes_in, | ||||
| 		float rem_carrier_phase_in_rad, | ||||
| 		float phase_step_rad, | ||||
| 		const int *shifts_samples, | ||||
| 		int signal_length_samples, | ||||
| 		int n_correlators) | ||||
| 		std::complex<float>* sig_in | ||||
| 		) | ||||
| { | ||||
|  | ||||
| 	// Save CPU pointers | ||||
| 	d_sig_in_cpu =sig_in; | ||||
| 	d_corr_out_cpu = corr_out; | ||||
|  | ||||
| 	// Zero Copy version | ||||
| 	// Get device pointer from host memory. No allocation or memcpy | ||||
| 	cudaError_t code; | ||||
| 	code=cudaHostGetDevicePointer((void **)&d_sig_in,  (void *) sig_in, 0); | ||||
| 	code=cudaHostGetDevicePointer((void **)&d_corr_out,  (void *) corr_out, 0); | ||||
| 	if (code!=cudaSuccess) | ||||
| 	{ | ||||
| 		printf("cuda cudaHostGetDevicePointer error \r\n"); | ||||
| 	} | ||||
| 	return true; | ||||
|  | ||||
| 	size_t memSize = signal_length_samples * sizeof(std::complex<float>); | ||||
|  | ||||
| 	// input signal CPU -> GPU copy memory | ||||
|  | ||||
|     cudaMemcpyAsync(d_sig_in, sig_in, memSize, | ||||
|                                     cudaMemcpyHostToDevice, stream1); | ||||
|  | ||||
|     //***** NOTICE: NCO is computed on-the-fly, no need to copy the NCO into the GPU! **** | ||||
|     // (cudaMemcpyAsync(d_nco_in, nco_in, memSize, | ||||
|     //                                cudaMemcpyHostToDevice, stream1)); | ||||
|  | ||||
|  | ||||
| 	// old version: all local codes are independent vectors | ||||
|     // (cudaMemcpyAsync(d_local_codes_in, local_codes_in, memSize*n_correlators, | ||||
|     //                                cudaMemcpyHostToDevice, stream2)); | ||||
|  | ||||
| 	// new version: only one vector with extra samples to shift the local code for the correlator set | ||||
| 	// Required: The last correlator tap in d_shifts_samples has the largest sample shift | ||||
|  | ||||
|     // local code CPU -> GPU copy memory | ||||
|     cudaMemcpyAsync(d_local_codes_in, local_codes_in, memSize+sizeof(std::complex<float>)*shifts_samples[n_correlators-1], | ||||
|                                     cudaMemcpyHostToDevice, stream2); | ||||
|     // Correlator shifts vector CPU -> GPU copy memory | ||||
|     cudaMemcpyAsync(d_shifts_samples, shifts_samples, sizeof(int)*n_correlators, | ||||
|                                     cudaMemcpyHostToDevice, stream2); | ||||
|  | ||||
|  | ||||
|     //Launch carrier wipe-off kernel here, while local codes are being copied to GPU! | ||||
|     cudaStreamSynchronize(stream1); | ||||
|     CUDA_32fc_Doppler_wipeoff<<<blocksPerGrid, threadsPerBlock,0, stream1>>>(d_sig_doppler_wiped, d_sig_in,rem_carrier_phase_in_rad,phase_step_rad, signal_length_samples); | ||||
|  | ||||
|  | ||||
|     //printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock); | ||||
|  | ||||
|     //wait for Doppler wipeoff end... | ||||
|     cudaStreamSynchronize(stream1); | ||||
|     cudaStreamSynchronize(stream2); | ||||
|     // (cudaDeviceSynchronize()); | ||||
|  | ||||
|     //old | ||||
| //    scalarProdGPUCPXxN<<<blocksPerGrid, threadsPerBlock,0 ,stream2>>>( | ||||
| //    		d_corr_out, | ||||
| //    		d_sig_doppler_wiped, | ||||
| //    		d_local_codes_in, | ||||
| //            3, | ||||
| //            signal_length_samples | ||||
| //        ); | ||||
|  | ||||
|     //new | ||||
|     //launch the multitap correlator | ||||
|     scalarProdGPUCPXxN_shifts<<<blocksPerGrid, threadsPerBlock,0 ,stream2>>>( | ||||
| 			d_corr_out, | ||||
| 			d_sig_doppler_wiped, | ||||
| 			d_local_codes_in, | ||||
| 			d_shifts_samples, | ||||
| 			n_correlators, | ||||
| 			signal_length_samples | ||||
| 		); | ||||
|     cudaGetLastError(); | ||||
|     //wait for correlators end... | ||||
|     cudaStreamSynchronize(stream2); | ||||
|     // Copy the device result vector in device memory to the host result vector | ||||
|     // in host memory. | ||||
|  | ||||
|     //scalar products (correlators outputs) | ||||
|     cudaMemcpy(corr_out, d_corr_out, sizeof(std::complex<float>)*n_correlators, | ||||
|             cudaMemcpyDeviceToHost); | ||||
|     return true; | ||||
| } | ||||
|  | ||||
| bool cuda_multicorrelator::Carrier_wipeoff_multicorrelator_resampler_cuda( | ||||
| 		std::complex<float>* corr_out, | ||||
| 		const std::complex<float>* sig_in, | ||||
| 		float rem_carrier_phase_in_rad, | ||||
| 		float phase_step_rad, | ||||
|         float code_phase_step_chips, | ||||
| @@ -623,26 +398,40 @@ bool cuda_multicorrelator::Carrier_wipeoff_multicorrelator_resampler_cuda( | ||||
| 		int n_correlators) | ||||
| 	{ | ||||
|  | ||||
| 	size_t memSize = signal_length_samples * sizeof(std::complex<float>); | ||||
|  | ||||
| 	// cudaMemCpy version | ||||
| 	//size_t memSize = signal_length_samples * sizeof(std::complex<float>); | ||||
| 	// input signal CPU -> GPU copy memory | ||||
|     cudaMemcpyAsync(d_sig_in, sig_in, memSize, | ||||
|                                     cudaMemcpyHostToDevice, stream2); | ||||
|     //cudaMemcpyAsync(d_sig_in, d_sig_in_cpu, memSize, | ||||
|     //                               cudaMemcpyHostToDevice, stream2); | ||||
|  | ||||
|     //***** NOTICE: NCO is computed on-the-fly, no need to copy the NCO into the GPU! **** | ||||
|  | ||||
|     //Launch carrier wipe-off kernel here, while local codes are being copied to GPU! | ||||
|     cudaStreamSynchronize(stream2); | ||||
|     //cudaStreamSynchronize(stream2); | ||||
|  | ||||
|     CUDA_32fc_Doppler_wipeoff<<<blocksPerGrid, threadsPerBlock,0, stream2>>>(d_sig_doppler_wiped, d_sig_in,rem_carrier_phase_in_rad,phase_step_rad, signal_length_samples); | ||||
|     //CUDA_32fc_Doppler_wipeoff<<<blocksPerGrid, threadsPerBlock,0, stream1>>>(d_sig_doppler_wiped, d_sig_in,rem_carrier_phase_in_rad,phase_step_rad, signal_length_samples); | ||||
|  | ||||
|     //wait for Doppler wipeoff end... | ||||
|     cudaStreamSynchronize(stream1); | ||||
|     cudaStreamSynchronize(stream2); | ||||
|     //cudaStreamSynchronize(stream1); | ||||
|     //cudaStreamSynchronize(stream2); | ||||
|  | ||||
|     //launch the multitap correlator with integrated local code resampler! | ||||
|  | ||||
|     scalarProdGPUCPXxN_shifts_chips<<<blocksPerGrid, threadsPerBlock,0 ,stream1>>>( | ||||
| //    scalarProdGPUCPXxN_shifts_chips<<<blocksPerGrid, threadsPerBlock,0 ,stream1>>>( | ||||
| //			d_corr_out, | ||||
| //			d_sig_doppler_wiped, | ||||
| //			d_local_codes_in, | ||||
| //			d_shifts_chips, | ||||
| //			d_code_length_chips, | ||||
| //	        code_phase_step_chips, | ||||
| //	        rem_code_phase_chips, | ||||
| //			n_correlators, | ||||
| //			signal_length_samples | ||||
| //		); | ||||
|  | ||||
|     Doppler_wippe_scalarProdGPUCPXxN_shifts_chips<<<blocksPerGrid, threadsPerBlock,0 ,stream1>>>( | ||||
| 			d_corr_out, | ||||
| 			d_sig_in, | ||||
| 			d_sig_doppler_wiped, | ||||
| 			d_local_codes_in, | ||||
| 			d_shifts_chips, | ||||
| @@ -650,23 +439,33 @@ bool cuda_multicorrelator::Carrier_wipeoff_multicorrelator_resampler_cuda( | ||||
| 	        code_phase_step_chips, | ||||
| 	        rem_code_phase_chips, | ||||
| 			n_correlators, | ||||
| 			signal_length_samples | ||||
| 		); | ||||
| 			signal_length_samples, | ||||
| 			rem_carrier_phase_in_rad, | ||||
| 			phase_step_rad | ||||
| 			); | ||||
|  | ||||
|     cudaGetLastError(); | ||||
|     //debug | ||||
| //	std::complex<float>* debug_signal; | ||||
| //	debug_signal=static_cast<std::complex<float>*>(malloc(memSize)); | ||||
| //    cudaMemcpyAsync(debug_signal, d_sig_doppler_wiped, memSize, | ||||
| //            cudaMemcpyDeviceToHost,stream1); | ||||
| //    cudaStreamSynchronize(stream1); | ||||
| //	std::cout<<"d_sig_doppler_wiped GPU="<<debug_signal[456]<<","<<debug_signal[1]<<","<<debug_signal[2]<<","<<debug_signal[3]<<std::endl; | ||||
|  | ||||
|     //cudaGetLastError(); | ||||
|     //wait for correlators end... | ||||
|     cudaStreamSynchronize(stream1); | ||||
|     //cudaStreamSynchronize(stream1); | ||||
|     // Copy the device result vector in device memory to the host result vector | ||||
|     // in host memory. | ||||
|  | ||||
|     //scalar products (correlators outputs) | ||||
|     cudaMemcpyAsync(corr_out, d_corr_out, sizeof(std::complex<float>)*n_correlators, | ||||
|             cudaMemcpyDeviceToHost,stream1); | ||||
|     //cudaMemcpyAsync(corr_out, d_corr_out, sizeof(std::complex<float>)*n_correlators, | ||||
|     //        cudaMemcpyDeviceToHost,stream1); | ||||
|  | ||||
|     cudaStreamSynchronize(stream1); | ||||
|     return true; | ||||
| } | ||||
|  | ||||
|  | ||||
| cuda_multicorrelator::cuda_multicorrelator() | ||||
| { | ||||
| 	d_sig_in=NULL; | ||||
| @@ -689,22 +488,16 @@ bool cuda_multicorrelator::free_cuda() | ||||
| 	if (d_sig_doppler_wiped!=NULL) cudaFree(d_sig_doppler_wiped); | ||||
| 	if (d_local_codes_in!=NULL) cudaFree(d_local_codes_in); | ||||
| 	if (d_corr_out!=NULL) cudaFree(d_corr_out); | ||||
|  | ||||
|  | ||||
| 	if (d_shifts_samples!=NULL) cudaFree(d_shifts_samples); | ||||
| 	if (d_shifts_chips!=NULL) cudaFree(d_shifts_chips); | ||||
|  | ||||
|  | ||||
| 	cudaStreamDestroy(stream1) ; | ||||
| 	cudaStreamDestroy(stream2) ; | ||||
|  | ||||
|     // Reset the device and exit | ||||
|     // cudaDeviceReset causes the driver to clean up all state. While | ||||
|     // not mandatory in normal operation, it is good practice.  It is also | ||||
|     // needed to ensure correct operation when the application is being | ||||
|     // profiled. Calling cudaDeviceReset causes all profile data to be | ||||
|     // flushed before the application exits | ||||
| 	// (cudaDeviceReset()); | ||||
| 	cudaDeviceReset(); | ||||
| 	return true; | ||||
| } | ||||
|  | ||||
|   | ||||
| @@ -114,9 +114,7 @@ class cuda_multicorrelator | ||||
| { | ||||
| public: | ||||
|     cuda_multicorrelator(); | ||||
|     bool init_cuda(const int argc, const char **argv, int signal_length_samples, int local_codes_length_samples, int n_correlators); | ||||
|     bool init_cuda_integrated_resampler( | ||||
|             const int argc, const char **argv, | ||||
|             int signal_length_samples, | ||||
|             int code_length_chips, | ||||
|             int n_correlators | ||||
| @@ -127,19 +125,12 @@ public: | ||||
|             float *shifts_chips, | ||||
|             int n_correlators | ||||
|     ); | ||||
|     bool set_input_output_vectors( | ||||
|     		std::complex<float>* corr_out, | ||||
|     		std::complex<float>* sig_in | ||||
|     		); | ||||
|     bool free_cuda(); | ||||
|     bool Carrier_wipeoff_multicorrelator_cuda( | ||||
|             std::complex<float>* corr_out, | ||||
|             const std::complex<float>* sig_in, | ||||
|             const std::complex<float>* local_codes_in, | ||||
|             float rem_carrier_phase_in_rad, | ||||
|             float phase_step_rad, | ||||
|             const int *shifts_samples, | ||||
|             int signal_length_samples, | ||||
|             int n_correlators); | ||||
|     bool Carrier_wipeoff_multicorrelator_resampler_cuda( | ||||
|             std::complex<float>* corr_out, | ||||
|             const std::complex<float>* sig_in, | ||||
|             float rem_carrier_phase_in_rad, | ||||
|             float phase_step_rad, | ||||
|             float code_phase_step_chips, | ||||
| @@ -154,6 +145,11 @@ private: | ||||
|     GPU_Complex *d_sig_doppler_wiped; | ||||
|     GPU_Complex *d_local_codes_in; | ||||
|     GPU_Complex *d_corr_out; | ||||
|  | ||||
|     // | ||||
|     std::complex<float> *d_sig_in_cpu; | ||||
|     std::complex<float> *d_corr_out_cpu; | ||||
|  | ||||
|     int *d_shifts_samples; | ||||
|     float *d_shifts_chips; | ||||
|     float d_code_length_chips; | ||||
| @@ -162,7 +158,7 @@ private: | ||||
|     int blocksPerGrid; | ||||
|  | ||||
|     cudaStream_t stream1; | ||||
|     cudaStream_t stream2; | ||||
|     //cudaStream_t stream2; | ||||
|     int num_gpu_devices; | ||||
|     int selected_device; | ||||
| }; | ||||
|   | ||||