Mirror of https://github.com/gnss-sdr/gnss-sdr (synced 2025-10-31 15:23:04 +00:00)

Pinned memory (zero copy): huge improvement for GPU tracking.
Author: Javier Arribas
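
The change replaces explicit cudaMemcpy transfers of the tracking input samples and correlator outputs with zero-copy access to pinned (page-locked) host memory: the device flag cudaDeviceMapHost is set at init time, and set_input_output_vectors() obtains device-side aliases of the host buffers with cudaHostGetDevicePointer(), so the kernels read and write host memory directly. Below is a minimal, self-contained sketch of that pattern, not the gnss-sdr code itself; the scale() kernel and buffer names are illustrative, and it assumes a GPU that supports host-mapped memory (the shared-memory devices this commit targets).

    // Zero-copy pinned memory sketch (hypothetical example, not part of this commit).
    #include <cuda_runtime.h>
    #include <cstdio>

    __global__ void scale(const float *in, float *out, float gain, int n)
    {
        for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x)
            out[i] = gain * in[i];    // reads/writes go straight to the mapped host buffers
    }

    int main()
    {
        const int n = 1 << 16;

        // Must be set before any mapped pinned allocation
        // (the commit does this in init_cuda_integrated_resampler).
        cudaSetDeviceFlags(cudaDeviceMapHost);

        // Page-locked host buffers, mapped into the GPU address space.
        float *h_in = NULL, *h_out = NULL;
        cudaHostAlloc((void **)&h_in,  n * sizeof(float), cudaHostAllocMapped);
        cudaHostAlloc((void **)&h_out, n * sizeof(float), cudaHostAllocMapped);
        for (int i = 0; i < n; i++) h_in[i] = (float)i;

        // Device-side aliases of the host buffers: no cudaMalloc, no cudaMemcpy
        // (this is what set_input_output_vectors() does for sig_in and corr_out).
        float *d_in = NULL, *d_out = NULL;
        cudaHostGetDevicePointer((void **)&d_in,  (void *)h_in,  0);
        cudaHostGetDevicePointer((void **)&d_out, (void *)h_out, 0);

        scale<<<(n + 255) / 256, 256>>>(d_in, d_out, 2.0f, n);
        cudaDeviceSynchronize();          // results are then already visible in h_out
        printf("h_out[10] = %f\n", h_out[10]);

        cudaFreeHost(h_in);
        cudaFreeHost(h_out);
        return 0;
    }

On a device where CPU and GPU share the same physical memory this removes the per-iteration host-to-device and device-to-host copies entirely; on a discrete GPU the same calls work, but each access then crosses the PCIe bus, so the benefit is hardware dependent.
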
		| @@ -128,13 +128,9 @@ private: | ||||
|  | ||||
|     //GPU HOST PINNED MEMORY IN/OUT VECTORS | ||||
|     gr_complex* in_gpu; | ||||
|     gr_complex* d_carr_sign_gpu; | ||||
|     gr_complex* d_local_codes_gpu; | ||||
|     float* d_local_code_shift_chips; | ||||
|     gr_complex* d_corr_outs_gpu; | ||||
|     cuda_multicorrelator *multicorrelator_gpu; | ||||
|  | ||||
|  | ||||
|     gr_complex* d_ca_code; | ||||
|  | ||||
|     gr_complex *d_Early; | ||||
|   | ||||
| @@ -32,26 +32,14 @@ | ||||
|  * ------------------------------------------------------------------------- | ||||
|  */ | ||||
|  | ||||
| /////////////////////////////////////////////////////////////////////////////// | ||||
| // On G80-class hardware 24-bit multiplication takes 4 clocks per warp | ||||
| // (the same as for floating point  multiplication and addition), | ||||
| // whereas full 32-bit multiplication takes 16 clocks per warp. | ||||
| // So if integer multiplication operands are  guaranteed to fit into 24 bits | ||||
| // (always lie within [-8M, 8M - 1] range in signed case), | ||||
| // explicit 24-bit multiplication is preferred for performance. | ||||
| /////////////////////////////////////////////////////////////////////////////// | ||||
| #define IMUL(a, b) __mul24(a, b) | ||||
|  | ||||
| #include "cuda_multicorrelator.h" | ||||
|  | ||||
| #include <stdio.h> | ||||
|  | ||||
| #include <iostream> | ||||
| // For the CUDA runtime routines (prefixed with "cuda_") | ||||
| #include <cuda_runtime.h> | ||||
|  | ||||
|  | ||||
| #define ACCUM_N 256 | ||||
|  | ||||
| #define ACCUM_N 128 | ||||
|  | ||||
| __global__ void scalarProdGPUCPXxN_shifts_chips( | ||||
|     GPU_Complex *d_corr_out, | ||||
| @@ -90,15 +78,17 @@ __global__ void scalarProdGPUCPXxN_shifts_chips( | ||||
|  | ||||
|             for (int pos = iAccum; pos < elementN; pos += ACCUM_N) | ||||
|             { | ||||
|             	//original sample code | ||||
|                 //sum = sum + d_sig_in[pos-vectorBase] * d_nco_in[pos-vectorBase] * d_local_codes_in[pos]; | ||||
|             	//sum = sum + d_sig_in[pos-vectorBase] * d_local_codes_in[pos]; | ||||
|             	//sum.multiply_acc(d_sig_in[pos],d_local_codes_in[pos+d_shifts_samples[vec]]); | ||||
|  | ||||
|             	//custom code for multitap correlator | ||||
|             	// 1.resample local code for the current shift | ||||
|             	float local_code_chip_index= fmod(code_phase_step_chips*(float)pos + d_shifts_chips[vec] - rem_code_phase_chips, code_length_chips); | ||||
|             	//TODO: Take into account that in multitap correlators, the shifts can be negative! | ||||
|             	//Take into account that in multitap correlators, the shifts can be negative! | ||||
|             	if (local_code_chip_index<0.0) local_code_chip_index+=code_length_chips; | ||||
|  | ||||
|             	//printf("vec= %i, pos %i, chip_idx=%i chip_shift=%f \r\n",vec, pos,__float2int_rd(local_code_chip_index),local_code_chip_index); | ||||
|             	// 2.correlate | ||||
|             	sum.multiply_acc(d_sig_in[pos],d_local_code_in[__float2int_rd(local_code_chip_index)]); | ||||
|  | ||||
| @@ -127,163 +117,6 @@ __global__ void scalarProdGPUCPXxN_shifts_chips( | ||||
|     } | ||||
| } | ||||
|  | ||||
|  | ||||
| /////////////////////////////////////////////////////////////////////////////// | ||||
| // Calculate scalar products of VectorN vectors of ElementN elements on GPU | ||||
| // Parameters restrictions: | ||||
| // 1) ElementN is strongly preferred to be a multiple of warp size to | ||||
| //    meet alignment constraints of memory coalescing. | ||||
| // 2) ACCUM_N must be a power of two. | ||||
| /////////////////////////////////////////////////////////////////////////////// | ||||
|  | ||||
|  | ||||
| __global__ void scalarProdGPUCPXxN_shifts( | ||||
|     GPU_Complex *d_corr_out, | ||||
|     GPU_Complex *d_sig_in, | ||||
|     GPU_Complex *d_local_codes_in, | ||||
|     int *d_shifts_samples, | ||||
|     int vectorN, | ||||
|     int elementN | ||||
| ) | ||||
| { | ||||
|     //Accumulators cache | ||||
|     __shared__ GPU_Complex accumResult[ACCUM_N]; | ||||
|  | ||||
|     //////////////////////////////////////////////////////////////////////////// | ||||
|     // Cycle through every pair of vectors, | ||||
|     // taking into account that vector counts can be different | ||||
|     // from total number of thread blocks | ||||
|     //////////////////////////////////////////////////////////////////////////// | ||||
|     for (int vec = blockIdx.x; vec < vectorN; vec += gridDim.x) | ||||
|     { | ||||
|         int vectorBase = IMUL(elementN, vec); | ||||
|         int vectorEnd  = vectorBase + elementN; | ||||
|  | ||||
|         //////////////////////////////////////////////////////////////////////// | ||||
|         // Each accumulator cycles through vectors with | ||||
|         // stride equal to the total number of accumulators (ACCUM_N). | ||||
|         // At this stage ACCUM_N is only preferred to be a multiple of the warp size | ||||
|         // to meet memory coalescing alignment constraints. | ||||
|         //////////////////////////////////////////////////////////////////////// | ||||
|         for (int iAccum = threadIdx.x; iAccum < ACCUM_N; iAccum += blockDim.x) | ||||
|         { | ||||
|         	GPU_Complex sum = GPU_Complex(0,0); | ||||
|  | ||||
|             for (int pos = vectorBase + iAccum; pos < vectorEnd; pos += ACCUM_N) | ||||
|             { | ||||
|                 //sum = sum + d_sig_in[pos-vectorBase] * d_nco_in[pos-vectorBase] * d_local_codes_in[pos]; | ||||
|             	//sum = sum + d_sig_in[pos-vectorBase] * d_local_codes_in[pos]; | ||||
|             	sum.multiply_acc(d_sig_in[pos-vectorBase],d_local_codes_in[pos-vectorBase+d_shifts_samples[vec]]); | ||||
|             } | ||||
|             accumResult[iAccum] = sum; | ||||
|         } | ||||
|  | ||||
|         //////////////////////////////////////////////////////////////////////// | ||||
|         // Perform tree-like reduction of accumulators' results. | ||||
|         // ACCUM_N has to be power of two at this stage | ||||
|         //////////////////////////////////////////////////////////////////////// | ||||
|         for (int stride = ACCUM_N / 2; stride > 0; stride >>= 1) | ||||
|         { | ||||
|             __syncthreads(); | ||||
|  | ||||
|             for (int iAccum = threadIdx.x; iAccum < stride; iAccum += blockDim.x) | ||||
|             { | ||||
|                 accumResult[iAccum] += accumResult[stride + iAccum]; | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         if (threadIdx.x == 0) | ||||
|         	{ | ||||
|         		d_corr_out[vec] = accumResult[0]; | ||||
|         	} | ||||
|     } | ||||
| } | ||||
|  | ||||
|  | ||||
| __global__ void scalarProdGPUCPXxN( | ||||
|     GPU_Complex *d_corr_out, | ||||
|     GPU_Complex *d_sig_in, | ||||
|     GPU_Complex *d_local_codes_in, | ||||
|     int vectorN, | ||||
|     int elementN | ||||
| ) | ||||
| { | ||||
|     //Accumulators cache | ||||
|     __shared__ GPU_Complex accumResult[ACCUM_N]; | ||||
|  | ||||
|     //////////////////////////////////////////////////////////////////////////// | ||||
|     // Cycle through every pair of vectors, | ||||
|     // taking into account that vector counts can be different | ||||
|     // from total number of thread blocks | ||||
|     //////////////////////////////////////////////////////////////////////////// | ||||
|     for (int vec = blockIdx.x; vec < vectorN; vec += gridDim.x) | ||||
|     { | ||||
|         //int vectorBase = IMUL(elementN, vec); | ||||
|         //int vectorEnd  = vectorBase + elementN; | ||||
|  | ||||
|         //////////////////////////////////////////////////////////////////////// | ||||
|         // Each accumulator cycles through vectors with | ||||
|         // stride equal to the total number of accumulators (ACCUM_N). | ||||
|         // At this stage ACCUM_N is only preferred to be a multiple of the warp size | ||||
|         // to meet memory coalescing alignment constraints. | ||||
|         //////////////////////////////////////////////////////////////////////// | ||||
|         for (int iAccum = threadIdx.x; iAccum < ACCUM_N; iAccum += blockDim.x) | ||||
|         { | ||||
|         	GPU_Complex sum = GPU_Complex(0,0); | ||||
|  | ||||
|             //for (int pos = vectorBase + iAccum; pos < vectorEnd; pos += ACCUM_N) | ||||
|         	for (int pos = iAccum; pos < elementN; pos += ACCUM_N) | ||||
|             { | ||||
|                 //sum = sum + d_sig_in[pos-vectorBase] * d_nco_in[pos-vectorBase] * d_local_codes_in[pos]; | ||||
|             	//sum = sum + d_sig_in[pos-vectorBase] * d_local_codes_in[pos]; | ||||
|             	//sum.multiply_acc(d_sig_in[pos-vectorBase],d_local_codes_in[pos]); | ||||
|         		sum.multiply_acc(d_sig_in[pos],d_local_codes_in[pos]); | ||||
|             } | ||||
|             accumResult[iAccum] = sum; | ||||
|         } | ||||
|  | ||||
|         //////////////////////////////////////////////////////////////////////// | ||||
|         // Perform tree-like reduction of accumulators' results. | ||||
|         // ACCUM_N has to be power of two at this stage | ||||
|         //////////////////////////////////////////////////////////////////////// | ||||
|         for (int stride = ACCUM_N / 2; stride > 0; stride >>= 1) | ||||
|         { | ||||
|             __syncthreads(); | ||||
|  | ||||
|             for (int iAccum = threadIdx.x; iAccum < stride; iAccum += blockDim.x) | ||||
|             { | ||||
|                 accumResult[iAccum] += accumResult[stride + iAccum]; | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         if (threadIdx.x == 0) | ||||
|         	{ | ||||
|         		d_corr_out[vec] = accumResult[0]; | ||||
|         	} | ||||
|     } | ||||
| } | ||||
|  | ||||
|  | ||||
| //*********** CUDA processing ************** | ||||
| // Threads: the minimal unit of parallel execution on the GPU | ||||
| // Blocks: a set of N threads | ||||
| /** | ||||
|  * CUDA Kernel Device code | ||||
|  * | ||||
|  * Computes the element-wise product of A and B into C. The 3 vectors have the same | ||||
|  * number of elements numElements. | ||||
|  */ | ||||
| __global__ void CUDA_32fc_x2_multiply_32fc(  GPU_Complex *A,   GPU_Complex  *B, GPU_Complex  *C, int numElements) | ||||
| { | ||||
|     for (int i = blockIdx.x * blockDim.x + threadIdx.x; | ||||
|          i < numElements; | ||||
|          i += blockDim.x * gridDim.x) | ||||
|     { | ||||
|         C[i] =  A[i] * B[i]; | ||||
|     } | ||||
| } | ||||
|  | ||||
|  | ||||
| /** | ||||
|  * CUDA Kernel Device code | ||||
|  * | ||||
| @@ -292,21 +125,7 @@ __global__ void CUDA_32fc_x2_multiply_32fc(  GPU_Complex *A,   GPU_Complex  *B, | ||||
| __global__ void | ||||
| CUDA_32fc_Doppler_wipeoff(  GPU_Complex *sig_out, GPU_Complex *sig_in, float rem_carrier_phase_in_rad, float phase_step_rad, int numElements) | ||||
| { | ||||
| 	//*** NCO CPU code (GNURadio FXP NCO) | ||||
| 	//float sin_f, cos_f; | ||||
| 	//float phase_step_rad = static_cast<float>(2 * GALILEO_PI) * d_carrier_doppler_hz / static_cast<float>(d_fs_in); | ||||
| 	//int phase_step_rad_i = gr::fxpt::float_to_fixed(phase_step_rad); | ||||
| 	//int phase_rad_i = gr::fxpt::float_to_fixed(d_rem_carr_phase_rad); | ||||
| 	// | ||||
| 	//for(int i = 0; i < d_current_prn_length_samples; i++) | ||||
| 	//    { | ||||
| 	//        gr::fxpt::sincos(phase_rad_i, &sin_f, &cos_f); | ||||
| 	//        d_carr_sign[i] = std::complex<float>(cos_f, -sin_f); | ||||
| 	//        phase_rad_i += phase_step_rad_i; | ||||
| 	//    } | ||||
|  | ||||
| 	// CUDA version of floating point NCO and vector dot product integrated | ||||
|  | ||||
|     float sin; | ||||
|     float cos; | ||||
|     for (int i = blockIdx.x * blockDim.x + threadIdx.x; | ||||
| @@ -319,110 +138,101 @@ CUDA_32fc_Doppler_wipeoff(  GPU_Complex *sig_out, GPU_Complex *sig_in, float rem | ||||
| } | ||||
|  | ||||
|  | ||||
| /** | ||||
|  * CUDA Kernel Device code | ||||
|  * | ||||
|  * Computes the vectorial product of A and B into C. The 3 vectors have the same | ||||
|  * number of elements numElements. | ||||
|  */ | ||||
| __global__ void | ||||
| CUDA_32fc_x2_add_32fc(  GPU_Complex *A,   GPU_Complex  *B, GPU_Complex  *C, int numElements) | ||||
| __global__ void Doppler_wippe_scalarProdGPUCPXxN_shifts_chips( | ||||
|     GPU_Complex *d_corr_out, | ||||
|     GPU_Complex *d_sig_in, | ||||
|     GPU_Complex *d_sig_wiped, | ||||
|     GPU_Complex *d_local_code_in, | ||||
|     float *d_shifts_chips, | ||||
|     float code_length_chips, | ||||
|     float code_phase_step_chips, | ||||
|     float rem_code_phase_chips, | ||||
|     int vectorN, | ||||
|     int elementN, | ||||
|     float rem_carrier_phase_in_rad, | ||||
|     float phase_step_rad | ||||
| ) | ||||
| { | ||||
|     //Accumulators cache | ||||
|     __shared__ GPU_Complex accumResult[ACCUM_N]; | ||||
|  | ||||
| 	// CUDA version of floating point NCO and vector dot product integrated | ||||
|     float sin; | ||||
|     float cos; | ||||
|     for (int i = blockIdx.x * blockDim.x + threadIdx.x; | ||||
|          i < numElements; | ||||
|          i < elementN; | ||||
|          i += blockDim.x * gridDim.x) | ||||
|     { | ||||
|         C[i] =  A[i] + B[i]; | ||||
|     	__sincosf(rem_carrier_phase_in_rad + i*phase_step_rad, &sin, &cos); | ||||
|     	d_sig_wiped[i] =  d_sig_in[i] * GPU_Complex(cos,-sin); | ||||
|     } | ||||
|  | ||||
|     __syncthreads(); | ||||
|     //////////////////////////////////////////////////////////////////////////// | ||||
|     // Cycle through every pair of vectors, | ||||
|     // taking into account that vector counts can be different | ||||
|     // from total number of thread blocks | ||||
|     //////////////////////////////////////////////////////////////////////////// | ||||
|     for (int vec = blockIdx.x; vec < vectorN; vec += gridDim.x) | ||||
|     { | ||||
|         //int vectorBase = IMUL(elementN, vec); | ||||
|         //int vectorEnd  = elementN; | ||||
|  | ||||
|         //////////////////////////////////////////////////////////////////////// | ||||
|         // Each accumulator cycles through vectors with | ||||
|         // stride equal to the total number of accumulators (ACCUM_N). | ||||
|         // At this stage ACCUM_N is only preferred to be a multiple of the warp size | ||||
|         // to meet memory coalescing alignment constraints. | ||||
|         //////////////////////////////////////////////////////////////////////// | ||||
|         for (int iAccum = threadIdx.x; iAccum < ACCUM_N; iAccum += blockDim.x) | ||||
|         { | ||||
|         	GPU_Complex sum = GPU_Complex(0,0); | ||||
|             float local_code_chip_index; | ||||
|             //float code_phase; | ||||
|             for (int pos = iAccum; pos < elementN; pos += ACCUM_N) | ||||
|             { | ||||
|             	//original sample code | ||||
|                 //sum = sum + d_sig_in[pos-vectorBase] * d_nco_in[pos-vectorBase] * d_local_codes_in[pos]; | ||||
|             	//sum = sum + d_sig_in[pos-vectorBase] * d_local_codes_in[pos]; | ||||
|             	//sum.multiply_acc(d_sig_in[pos],d_local_codes_in[pos+d_shifts_samples[vec]]); | ||||
|  | ||||
|             	//custom code for multitap correlator | ||||
|             	// 1.resample local code for the current shift | ||||
|  | ||||
|             	local_code_chip_index= fmodf(code_phase_step_chips*__int2float_rd(pos)+ d_shifts_chips[vec] - rem_code_phase_chips, code_length_chips); | ||||
|  | ||||
|             	//Take into account that in multitap correlators, the shifts can be negative! | ||||
|             	if (local_code_chip_index<0.0) local_code_chip_index+=code_length_chips; | ||||
|             	//printf("vec= %i, pos %i, chip_idx=%i chip_shift=%f \r\n",vec, pos,__float2int_rd(local_code_chip_index),local_code_chip_index); | ||||
|             	// 2.correlate | ||||
|             	sum.multiply_acc(d_sig_wiped[pos],d_local_code_in[__float2int_rd(local_code_chip_index)]); | ||||
|  | ||||
|             } | ||||
|             accumResult[iAccum] = sum; | ||||
|         } | ||||
|  | ||||
|         //////////////////////////////////////////////////////////////////////// | ||||
|         // Perform tree-like reduction of accumulators' results. | ||||
|         // ACCUM_N has to be power of two at this stage | ||||
|         //////////////////////////////////////////////////////////////////////// | ||||
|         for (int stride = ACCUM_N / 2; stride > 0; stride >>= 1) | ||||
|         { | ||||
|             __syncthreads(); | ||||
|  | ||||
|             for (int iAccum = threadIdx.x; iAccum < stride; iAccum += blockDim.x) | ||||
|             { | ||||
|                 accumResult[iAccum] += accumResult[stride + iAccum]; | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         if (threadIdx.x == 0) | ||||
|         	{ | ||||
|         		d_corr_out[vec] = accumResult[0]; | ||||
|         	} | ||||
|     } | ||||
| } | ||||
|  | ||||
|  | ||||
| bool cuda_multicorrelator::init_cuda(const int argc, const char **argv, int signal_length_samples, int local_codes_length_samples, int n_correlators) | ||||
| { | ||||
| 	// use command-line specified CUDA device, otherwise use device with highest Gflops/s | ||||
| //	findCudaDevice(argc, (const char **)argv); | ||||
| //      cudaDeviceProp  prop; | ||||
| //    int num_devices, device; | ||||
| //    cudaGetDeviceCount(&num_devices); | ||||
| //    if (num_devices > 1) { | ||||
| //          int max_multiprocessors = 0, max_device = 0; | ||||
| //          for (device = 0; device < num_devices; device++) { | ||||
| //                  cudaDeviceProp properties; | ||||
| //                  cudaGetDeviceProperties(&properties, device); | ||||
| //                  if (max_multiprocessors < properties.multiProcessorCount) { | ||||
| //                          max_multiprocessors = properties.multiProcessorCount; | ||||
| //                          max_device = device; | ||||
| //                  } | ||||
| //                  printf("Found GPU device # %i\n",device); | ||||
| //          } | ||||
| //          //cudaSetDevice(max_device); | ||||
| // | ||||
| //          //set random device! | ||||
| //          cudaSetDevice(rand() % num_devices); //generates a random number between 0 and num_devices to split the threads between GPUs | ||||
| // | ||||
| //          cudaGetDeviceProperties( &prop, max_device ); | ||||
| //          //debug code | ||||
| //          if (prop.canMapHostMemory != 1) { | ||||
| //              printf( "Device can not map memory.\n" ); | ||||
| //          } | ||||
| //          printf("L2 Cache size= %u \n",prop.l2CacheSize); | ||||
| //          printf("maxThreadsPerBlock= %u \n",prop.maxThreadsPerBlock); | ||||
| //          printf("maxGridSize= %i \n",prop.maxGridSize[0]); | ||||
| //          printf("sharedMemPerBlock= %lu \n",prop.sharedMemPerBlock); | ||||
| //          printf("deviceOverlap= %i \n",prop.deviceOverlap); | ||||
| //  	    printf("multiProcessorCount= %i \n",prop.multiProcessorCount); | ||||
| //    }else{ | ||||
| //    	    int whichDevice; | ||||
| //    	    cudaGetDevice( &whichDevice ); | ||||
| //    	    cudaGetDeviceProperties( &prop, whichDevice ); | ||||
| //    	    //debug code | ||||
| //    	    if (prop.canMapHostMemory != 1) { | ||||
| //    	        printf( "Device can not map memory.\n" ); | ||||
| //    	    } | ||||
| // | ||||
| //    	    printf("L2 Cache size= %u \n",prop.l2CacheSize); | ||||
| //    	    printf("maxThreadsPerBlock= %u \n",prop.maxThreadsPerBlock); | ||||
| //    	    printf("maxGridSize= %i \n",prop.maxGridSize[0]); | ||||
| //    	    printf("sharedMemPerBlock= %lu \n",prop.sharedMemPerBlock); | ||||
| //    	    printf("deviceOverlap= %i \n",prop.deviceOverlap); | ||||
| //    	    printf("multiProcessorCount= %i \n",prop.multiProcessorCount); | ||||
| //    } | ||||
|  | ||||
| 	// (cudaFuncSetCacheConfig(CUDA_32fc_x2_multiply_x2_dot_prod_32fc_, cudaFuncCachePreferShared)); | ||||
|  | ||||
|  | ||||
|     // ALLOCATE GPU MEMORY FOR INPUT/OUTPUT and INTERNAL vectors | ||||
|  | ||||
|     size_t size = signal_length_samples * sizeof(GPU_Complex); | ||||
|  | ||||
| 	cudaMalloc((void **)&d_sig_in, size); | ||||
| 	// (cudaMalloc((void **)&d_nco_in, size)); | ||||
| 	cudaMalloc((void **)&d_sig_doppler_wiped, size); | ||||
|  | ||||
| 	// old version: all local codes are independent vectors | ||||
| 	// (cudaMalloc((void **)&d_local_codes_in, size*n_correlators)); | ||||
|  | ||||
| 	// new version: only one vector with extra samples to shift the local code for the correlator set | ||||
| 	// Required: The last correlator tap in d_shifts_samples has the largest sample shift | ||||
|     size_t size_local_code_bytes = local_codes_length_samples * sizeof(GPU_Complex); | ||||
| 	cudaMalloc((void **)&d_local_codes_in, size_local_code_bytes); | ||||
| 	cudaMalloc((void **)&d_shifts_samples, sizeof(int)*n_correlators); | ||||
|  | ||||
| 	//scalars | ||||
| 	cudaMalloc((void **)&d_corr_out, sizeof(std::complex<float>)*n_correlators); | ||||
|  | ||||
|     // Launch the Vector Add CUDA Kernel | ||||
| 	threadsPerBlock = 256; | ||||
|     blocksPerGrid =(int)(signal_length_samples+threadsPerBlock-1)/threadsPerBlock; | ||||
|  | ||||
| 	cudaStreamCreate (&stream1) ; | ||||
| 	cudaStreamCreate (&stream2) ; | ||||
| 	return true; | ||||
| } | ||||
|  | ||||
|  | ||||
| bool cuda_multicorrelator::init_cuda_integrated_resampler( | ||||
| 		const int argc, const char **argv, | ||||
| 		int signal_length_samples, | ||||
| 		int code_length_chips, | ||||
| 		int n_correlators | ||||
| @@ -480,34 +290,45 @@ bool cuda_multicorrelator::init_cuda_integrated_resampler( | ||||
| 	// (cudaFuncSetCacheConfig(CUDA_32fc_x2_multiply_x2_dot_prod_32fc_, cudaFuncCachePreferShared)); | ||||
|  | ||||
|     // ALLOCATE GPU MEMORY FOR INPUT/OUTPUT and INTERNAL vectors | ||||
|  | ||||
|     size_t size = signal_length_samples * sizeof(GPU_Complex); | ||||
|  | ||||
| 	cudaMalloc((void **)&d_sig_in, size); | ||||
| 	cudaMemset(d_sig_in,0,size); | ||||
| 	//********* ZERO COPY VERSION ************ | ||||
| 	// Set flag to enable zero copy access | ||||
|     // Optimal in shared memory devices (like Jetson K1) | ||||
| 	cudaSetDeviceFlags(cudaDeviceMapHost); | ||||
|  | ||||
| 	// (cudaMalloc((void **)&d_nco_in, size)); | ||||
| 	//******** CudaMalloc version *********** | ||||
|  | ||||
| 	// input signal GPU memory (can be mapped to CPU memory in shared memory devices!) | ||||
| 	//	cudaMalloc((void **)&d_sig_in, size); | ||||
| 	//	cudaMemset(d_sig_in,0,size); | ||||
|  | ||||
| 	// Doppler-free signal (internal GPU memory) | ||||
| 	cudaMalloc((void **)&d_sig_doppler_wiped, size); | ||||
| 	cudaMemset(d_sig_doppler_wiped,0,size); | ||||
|  | ||||
| 	// Local code GPU memory (can be mapped to CPU memory in shared memory devices!) | ||||
| 	cudaMalloc((void **)&d_local_codes_in, sizeof(std::complex<float>)*code_length_chips); | ||||
| 	cudaMemset(d_local_codes_in,0,sizeof(std::complex<float>)*code_length_chips); | ||||
|  | ||||
|     d_code_length_chips=code_length_chips; | ||||
|  | ||||
| 	// Vector with the chip shifts for each correlator tap | ||||
|     //GPU memory (can be mapped to CPU memory in shared memory devices!) | ||||
| 	cudaMalloc((void **)&d_shifts_chips, sizeof(float)*n_correlators); | ||||
| 	cudaMemset(d_shifts_chips,0,sizeof(float)*n_correlators); | ||||
|  | ||||
| 	//scalars | ||||
| 	cudaMalloc((void **)&d_corr_out, sizeof(std::complex<float>)*n_correlators); | ||||
| 	cudaMemset(d_corr_out,0,sizeof(std::complex<float>)*n_correlators); | ||||
| 	//cudaMalloc((void **)&d_corr_out, sizeof(std::complex<float>)*n_correlators); | ||||
| 	//cudaMemset(d_corr_out,0,sizeof(std::complex<float>)*n_correlators); | ||||
|  | ||||
|     // Launch the Vector Add CUDA Kernel | ||||
| 	threadsPerBlock = 256; | ||||
|     // TODO: write a smart load balance using device info! | ||||
| 	threadsPerBlock = 64; | ||||
|     blocksPerGrid =(int)(signal_length_samples+threadsPerBlock-1)/threadsPerBlock; | ||||
|  | ||||
| 	cudaStreamCreate (&stream1) ; | ||||
| 	cudaStreamCreate (&stream2) ; | ||||
| 	//cudaStreamCreate (&stream2) ; | ||||
| 	return true; | ||||
| } | ||||
|  | ||||
| @@ -518,103 +339,57 @@ bool cuda_multicorrelator::set_local_code_and_taps( | ||||
| 		int n_correlators | ||||
| 		) | ||||
| { | ||||
|     // local code CPU -> GPU copy memory | ||||
| 	//********* ZERO COPY VERSION ************ | ||||
| //	// Get device pointer from host memory. No allocation or memcpy | ||||
| //	cudaError_t code; | ||||
| //	// local code CPU -> GPU copy memory | ||||
| //	code=cudaHostGetDevicePointer((void **)&d_local_codes_in,  (void *) local_codes_in, 0); | ||||
| //	if (code!=cudaSuccess) | ||||
| //	{ | ||||
| //		printf("cuda cudaHostGetDevicePointer error in set_local_code_and_taps \r\n"); | ||||
| //	} | ||||
| //	// Correlator shifts vector CPU -> GPU copy memory (fractional chip shifts are allowed!) | ||||
| //	code=cudaHostGetDevicePointer((void **)&d_shifts_chips,  (void *) shifts_chips, 0); | ||||
| //	if (code!=cudaSuccess) | ||||
| //	{ | ||||
| //		printf("cuda cudaHostGetDevicePointer error in set_local_code_and_taps \r\n"); | ||||
| //	} | ||||
|  | ||||
| 	//******** CudaMalloc version *********** | ||||
|     //local code CPU -> GPU copy memory | ||||
|     cudaMemcpyAsync(d_local_codes_in, local_codes_in, sizeof(GPU_Complex)*code_length_chips, cudaMemcpyHostToDevice,stream1); | ||||
|     d_code_length_chips=(float)code_length_chips; | ||||
|  | ||||
|     // Correlator shifts vector CPU -> GPU copy memory (fractional chip shifts are allowed!) | ||||
|     //Correlator shifts vector CPU -> GPU copy memory (fractional chip shifts are allowed!) | ||||
|     cudaMemcpyAsync(d_shifts_chips, shifts_chips, sizeof(float)*n_correlators, | ||||
|                                     cudaMemcpyHostToDevice,stream1); | ||||
|  | ||||
| 	return true; | ||||
| } | ||||
|  | ||||
|  | ||||
|  | ||||
| bool cuda_multicorrelator::Carrier_wipeoff_multicorrelator_cuda( | ||||
| bool cuda_multicorrelator::set_input_output_vectors( | ||||
| 		std::complex<float>* corr_out, | ||||
| 		const std::complex<float>* sig_in, | ||||
| 		const std::complex<float>* local_codes_in, | ||||
| 		float rem_carrier_phase_in_rad, | ||||
| 		float phase_step_rad, | ||||
| 		const int *shifts_samples, | ||||
| 		int signal_length_samples, | ||||
| 		int n_correlators) | ||||
| 		std::complex<float>* sig_in | ||||
| 		) | ||||
| { | ||||
|  | ||||
| 	// Save CPU pointers | ||||
| 	d_sig_in_cpu =sig_in; | ||||
| 	d_corr_out_cpu = corr_out; | ||||
|  | ||||
| 	// Zero Copy version | ||||
| 	// Get device pointer from host memory. No allocation or memcpy | ||||
| 	cudaError_t code; | ||||
| 	code=cudaHostGetDevicePointer((void **)&d_sig_in,  (void *) sig_in, 0); | ||||
| 	code=cudaHostGetDevicePointer((void **)&d_corr_out,  (void *) corr_out, 0); | ||||
| 	if (code!=cudaSuccess) | ||||
| 	{ | ||||
| 		printf("cuda cudaHostGetDevicePointer error \r\n"); | ||||
| 	} | ||||
| 	return true; | ||||
|  | ||||
| 	size_t memSize = signal_length_samples * sizeof(std::complex<float>); | ||||
|  | ||||
| 	// input signal CPU -> GPU copy memory | ||||
|  | ||||
|     cudaMemcpyAsync(d_sig_in, sig_in, memSize, | ||||
|                                     cudaMemcpyHostToDevice, stream1); | ||||
|  | ||||
|     //***** NOTICE: NCO is computed on-the-fly, no need to copy the NCO into the GPU! **** | ||||
|     // (cudaMemcpyAsync(d_nco_in, nco_in, memSize, | ||||
|     //                                cudaMemcpyHostToDevice, stream1)); | ||||
|  | ||||
|  | ||||
| 	// old version: all local codes are independent vectors | ||||
|     // (cudaMemcpyAsync(d_local_codes_in, local_codes_in, memSize*n_correlators, | ||||
|     //                                cudaMemcpyHostToDevice, stream2)); | ||||
|  | ||||
| 	// new version: only one vector with extra samples to shift the local code for the correlator set | ||||
| 	// Required: The last correlator tap in d_shifts_samples has the largest sample shift | ||||
|  | ||||
|     // local code CPU -> GPU copy memory | ||||
|     cudaMemcpyAsync(d_local_codes_in, local_codes_in, memSize+sizeof(std::complex<float>)*shifts_samples[n_correlators-1], | ||||
|                                     cudaMemcpyHostToDevice, stream2); | ||||
|     // Correlator shifts vector CPU -> GPU copy memory | ||||
|     cudaMemcpyAsync(d_shifts_samples, shifts_samples, sizeof(int)*n_correlators, | ||||
|                                     cudaMemcpyHostToDevice, stream2); | ||||
|  | ||||
|  | ||||
|     //Launch carrier wipe-off kernel here, while local codes are being copied to GPU! | ||||
|     cudaStreamSynchronize(stream1); | ||||
|     CUDA_32fc_Doppler_wipeoff<<<blocksPerGrid, threadsPerBlock,0, stream1>>>(d_sig_doppler_wiped, d_sig_in,rem_carrier_phase_in_rad,phase_step_rad, signal_length_samples); | ||||
|  | ||||
|  | ||||
|     //printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock); | ||||
|  | ||||
|     //wait for Doppler wipeoff end... | ||||
|     cudaStreamSynchronize(stream1); | ||||
|     cudaStreamSynchronize(stream2); | ||||
|     // (cudaDeviceSynchronize()); | ||||
|  | ||||
|     //old | ||||
| //    scalarProdGPUCPXxN<<<blocksPerGrid, threadsPerBlock,0 ,stream2>>>( | ||||
| //    		d_corr_out, | ||||
| //    		d_sig_doppler_wiped, | ||||
| //    		d_local_codes_in, | ||||
| //            3, | ||||
| //            signal_length_samples | ||||
| //        ); | ||||
|  | ||||
|     //new | ||||
|     //launch the multitap correlator | ||||
|     scalarProdGPUCPXxN_shifts<<<blocksPerGrid, threadsPerBlock,0 ,stream2>>>( | ||||
| 			d_corr_out, | ||||
| 			d_sig_doppler_wiped, | ||||
| 			d_local_codes_in, | ||||
| 			d_shifts_samples, | ||||
| 			n_correlators, | ||||
| 			signal_length_samples | ||||
| 		); | ||||
|     cudaGetLastError(); | ||||
|     //wait for correlators end... | ||||
|     cudaStreamSynchronize(stream2); | ||||
|     // Copy the device result vector in device memory to the host result vector | ||||
|     // in host memory. | ||||
|  | ||||
|     //scalar products (correlators outputs) | ||||
|     cudaMemcpy(corr_out, d_corr_out, sizeof(std::complex<float>)*n_correlators, | ||||
|             cudaMemcpyDeviceToHost); | ||||
|     return true; | ||||
| } | ||||
|  | ||||
| bool cuda_multicorrelator::Carrier_wipeoff_multicorrelator_resampler_cuda( | ||||
| 		std::complex<float>* corr_out, | ||||
| 		const std::complex<float>* sig_in, | ||||
| 		float rem_carrier_phase_in_rad, | ||||
| 		float phase_step_rad, | ||||
|         float code_phase_step_chips, | ||||
| @@ -623,26 +398,40 @@ bool cuda_multicorrelator::Carrier_wipeoff_multicorrelator_resampler_cuda( | ||||
| 		int n_correlators) | ||||
| 	{ | ||||
|  | ||||
| 	size_t memSize = signal_length_samples * sizeof(std::complex<float>); | ||||
|  | ||||
| 	// cudaMemCpy version | ||||
| 	//size_t memSize = signal_length_samples * sizeof(std::complex<float>); | ||||
| 	// input signal CPU -> GPU copy memory | ||||
|     cudaMemcpyAsync(d_sig_in, sig_in, memSize, | ||||
|                                     cudaMemcpyHostToDevice, stream2); | ||||
|     //cudaMemcpyAsync(d_sig_in, d_sig_in_cpu, memSize, | ||||
|     //                               cudaMemcpyHostToDevice, stream2); | ||||
|  | ||||
|     //***** NOTICE: NCO is computed on-the-fly, no need to copy the NCO into the GPU! **** | ||||
|  | ||||
|     //Launch carrier wipe-off kernel here, while local codes are being copied to GPU! | ||||
|     cudaStreamSynchronize(stream2); | ||||
|     //cudaStreamSynchronize(stream2); | ||||
|  | ||||
|     CUDA_32fc_Doppler_wipeoff<<<blocksPerGrid, threadsPerBlock,0, stream2>>>(d_sig_doppler_wiped, d_sig_in,rem_carrier_phase_in_rad,phase_step_rad, signal_length_samples); | ||||
|     //CUDA_32fc_Doppler_wipeoff<<<blocksPerGrid, threadsPerBlock,0, stream1>>>(d_sig_doppler_wiped, d_sig_in,rem_carrier_phase_in_rad,phase_step_rad, signal_length_samples); | ||||
|  | ||||
|     //wait for Doppler wipeoff end... | ||||
|     cudaStreamSynchronize(stream1); | ||||
|     cudaStreamSynchronize(stream2); | ||||
|     //cudaStreamSynchronize(stream1); | ||||
|     //cudaStreamSynchronize(stream2); | ||||
|  | ||||
|     //launch the multitap correlator with integrated local code resampler! | ||||
|  | ||||
|     scalarProdGPUCPXxN_shifts_chips<<<blocksPerGrid, threadsPerBlock,0 ,stream1>>>( | ||||
| //    scalarProdGPUCPXxN_shifts_chips<<<blocksPerGrid, threadsPerBlock,0 ,stream1>>>( | ||||
| //			d_corr_out, | ||||
| //			d_sig_doppler_wiped, | ||||
| //			d_local_codes_in, | ||||
| //			d_shifts_chips, | ||||
| //			d_code_length_chips, | ||||
| //	        code_phase_step_chips, | ||||
| //	        rem_code_phase_chips, | ||||
| //			n_correlators, | ||||
| //			signal_length_samples | ||||
| //		); | ||||
|  | ||||
|     Doppler_wippe_scalarProdGPUCPXxN_shifts_chips<<<blocksPerGrid, threadsPerBlock,0 ,stream1>>>( | ||||
| 			d_corr_out, | ||||
| 			d_sig_in, | ||||
| 			d_sig_doppler_wiped, | ||||
| 			d_local_codes_in, | ||||
| 			d_shifts_chips, | ||||
| @@ -650,23 +439,33 @@ bool cuda_multicorrelator::Carrier_wipeoff_multicorrelator_resampler_cuda( | ||||
| 	        code_phase_step_chips, | ||||
| 	        rem_code_phase_chips, | ||||
| 			n_correlators, | ||||
| 			signal_length_samples | ||||
| 		); | ||||
| 			signal_length_samples, | ||||
| 			rem_carrier_phase_in_rad, | ||||
| 			phase_step_rad | ||||
| 			); | ||||
|  | ||||
|     cudaGetLastError(); | ||||
|     //debug | ||||
| //	std::complex<float>* debug_signal; | ||||
| //	debug_signal=static_cast<std::complex<float>*>(malloc(memSize)); | ||||
| //    cudaMemcpyAsync(debug_signal, d_sig_doppler_wiped, memSize, | ||||
| //            cudaMemcpyDeviceToHost,stream1); | ||||
| //    cudaStreamSynchronize(stream1); | ||||
| //	std::cout<<"d_sig_doppler_wiped GPU="<<debug_signal[456]<<","<<debug_signal[1]<<","<<debug_signal[2]<<","<<debug_signal[3]<<std::endl; | ||||
|  | ||||
|     //cudaGetLastError(); | ||||
|     //wait for correlators end... | ||||
|     cudaStreamSynchronize(stream1); | ||||
|     //cudaStreamSynchronize(stream1); | ||||
|     // Copy the device result vector in device memory to the host result vector | ||||
|     // in host memory. | ||||
|  | ||||
|     //scalar products (correlators outputs) | ||||
|     cudaMemcpyAsync(corr_out, d_corr_out, sizeof(std::complex<float>)*n_correlators, | ||||
|             cudaMemcpyDeviceToHost,stream1); | ||||
|     //cudaMemcpyAsync(corr_out, d_corr_out, sizeof(std::complex<float>)*n_correlators, | ||||
|     //        cudaMemcpyDeviceToHost,stream1); | ||||
|  | ||||
|     cudaStreamSynchronize(stream1); | ||||
|     return true; | ||||
| } | ||||
|  | ||||
|  | ||||
| cuda_multicorrelator::cuda_multicorrelator() | ||||
| { | ||||
| 	d_sig_in=NULL; | ||||
| @@ -689,22 +488,16 @@ bool cuda_multicorrelator::free_cuda() | ||||
| 	if (d_sig_doppler_wiped!=NULL) cudaFree(d_sig_doppler_wiped); | ||||
| 	if (d_local_codes_in!=NULL) cudaFree(d_local_codes_in); | ||||
| 	if (d_corr_out!=NULL) cudaFree(d_corr_out); | ||||
|  | ||||
|  | ||||
| 	if (d_shifts_samples!=NULL) cudaFree(d_shifts_samples); | ||||
| 	if (d_shifts_chips!=NULL) cudaFree(d_shifts_chips); | ||||
|  | ||||
|  | ||||
| 	cudaStreamDestroy(stream1) ; | ||||
| 	cudaStreamDestroy(stream2) ; | ||||
|  | ||||
|     // Reset the device and exit | ||||
|     // cudaDeviceReset causes the driver to clean up all state. While | ||||
|     // not mandatory in normal operation, it is good practice.  It is also | ||||
|     // needed to ensure correct operation when the application is being | ||||
|     // profiled. Calling cudaDeviceReset causes all profile data to be | ||||
|     // flushed before the application exits | ||||
| 	// (cudaDeviceReset()); | ||||
| 	cudaDeviceReset(); | ||||
| 	return true; | ||||
| } | ||||
|  | ||||
|   | ||||
| @@ -114,9 +114,7 @@ class cuda_multicorrelator | ||||
| { | ||||
| public: | ||||
|     cuda_multicorrelator(); | ||||
|     bool init_cuda(const int argc, const char **argv, int signal_length_samples, int local_codes_length_samples, int n_correlators); | ||||
|     bool init_cuda_integrated_resampler( | ||||
|             const int argc, const char **argv, | ||||
|             int signal_length_samples, | ||||
|             int code_length_chips, | ||||
|             int n_correlators | ||||
| @@ -127,19 +125,12 @@ public: | ||||
|             float *shifts_chips, | ||||
|             int n_correlators | ||||
|     ); | ||||
|     bool set_input_output_vectors( | ||||
|     		std::complex<float>* corr_out, | ||||
|     		std::complex<float>* sig_in | ||||
|     		); | ||||
|     bool free_cuda(); | ||||
|     bool Carrier_wipeoff_multicorrelator_cuda( | ||||
|             std::complex<float>* corr_out, | ||||
|             const std::complex<float>* sig_in, | ||||
|             const std::complex<float>* local_codes_in, | ||||
|             float rem_carrier_phase_in_rad, | ||||
|             float phase_step_rad, | ||||
|             const int *shifts_samples, | ||||
|             int signal_length_samples, | ||||
|             int n_correlators); | ||||
|     bool Carrier_wipeoff_multicorrelator_resampler_cuda( | ||||
|             std::complex<float>* corr_out, | ||||
|             const std::complex<float>* sig_in, | ||||
|             float rem_carrier_phase_in_rad, | ||||
|             float phase_step_rad, | ||||
|             float code_phase_step_chips, | ||||
| @@ -154,6 +145,11 @@ private: | ||||
|     GPU_Complex *d_sig_doppler_wiped; | ||||
|     GPU_Complex *d_local_codes_in; | ||||
|     GPU_Complex *d_corr_out; | ||||
|  | ||||
|     // | ||||
|     std::complex<float> *d_sig_in_cpu; | ||||
|     std::complex<float> *d_corr_out_cpu; | ||||
|  | ||||
|     int *d_shifts_samples; | ||||
|     float *d_shifts_chips; | ||||
|     float d_code_length_chips; | ||||
| @@ -162,7 +158,7 @@ private: | ||||
|     int blocksPerGrid; | ||||
|  | ||||
|     cudaStream_t stream1; | ||||
|     cudaStream_t stream2; | ||||
|     //cudaStream_t stream2; | ||||
|     int num_gpu_devices; | ||||
|     int selected_device; | ||||
| }; | ||||
|   | ||||