| 
							
							
							
						 |  |  | @@ -1,11 +1,11 @@ | 
		
	
		
			
				|  |  |  |  | /*! | 
		
	
		
			
				|  |  |  |  |  * \file cuda_multicorrelator.cu | 
		
	
		
			
				|  |  |  |  |  * \brief High optimized CUDA GPU vector multiTAP correlator class | 
		
	
		
			
				|  |  |  |  |  * \brief Highly optimized CUDA GPU vector multiTAP correlator class | 
		
	
		
			
				|  |  |  |  |  * \authors <ul> | 
		
	
		
			
				|  |  |  |  |  *          <li> Javier Arribas, 2015. jarribas(at)cttc.es | 
		
	
		
			
				|  |  |  |  |  *          </ul> | 
		
	
		
			
				|  |  |  |  |  * | 
		
	
		
			
				|  |  |  |  |  * Class that implements a high optimized vector multiTAP correlator class for NVIDIA CUDA GPUs | 
		
	
		
			
				|  |  |  |  |  * Class that implements a highly optimized vector multiTAP correlator class for NVIDIA CUDA GPUs | 
		
	
		
			
				|  |  |  |  |  * | 
		
	
		
			
				|  |  |  |  |  * ------------------------------------------------------------------------- | 
		
	
		
			
				|  |  |  |  |  * | 
		
	
	
		
			
				
					
					|  |  |  | @@ -33,9 +33,8 @@ | 
		
	
		
			
				|  |  |  |  |  */ | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  | #include "cuda_multicorrelator.h" | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  | #include <stdio.h> | 
		
	
		
			
				|  |  |  |  | #include <iostream> | 
		
	
		
			
				|  |  |  |  | #include <stdio.h> | 
		
	
		
			
				|  |  |  |  | // For the CUDA runtime routines (prefixed with "cuda_") | 
		
	
		
			
				|  |  |  |  | #include <cuda_runtime.h> | 
		
	
		
			
				|  |  |  |  |  | 
		
	
	
		
			
				
					
					|  |  |  | @@ -53,22 +52,21 @@ __global__ void Doppler_wippe_scalarProdGPUCPXxN_shifts_chips( | 
		
	
		
			
				|  |  |  |  |     int vectorN, | 
		
	
		
			
				|  |  |  |  |     int elementN, | 
		
	
		
			
				|  |  |  |  |     float rem_carrier_phase_in_rad, | 
		
	
		
			
				|  |  |  |  |     float phase_step_rad | 
		
	
		
			
				|  |  |  |  | ) | 
		
	
		
			
				|  |  |  |  |     float phase_step_rad) | 
		
	
		
			
				|  |  |  |  | { | 
		
	
		
			
				|  |  |  |  |     //Accumulators cache | 
		
	
		
			
				|  |  |  |  |     __shared__ GPU_Complex accumResult[ACCUM_N]; | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  | 	// CUDA version of floating point NCO and vector dot product integrated | 
		
	
		
			
				|  |  |  |  |     // CUDA version of floating point NCO and vector dot product integrated | 
		
	
		
			
				|  |  |  |  |     float sin; | 
		
	
		
			
				|  |  |  |  |     float cos; | 
		
	
		
			
				|  |  |  |  |     for (int i = blockIdx.x * blockDim.x + threadIdx.x; | 
		
	
		
			
				|  |  |  |  |          i < elementN; | 
		
	
		
			
				|  |  |  |  |          i += blockDim.x * gridDim.x) | 
		
	
		
			
				|  |  |  |  |     { | 
		
	
		
			
				|  |  |  |  |     	__sincosf(rem_carrier_phase_in_rad + i*phase_step_rad, &sin, &cos); | 
		
	
		
			
				|  |  |  |  |     	d_sig_wiped[i] =  d_sig_in[i] * GPU_Complex(cos,-sin); | 
		
	
		
			
				|  |  |  |  |     } | 
		
	
		
			
				|  |  |  |  |         { | 
		
	
		
			
				|  |  |  |  |             __sincosf(rem_carrier_phase_in_rad + i * phase_step_rad, &sin, &cos); | 
		
	
		
			
				|  |  |  |  |             d_sig_wiped[i] = d_sig_in[i] * GPU_Complex(cos, -sin); | 
		
	
		
			
				|  |  |  |  |         } | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  |     __syncthreads(); | 
		
	
		
			
				|  |  |  |  |     //////////////////////////////////////////////////////////////////////////// | 
		
	
	
		
			
				
					
					|  |  |  | @@ -77,273 +75,279 @@ __global__ void Doppler_wippe_scalarProdGPUCPXxN_shifts_chips( | 
		
	
		
			
				|  |  |  |  |     // from total number of thread blocks | 
		
	
		
			
				|  |  |  |  |     //////////////////////////////////////////////////////////////////////////// | 
		
	
		
			
				|  |  |  |  |     for (int vec = blockIdx.x; vec < vectorN; vec += gridDim.x) | 
		
	
		
			
				|  |  |  |  |     { | 
		
	
		
			
				|  |  |  |  |         //int vectorBase = IMUL(elementN, vec); | 
		
	
		
			
				|  |  |  |  |         //int vectorEnd  = elementN; | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  |         //////////////////////////////////////////////////////////////////////// | 
		
	
		
			
				|  |  |  |  |         // Each accumulator cycles through vectors with | 
		
	
		
			
				|  |  |  |  |         // stride equal to number of total number of accumulators ACCUM_N | 
		
	
		
			
				|  |  |  |  |         // At this stage ACCUM_N is only preferred be a multiple of warp size | 
		
	
		
			
				|  |  |  |  |         // to meet memory coalescing alignment constraints. | 
		
	
		
			
				|  |  |  |  |         //////////////////////////////////////////////////////////////////////// | 
		
	
		
			
				|  |  |  |  |         for (int iAccum = threadIdx.x; iAccum < ACCUM_N; iAccum += blockDim.x) | 
		
	
		
			
				|  |  |  |  |         { | 
		
	
		
			
				|  |  |  |  |         	GPU_Complex sum = GPU_Complex(0,0); | 
		
	
		
			
				|  |  |  |  |             float local_code_chip_index=0.0;; | 
		
	
		
			
				|  |  |  |  |             //float code_phase; | 
		
	
		
			
				|  |  |  |  |             for (int pos = iAccum; pos < elementN; pos += ACCUM_N) | 
		
	
		
			
				|  |  |  |  |             { | 
		
	
		
			
				|  |  |  |  |             	//original sample code | 
		
	
		
			
				|  |  |  |  |                 //sum = sum + d_sig_in[pos-vectorBase] * d_nco_in[pos-vectorBase] * d_local_codes_in[pos]; | 
		
	
		
			
				|  |  |  |  |             	//sum = sum + d_sig_in[pos-vectorBase] * d_local_codes_in[pos]; | 
		
	
		
			
				|  |  |  |  |             	//sum.multiply_acc(d_sig_in[pos],d_local_codes_in[pos+d_shifts_samples[vec]]); | 
		
	
		
			
				|  |  |  |  |             //int vectorBase = IMUL(elementN, vec); | 
		
	
		
			
				|  |  |  |  |             //int vectorEnd  = elementN; | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  |             	//custom code for multitap correlator | 
		
	
		
			
				|  |  |  |  |             	// 1.resample local code for the current shift | 
		
	
		
			
				|  |  |  |  |             //////////////////////////////////////////////////////////////////////// | 
		
	
		
			
				|  |  |  |  |             // Each accumulator cycles through vectors with | 
		
	
		
			
				|  |  |  |  |             // stride equal to number of total number of accumulators ACCUM_N | 
		
	
		
			
				|  |  |  |  |             // At this stage ACCUM_N is only preferred be a multiple of warp size | 
		
	
		
			
				|  |  |  |  |             // to meet memory coalescing alignment constraints. | 
		
	
		
			
				|  |  |  |  |             //////////////////////////////////////////////////////////////////////// | 
		
	
		
			
				|  |  |  |  |             for (int iAccum = threadIdx.x; iAccum < ACCUM_N; iAccum += blockDim.x) | 
		
	
		
			
				|  |  |  |  |                 { | 
		
	
		
			
				|  |  |  |  |                     GPU_Complex sum = GPU_Complex(0, 0); | 
		
	
		
			
				|  |  |  |  |                     float local_code_chip_index = 0.0; | 
		
	
		
			
				|  |  |  |  |                     ; | 
		
	
		
			
				|  |  |  |  |                     //float code_phase; | 
		
	
		
			
				|  |  |  |  |                     for (int pos = iAccum; pos < elementN; pos += ACCUM_N) | 
		
	
		
			
				|  |  |  |  |                         { | 
		
	
		
			
				|  |  |  |  |                             //original sample code | 
		
	
		
			
				|  |  |  |  |                             //sum = sum + d_sig_in[pos-vectorBase] * d_nco_in[pos-vectorBase] * d_local_codes_in[pos]; | 
		
	
		
			
				|  |  |  |  |                             //sum = sum + d_sig_in[pos-vectorBase] * d_local_codes_in[pos]; | 
		
	
		
			
				|  |  |  |  |                             //sum.multiply_acc(d_sig_in[pos],d_local_codes_in[pos+d_shifts_samples[vec]]); | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  |             	local_code_chip_index= fmodf(code_phase_step_chips*__int2float_rd(pos)+ d_shifts_chips[vec] - rem_code_phase_chips, code_length_chips); | 
		
	
		
			
				|  |  |  |  |                             //custom code for multitap correlator | 
		
	
		
			
				|  |  |  |  |                             // 1.resample local code for the current shift | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  |             	//Take into account that in multitap correlators, the shifts can be negative! | 
		
	
		
			
				|  |  |  |  |             	if (local_code_chip_index<0.0) local_code_chip_index+=(code_length_chips-1); | 
		
	
		
			
				|  |  |  |  |             	//printf("vec= %i, pos %i, chip_idx=%i chip_shift=%f \r\n",vec, pos,__float2int_rd(local_code_chip_index),local_code_chip_index); | 
		
	
		
			
				|  |  |  |  |             	// 2.correlate | 
		
	
		
			
				|  |  |  |  |             	sum.multiply_acc(d_sig_wiped[pos],d_local_code_in[__float2int_rd(local_code_chip_index)]); | 
		
	
		
			
				|  |  |  |  |                             local_code_chip_index = fmodf(code_phase_step_chips * __int2float_rd(pos) + d_shifts_chips[vec] - rem_code_phase_chips, code_length_chips); | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  |             } | 
		
	
		
			
				|  |  |  |  |             accumResult[iAccum] = sum; | 
		
	
		
			
				|  |  |  |  |                             //Take into account that in multitap correlators, the shifts can be negative! | 
		
	
		
			
				|  |  |  |  |                             if (local_code_chip_index < 0.0) local_code_chip_index += (code_length_chips - 1); | 
		
	
		
			
				|  |  |  |  |                             //printf("vec= %i, pos %i, chip_idx=%i chip_shift=%f \r\n",vec, pos,__float2int_rd(local_code_chip_index),local_code_chip_index); | 
		
	
		
			
				|  |  |  |  |                             // 2.correlate | 
		
	
		
			
				|  |  |  |  |                             sum.multiply_acc(d_sig_wiped[pos], d_local_code_in[__float2int_rd(local_code_chip_index)]); | 
		
	
		
			
				|  |  |  |  |                         } | 
		
	
		
			
				|  |  |  |  |                     accumResult[iAccum] = sum; | 
		
	
		
			
				|  |  |  |  |                 } | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  |             //////////////////////////////////////////////////////////////////////// | 
		
	
		
			
				|  |  |  |  |             // Perform tree-like reduction of accumulators' results. | 
		
	
		
			
				|  |  |  |  |             // ACCUM_N has to be power of two at this stage | 
		
	
		
			
				|  |  |  |  |             //////////////////////////////////////////////////////////////////////// | 
		
	
		
			
				|  |  |  |  |             for (int stride = ACCUM_N / 2; stride > 0; stride >>= 1) | 
		
	
		
			
				|  |  |  |  |                 { | 
		
	
		
			
				|  |  |  |  |                     __syncthreads(); | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  |                     for (int iAccum = threadIdx.x; iAccum < stride; iAccum += blockDim.x) | 
		
	
		
			
				|  |  |  |  |                         { | 
		
	
		
			
				|  |  |  |  |                             accumResult[iAccum] += accumResult[stride + iAccum]; | 
		
	
		
			
				|  |  |  |  |                         } | 
		
	
		
			
				|  |  |  |  |                 } | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  |             if (threadIdx.x == 0) | 
		
	
		
			
				|  |  |  |  |                 { | 
		
	
		
			
				|  |  |  |  |                     d_corr_out[vec] = accumResult[0]; | 
		
	
		
			
				|  |  |  |  |                 } | 
		
	
		
			
				|  |  |  |  |         } | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  |         //////////////////////////////////////////////////////////////////////// | 
		
	
		
			
				|  |  |  |  |         // Perform tree-like reduction of accumulators' results. | 
		
	
		
			
				|  |  |  |  |         // ACCUM_N has to be power of two at this stage | 
		
	
		
			
				|  |  |  |  |         //////////////////////////////////////////////////////////////////////// | 
		
	
		
			
				|  |  |  |  |         for (int stride = ACCUM_N / 2; stride > 0; stride >>= 1) | 
		
	
		
			
				|  |  |  |  |         { | 
		
	
		
			
				|  |  |  |  |             __syncthreads(); | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  |             for (int iAccum = threadIdx.x; iAccum < stride; iAccum += blockDim.x) | 
		
	
		
			
				|  |  |  |  |             { | 
		
	
		
			
				|  |  |  |  |                 accumResult[iAccum] += accumResult[stride + iAccum]; | 
		
	
		
			
				|  |  |  |  |             } | 
		
	
		
			
				|  |  |  |  |         } | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  |         if (threadIdx.x == 0) | 
		
	
		
			
				|  |  |  |  |         	{ | 
		
	
		
			
				|  |  |  |  |         		d_corr_out[vec] = accumResult[0]; | 
		
	
		
			
				|  |  |  |  |         	} | 
		
	
		
			
				|  |  |  |  |     } | 
		
	
		
			
				|  |  |  |  | } | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  | bool cuda_multicorrelator::init_cuda_integrated_resampler( | 
		
	
		
			
				|  |  |  |  | 		int signal_length_samples, | 
		
	
		
			
				|  |  |  |  | 		int code_length_chips, | 
		
	
		
			
				|  |  |  |  | 		int n_correlators | 
		
	
		
			
				|  |  |  |  | 		) | 
		
	
		
			
				|  |  |  |  |     int signal_length_samples, | 
		
	
		
			
				|  |  |  |  |     int code_length_chips, | 
		
	
		
			
				|  |  |  |  |     int n_correlators) | 
		
	
		
			
				|  |  |  |  | { | 
		
	
		
			
				|  |  |  |  | 	// use command-line specified CUDA device, otherwise use device with highest Gflops/s | 
		
	
		
			
				|  |  |  |  | //	findCudaDevice(argc, (const char **)argv); | 
		
	
		
			
				|  |  |  |  |       cudaDeviceProp  prop; | 
		
	
		
			
				|  |  |  |  |     // use command-line specified CUDA device, otherwise use device with highest Gflops/s | 
		
	
		
			
				|  |  |  |  |     //	findCudaDevice(argc, (const char **)argv); | 
		
	
		
			
				|  |  |  |  |     cudaDeviceProp prop; | 
		
	
		
			
				|  |  |  |  |     int num_devices, device; | 
		
	
		
			
				|  |  |  |  |     cudaGetDeviceCount(&num_devices); | 
		
	
		
			
				|  |  |  |  |     if (num_devices > 1) { | 
		
	
		
			
				|  |  |  |  |           int max_multiprocessors = 0, max_device = 0; | 
		
	
		
			
				|  |  |  |  |           for (device = 0; device < num_devices; device++) { | 
		
	
		
			
				|  |  |  |  |                   cudaDeviceProp properties; | 
		
	
		
			
				|  |  |  |  |                   cudaGetDeviceProperties(&properties, device); | 
		
	
		
			
				|  |  |  |  |                   if (max_multiprocessors < properties.multiProcessorCount) { | 
		
	
		
			
				|  |  |  |  |                           max_multiprocessors = properties.multiProcessorCount; | 
		
	
		
			
				|  |  |  |  |                           max_device = device; | 
		
	
		
			
				|  |  |  |  |                   } | 
		
	
		
			
				|  |  |  |  |                   printf("Found GPU device # %i\n",device); | 
		
	
		
			
				|  |  |  |  |           } | 
		
	
		
			
				|  |  |  |  |           //cudaSetDevice(max_device); | 
		
	
		
			
				|  |  |  |  |     if (num_devices > 1) | 
		
	
		
			
				|  |  |  |  |         { | 
		
	
		
			
				|  |  |  |  |             int max_multiprocessors = 0, max_device = 0; | 
		
	
		
			
				|  |  |  |  |             for (device = 0; device < num_devices; device++) | 
		
	
		
			
				|  |  |  |  |                 { | 
		
	
		
			
				|  |  |  |  |                     cudaDeviceProp properties; | 
		
	
		
			
				|  |  |  |  |                     cudaGetDeviceProperties(&properties, device); | 
		
	
		
			
				|  |  |  |  |                     if (max_multiprocessors < properties.multiProcessorCount) | 
		
	
		
			
				|  |  |  |  |                         { | 
		
	
		
			
				|  |  |  |  |                             max_multiprocessors = properties.multiProcessorCount; | 
		
	
		
			
				|  |  |  |  |                             max_device = device; | 
		
	
		
			
				|  |  |  |  |                         } | 
		
	
		
			
				|  |  |  |  |                     printf("Found GPU device # %i\n", device); | 
		
	
		
			
				|  |  |  |  |                 } | 
		
	
		
			
				|  |  |  |  |             //cudaSetDevice(max_device); | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  |           //set random device! | 
		
	
		
			
				|  |  |  |  | 	  selected_gps_device=rand() % num_devices;//generates a random number between 0 and num_devices to split the threads between GPUs | 
		
	
		
			
				|  |  |  |  |           cudaSetDevice(selected_gps_device);  | 
		
	
		
			
				|  |  |  |  |             //set random device! | 
		
	
		
			
				|  |  |  |  |             selected_gps_device = rand() % num_devices;  //generates a random number between 0 and num_devices to split the threads between GPUs | 
		
	
		
			
				|  |  |  |  |             cudaSetDevice(selected_gps_device); | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  |           cudaGetDeviceProperties( &prop, max_device ); | 
		
	
		
			
				|  |  |  |  |           //debug code | 
		
	
		
			
				|  |  |  |  |           if (prop.canMapHostMemory != 1) { | 
		
	
		
			
				|  |  |  |  |               printf( "Device can not map memory.\n" ); | 
		
	
		
			
				|  |  |  |  |           } | 
		
	
		
			
				|  |  |  |  |           printf("L2 Cache size= %u \n",prop.l2CacheSize); | 
		
	
		
			
				|  |  |  |  |           printf("maxThreadsPerBlock= %u \n",prop.maxThreadsPerBlock); | 
		
	
		
			
				|  |  |  |  |           printf("maxGridSize= %i \n",prop.maxGridSize[0]); | 
		
	
		
			
				|  |  |  |  |           printf("sharedMemPerBlock= %lu \n",prop.sharedMemPerBlock); | 
		
	
		
			
				|  |  |  |  |           printf("deviceOverlap= %i \n",prop.deviceOverlap); | 
		
	
		
			
				|  |  |  |  |   	    printf("multiProcessorCount= %i \n",prop.multiProcessorCount); | 
		
	
		
			
				|  |  |  |  |     }else{ | 
		
	
		
			
				|  |  |  |  |     	    cudaGetDevice( &selected_gps_device); | 
		
	
		
			
				|  |  |  |  |     	    cudaGetDeviceProperties( &prop, selected_gps_device ); | 
		
	
		
			
				|  |  |  |  |     	    //debug code | 
		
	
		
			
				|  |  |  |  |     	    if (prop.canMapHostMemory != 1) { | 
		
	
		
			
				|  |  |  |  |     	        printf( "Device can not map memory.\n" ); | 
		
	
		
			
				|  |  |  |  |     	    } | 
		
	
		
			
				|  |  |  |  |             cudaGetDeviceProperties(&prop, max_device); | 
		
	
		
			
				|  |  |  |  |             //debug code | 
		
	
		
			
				|  |  |  |  |             if (prop.canMapHostMemory != 1) | 
		
	
		
			
				|  |  |  |  |                 { | 
		
	
		
			
				|  |  |  |  |                     printf("Device can not map memory.\n"); | 
		
	
		
			
				|  |  |  |  |                 } | 
		
	
		
			
				|  |  |  |  |             printf("L2 Cache size= %u \n", prop.l2CacheSize); | 
		
	
		
			
				|  |  |  |  |             printf("maxThreadsPerBlock= %u \n", prop.maxThreadsPerBlock); | 
		
	
		
			
				|  |  |  |  |             printf("maxGridSize= %i \n", prop.maxGridSize[0]); | 
		
	
		
			
				|  |  |  |  |             printf("sharedMemPerBlock= %lu \n", prop.sharedMemPerBlock); | 
		
	
		
			
				|  |  |  |  |             printf("deviceOverlap= %i \n", prop.deviceOverlap); | 
		
	
		
			
				|  |  |  |  |             printf("multiProcessorCount= %i \n", prop.multiProcessorCount); | 
		
	
		
			
				|  |  |  |  |         } | 
		
	
		
			
				|  |  |  |  |     else | 
		
	
		
			
				|  |  |  |  |         { | 
		
	
		
			
				|  |  |  |  |             cudaGetDevice(&selected_gps_device); | 
		
	
		
			
				|  |  |  |  |             cudaGetDeviceProperties(&prop, selected_gps_device); | 
		
	
		
			
				|  |  |  |  |             //debug code | 
		
	
		
			
				|  |  |  |  |             if (prop.canMapHostMemory != 1) | 
		
	
		
			
				|  |  |  |  |                 { | 
		
	
		
			
				|  |  |  |  |                     printf("Device can not map memory.\n"); | 
		
	
		
			
				|  |  |  |  |                 } | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  |     	    printf("L2 Cache size= %u \n",prop.l2CacheSize); | 
		
	
		
			
				|  |  |  |  |     	    printf("maxThreadsPerBlock= %u \n",prop.maxThreadsPerBlock); | 
		
	
		
			
				|  |  |  |  |     	    printf("maxGridSize= %i \n",prop.maxGridSize[0]); | 
		
	
		
			
				|  |  |  |  |     	    printf("sharedMemPerBlock= %lu \n",prop.sharedMemPerBlock); | 
		
	
		
			
				|  |  |  |  |     	    printf("deviceOverlap= %i \n",prop.deviceOverlap); | 
		
	
		
			
				|  |  |  |  |     	    printf("multiProcessorCount= %i \n",prop.multiProcessorCount); | 
		
	
		
			
				|  |  |  |  |     } | 
		
	
		
			
				|  |  |  |  |             printf("L2 Cache size= %u \n", prop.l2CacheSize); | 
		
	
		
			
				|  |  |  |  |             printf("maxThreadsPerBlock= %u \n", prop.maxThreadsPerBlock); | 
		
	
		
			
				|  |  |  |  |             printf("maxGridSize= %i \n", prop.maxGridSize[0]); | 
		
	
		
			
				|  |  |  |  |             printf("sharedMemPerBlock= %lu \n", prop.sharedMemPerBlock); | 
		
	
		
			
				|  |  |  |  |             printf("deviceOverlap= %i \n", prop.deviceOverlap); | 
		
	
		
			
				|  |  |  |  |             printf("multiProcessorCount= %i \n", prop.multiProcessorCount); | 
		
	
		
			
				|  |  |  |  |         } | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  | 	// (cudaFuncSetCacheConfig(CUDA_32fc_x2_multiply_x2_dot_prod_32fc_, cudaFuncCachePreferShared)); | 
		
	
		
			
				|  |  |  |  |     // (cudaFuncSetCacheConfig(CUDA_32fc_x2_multiply_x2_dot_prod_32fc_, cudaFuncCachePreferShared)); | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  |     // ALLOCATE GPU MEMORY FOR INPUT/OUTPUT and INTERNAL vectors | 
		
	
		
			
				|  |  |  |  |     size_t size = signal_length_samples * sizeof(GPU_Complex); | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  | 	//********* ZERO COPY VERSION ************ | 
		
	
		
			
				|  |  |  |  | 	// Set flag to enable zero copy access | 
		
	
		
			
				|  |  |  |  |     //********* ZERO COPY VERSION ************ | 
		
	
		
			
				|  |  |  |  |     // Set flag to enable zero copy access | 
		
	
		
			
				|  |  |  |  |     // Optimal in shared memory devices (like Jetson K1) | 
		
	
		
			
				|  |  |  |  | 	//cudaSetDeviceFlags(cudaDeviceMapHost); | 
		
	
		
			
				|  |  |  |  |     //cudaSetDeviceFlags(cudaDeviceMapHost); | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  | 	//******** CudaMalloc version *********** | 
		
	
		
			
				|  |  |  |  |     //******** CudaMalloc version *********** | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  | 	// input signal GPU memory (can be mapped to CPU memory in shared memory devices!) | 
		
	
		
			
				|  |  |  |  | 	//	cudaMalloc((void **)&d_sig_in, size); | 
		
	
		
			
				|  |  |  |  | 	//	cudaMemset(d_sig_in,0,size); | 
		
	
		
			
				|  |  |  |  |     // input signal GPU memory (can be mapped to CPU memory in shared memory devices!) | 
		
	
		
			
				|  |  |  |  |     //	cudaMalloc((void **)&d_sig_in, size); | 
		
	
		
			
				|  |  |  |  |     //	cudaMemset(d_sig_in,0,size); | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  | 	// Doppler-free signal (internal GPU memory) | 
		
	
		
			
				|  |  |  |  | 	cudaMalloc((void **)&d_sig_doppler_wiped, size); | 
		
	
		
			
				|  |  |  |  | 	cudaMemset(d_sig_doppler_wiped,0,size); | 
		
	
		
			
				|  |  |  |  |     // Doppler-free signal (internal GPU memory) | 
		
	
		
			
				|  |  |  |  |     cudaMalloc((void **)&d_sig_doppler_wiped, size); | 
		
	
		
			
				|  |  |  |  |     cudaMemset(d_sig_doppler_wiped, 0, size); | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  | 	// Local code GPU memory (can be mapped to CPU memory in shared memory devices!) | 
		
	
		
			
				|  |  |  |  | 	cudaMalloc((void **)&d_local_codes_in, sizeof(std::complex<float>)*code_length_chips); | 
		
	
		
			
				|  |  |  |  | 	cudaMemset(d_local_codes_in,0,sizeof(std::complex<float>)*code_length_chips); | 
		
	
		
			
				|  |  |  |  |     // Local code GPU memory (can be mapped to CPU memory in shared memory devices!) | 
		
	
		
			
				|  |  |  |  |     cudaMalloc((void **)&d_local_codes_in, sizeof(std::complex<float>) * code_length_chips); | 
		
	
		
			
				|  |  |  |  |     cudaMemset(d_local_codes_in, 0, sizeof(std::complex<float>) * code_length_chips); | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  |     d_code_length_chips=code_length_chips; | 
		
	
		
			
				|  |  |  |  |     d_code_length_chips = code_length_chips; | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  | 	// Vector with the chip shifts for each correlator tap | 
		
	
		
			
				|  |  |  |  |     // Vector with the chip shifts for each correlator tap | 
		
	
		
			
				|  |  |  |  |     //GPU memory (can be mapped to CPU memory in shared memory devices!) | 
		
	
		
			
				|  |  |  |  | 	cudaMalloc((void **)&d_shifts_chips, sizeof(float)*n_correlators); | 
		
	
		
			
				|  |  |  |  | 	cudaMemset(d_shifts_chips,0,sizeof(float)*n_correlators); | 
		
	
		
			
				|  |  |  |  |     cudaMalloc((void **)&d_shifts_chips, sizeof(float) * n_correlators); | 
		
	
		
			
				|  |  |  |  |     cudaMemset(d_shifts_chips, 0, sizeof(float) * n_correlators); | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  | 	//scalars | 
		
	
		
			
				|  |  |  |  | 	//cudaMalloc((void **)&d_corr_out, sizeof(std::complex<float>)*n_correlators); | 
		
	
		
			
				|  |  |  |  | 	//cudaMemset(d_corr_out,0,sizeof(std::complex<float>)*n_correlators); | 
		
	
		
			
				|  |  |  |  |     //scalars | 
		
	
		
			
				|  |  |  |  |     //cudaMalloc((void **)&d_corr_out, sizeof(std::complex<float>)*n_correlators); | 
		
	
		
			
				|  |  |  |  |     //cudaMemset(d_corr_out,0,sizeof(std::complex<float>)*n_correlators); | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  |     // Launch the Vector Add CUDA Kernel | 
		
	
		
			
				|  |  |  |  |     // TODO: write a smart load balance using device info! | 
		
	
		
			
				|  |  |  |  | 	threadsPerBlock = 64; | 
		
	
		
			
				|  |  |  |  |     blocksPerGrid = 128;//(int)(signal_length_samples+threadsPerBlock-1)/threadsPerBlock; | 
		
	
		
			
				|  |  |  |  |     threadsPerBlock = 64; | 
		
	
		
			
				|  |  |  |  |     blocksPerGrid = 128;  //(int)(signal_length_samples+threadsPerBlock-1)/threadsPerBlock; | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  | 	cudaStreamCreate (&stream1) ; | 
		
	
		
			
				|  |  |  |  | 	//cudaStreamCreate (&stream2) ; | 
		
	
		
			
				|  |  |  |  | 	return true; | 
		
	
		
			
				|  |  |  |  |     cudaStreamCreate(&stream1); | 
		
	
		
			
				|  |  |  |  |     //cudaStreamCreate (&stream2) ; | 
		
	
		
			
				|  |  |  |  |     return true; | 
		
	
		
			
				|  |  |  |  | } | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  | bool cuda_multicorrelator::set_local_code_and_taps( | 
		
	
		
			
				|  |  |  |  | 		int code_length_chips, | 
		
	
		
			
				|  |  |  |  | 		const std::complex<float>* local_codes_in, | 
		
	
		
			
				|  |  |  |  | 		float *shifts_chips, | 
		
	
		
			
				|  |  |  |  | 		int n_correlators | 
		
	
		
			
				|  |  |  |  | 		) | 
		
	
		
			
				|  |  |  |  |     int code_length_chips, | 
		
	
		
			
				|  |  |  |  |     const std::complex<float> *local_codes_in, | 
		
	
		
			
				|  |  |  |  |     float *shifts_chips, | 
		
	
		
			
				|  |  |  |  |     int n_correlators) | 
		
	
		
			
				|  |  |  |  | { | 
		
	
		
			
				|  |  |  |  |     cudaSetDevice(selected_gps_device); | 
		
	
		
			
				|  |  |  |  |     //********* ZERO COPY VERSION ************ | 
		
	
		
			
				|  |  |  |  |     //	// Get device pointer from host memory. No allocation or memcpy | 
		
	
		
			
				|  |  |  |  |     //	cudaError_t code; | 
		
	
		
			
				|  |  |  |  |     //	// local code CPU -> GPU copy memory | 
		
	
		
			
				|  |  |  |  |     //	code=cudaHostGetDevicePointer((void **)&d_local_codes_in,  (void *) local_codes_in, 0); | 
		
	
		
			
				|  |  |  |  |     //	if (code!=cudaSuccess) | 
		
	
		
			
				|  |  |  |  |     //	{ | 
		
	
		
			
				|  |  |  |  |     //		printf("cuda cudaHostGetDevicePointer error in set_local_code_and_taps \r\n"); | 
		
	
		
			
				|  |  |  |  |     //	} | 
		
	
		
			
				|  |  |  |  |     //	// Correlator shifts vector CPU -> GPU copy memory (fractional chip shifts are allowed!) | 
		
	
		
			
				|  |  |  |  |     //	code=cudaHostGetDevicePointer((void **)&d_shifts_chips,  (void *) shifts_chips, 0); | 
		
	
		
			
				|  |  |  |  |     //	if (code!=cudaSuccess) | 
		
	
		
			
				|  |  |  |  |     //	{ | 
		
	
		
			
				|  |  |  |  |     //		printf("cuda cudaHostGetDevicePointer error in set_local_code_and_taps \r\n"); | 
		
	
		
			
				|  |  |  |  |     //	} | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  |           cudaSetDevice(selected_gps_device); | 
		
	
		
			
				|  |  |  |  | 	//********* ZERO COPY VERSION ************ | 
		
	
		
			
				|  |  |  |  | //	// Get device pointer from host memory. No allocation or memcpy | 
		
	
		
			
				|  |  |  |  | //	cudaError_t code; | 
		
	
		
			
				|  |  |  |  | //	// local code CPU -> GPU copy memory | 
		
	
		
			
				|  |  |  |  | //	code=cudaHostGetDevicePointer((void **)&d_local_codes_in,  (void *) local_codes_in, 0); | 
		
	
		
			
				|  |  |  |  | //	if (code!=cudaSuccess) | 
		
	
		
			
				|  |  |  |  | //	{ | 
		
	
		
			
				|  |  |  |  | //		printf("cuda cudaHostGetDevicePointer error in set_local_code_and_taps \r\n"); | 
		
	
		
			
				|  |  |  |  | //	} | 
		
	
		
			
				|  |  |  |  | //	// Correlator shifts vector CPU -> GPU copy memory (fractional chip shifts are allowed!) | 
		
	
		
			
				|  |  |  |  | //	code=cudaHostGetDevicePointer((void **)&d_shifts_chips,  (void *) shifts_chips, 0); | 
		
	
		
			
				|  |  |  |  | //	if (code!=cudaSuccess) | 
		
	
		
			
				|  |  |  |  | //	{ | 
		
	
		
			
				|  |  |  |  | //		printf("cuda cudaHostGetDevicePointer error in set_local_code_and_taps \r\n"); | 
		
	
		
			
				|  |  |  |  | //	} | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  | 	//******** CudaMalloc version *********** | 
		
	
		
			
				|  |  |  |  |     //******** CudaMalloc version *********** | 
		
	
		
			
				|  |  |  |  |     //local code CPU -> GPU copy memory | 
		
	
		
			
				|  |  |  |  |     cudaMemcpyAsync(d_local_codes_in, local_codes_in, sizeof(GPU_Complex)*code_length_chips, cudaMemcpyHostToDevice,stream1); | 
		
	
		
			
				|  |  |  |  |     d_code_length_chips=code_length_chips; | 
		
	
		
			
				|  |  |  |  |     cudaMemcpyAsync(d_local_codes_in, local_codes_in, sizeof(GPU_Complex) * code_length_chips, cudaMemcpyHostToDevice, stream1); | 
		
	
		
			
				|  |  |  |  |     d_code_length_chips = code_length_chips; | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  |     //Correlator shifts vector CPU -> GPU copy memory (fractional chip shifts are allowed!) | 
		
	
		
			
				|  |  |  |  |     cudaMemcpyAsync(d_shifts_chips, shifts_chips, sizeof(float)*n_correlators, | 
		
	
		
			
				|  |  |  |  |                                     cudaMemcpyHostToDevice,stream1); | 
		
	
		
			
				|  |  |  |  |     cudaMemcpyAsync(d_shifts_chips, shifts_chips, sizeof(float) * n_correlators, | 
		
	
		
			
				|  |  |  |  |         cudaMemcpyHostToDevice, stream1); | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  | 	return true; | 
		
	
		
			
				|  |  |  |  |     return true; | 
		
	
		
			
				|  |  |  |  | } | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  | bool cuda_multicorrelator::set_input_output_vectors( | 
		
	
		
			
				|  |  |  |  | 		std::complex<float>* corr_out, | 
		
	
		
			
				|  |  |  |  | 		std::complex<float>* sig_in | 
		
	
		
			
				|  |  |  |  | 		) | 
		
	
		
			
				|  |  |  |  |     std::complex<float> *corr_out, | 
		
	
		
			
				|  |  |  |  |     std::complex<float> *sig_in) | 
		
	
		
			
				|  |  |  |  | { | 
		
	
		
			
				|  |  |  |  |     cudaSetDevice(selected_gps_device); | 
		
	
		
			
				|  |  |  |  |     // Save CPU pointers | 
		
	
		
			
				|  |  |  |  |     d_sig_in_cpu = sig_in; | 
		
	
		
			
				|  |  |  |  |     d_corr_out_cpu = corr_out; | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  |          cudaSetDevice(selected_gps_device); | 
		
	
		
			
				|  |  |  |  | 	// Save CPU pointers | 
		
	
		
			
				|  |  |  |  | 	d_sig_in_cpu =sig_in; | 
		
	
		
			
				|  |  |  |  | 	d_corr_out_cpu = corr_out; | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  | 	// Zero Copy version | 
		
	
		
			
				|  |  |  |  | 	// Get device pointer from host memory. No allocation or memcpy | 
		
	
		
			
				|  |  |  |  | 	cudaError_t code; | 
		
	
		
			
				|  |  |  |  | 	code=cudaHostGetDevicePointer((void **)&d_sig_in,  (void *) sig_in, 0); | 
		
	
		
			
				|  |  |  |  | 	code=cudaHostGetDevicePointer((void **)&d_corr_out,  (void *) corr_out, 0); | 
		
	
		
			
				|  |  |  |  | 	if (code!=cudaSuccess) | 
		
	
		
			
				|  |  |  |  | 	{ | 
		
	
		
			
				|  |  |  |  | 		printf("cuda cudaHostGetDevicePointer error \r\n"); | 
		
	
		
			
				|  |  |  |  | 	} | 
		
	
		
			
				|  |  |  |  | 	return true; | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  |     // Zero Copy version | 
		
	
		
			
				|  |  |  |  |     // Get device pointer from host memory. No allocation or memcpy | 
		
	
		
			
				|  |  |  |  |     cudaError_t code; | 
		
	
		
			
				|  |  |  |  |     code = cudaHostGetDevicePointer((void **)&d_sig_in, (void *)sig_in, 0); | 
		
	
		
			
				|  |  |  |  |     code = cudaHostGetDevicePointer((void **)&d_corr_out, (void *)corr_out, 0); | 
		
	
		
			
				|  |  |  |  |     if (code != cudaSuccess) | 
		
	
		
			
				|  |  |  |  |         { | 
		
	
		
			
				|  |  |  |  |             printf("cuda cudaHostGetDevicePointer error \r\n"); | 
		
	
		
			
				|  |  |  |  |         } | 
		
	
		
			
				|  |  |  |  |     return true; | 
		
	
		
			
				|  |  |  |  | } | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  | #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); } | 
		
	
		
			
				|  |  |  |  | inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) | 
		
	
		
			
				|  |  |  |  | #define gpuErrchk(ans)                        \ | 
		
	
		
			
				|  |  |  |  |     {                                         \ | 
		
	
		
			
				|  |  |  |  |         gpuAssert((ans), __FILE__, __LINE__); \ | 
		
	
		
			
				|  |  |  |  |     } | 
		
	
		
			
				|  |  |  |  | inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true) | 
		
	
		
			
				|  |  |  |  | { | 
		
	
		
			
				|  |  |  |  |    if (code != cudaSuccess) | 
		
	
		
			
				|  |  |  |  |    { | 
		
	
		
			
				|  |  |  |  |       fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); | 
		
	
		
			
				|  |  |  |  |       if (abort) exit(code); | 
		
	
		
			
				|  |  |  |  |    } | 
		
	
		
			
				|  |  |  |  |     if (code != cudaSuccess) | 
		
	
		
			
				|  |  |  |  |         { | 
		
	
		
			
				|  |  |  |  |             fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); | 
		
	
		
			
				|  |  |  |  |             if (abort) exit(code); | 
		
	
		
			
				|  |  |  |  |         } | 
		
	
		
			
				|  |  |  |  | } | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  | bool cuda_multicorrelator::Carrier_wipeoff_multicorrelator_resampler_cuda( | 
		
	
		
			
				|  |  |  |  | 		float rem_carrier_phase_in_rad, | 
		
	
		
			
				|  |  |  |  | 		float phase_step_rad, | 
		
	
		
			
				|  |  |  |  |         float code_phase_step_chips, | 
		
	
		
			
				|  |  |  |  |         float rem_code_phase_chips, | 
		
	
		
			
				|  |  |  |  | 		int signal_length_samples, | 
		
	
		
			
				|  |  |  |  | 		int n_correlators) | 
		
	
		
			
				|  |  |  |  | 	{ | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  |     float rem_carrier_phase_in_rad, | 
		
	
		
			
				|  |  |  |  |     float phase_step_rad, | 
		
	
		
			
				|  |  |  |  |     float code_phase_step_chips, | 
		
	
		
			
				|  |  |  |  |     float rem_code_phase_chips, | 
		
	
		
			
				|  |  |  |  |     int signal_length_samples, | 
		
	
		
			
				|  |  |  |  |     int n_correlators) | 
		
	
		
			
				|  |  |  |  | { | 
		
	
		
			
				|  |  |  |  |     cudaSetDevice(selected_gps_device); | 
		
	
		
			
				|  |  |  |  | 	// cudaMemCpy version | 
		
	
		
			
				|  |  |  |  | 	//size_t memSize = signal_length_samples * sizeof(std::complex<float>); | 
		
	
		
			
				|  |  |  |  | 	// input signal CPU -> GPU copy memory | 
		
	
		
			
				|  |  |  |  |     // cudaMemCpy version | 
		
	
		
			
				|  |  |  |  |     //size_t memSize = signal_length_samples * sizeof(std::complex<float>); | 
		
	
		
			
				|  |  |  |  |     // input signal CPU -> GPU copy memory | 
		
	
		
			
				|  |  |  |  |     //cudaMemcpyAsync(d_sig_in, d_sig_in_cpu, memSize, | 
		
	
		
			
				|  |  |  |  |     //                               cudaMemcpyHostToDevice, stream2); | 
		
	
		
			
				|  |  |  |  |     //***** NOTICE: NCO is computed on-the-fly, not need to copy NCO into GPU! **** | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  |     //launch the multitap correlator with integrated local code resampler! | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  |     Doppler_wippe_scalarProdGPUCPXxN_shifts_chips<<<blocksPerGrid, threadsPerBlock,0 ,stream1>>>( | 
		
	
		
			
				|  |  |  |  | 			d_corr_out, | 
		
	
		
			
				|  |  |  |  | 			d_sig_in, | 
		
	
		
			
				|  |  |  |  | 			d_sig_doppler_wiped, | 
		
	
		
			
				|  |  |  |  | 			d_local_codes_in, | 
		
	
		
			
				|  |  |  |  | 			d_shifts_chips, | 
		
	
		
			
				|  |  |  |  | 			d_code_length_chips, | 
		
	
		
			
				|  |  |  |  | 	        code_phase_step_chips, | 
		
	
		
			
				|  |  |  |  | 	        rem_code_phase_chips, | 
		
	
		
			
				|  |  |  |  | 			n_correlators, | 
		
	
		
			
				|  |  |  |  | 			signal_length_samples, | 
		
	
		
			
				|  |  |  |  | 			rem_carrier_phase_in_rad, | 
		
	
		
			
				|  |  |  |  | 			phase_step_rad | 
		
	
		
			
				|  |  |  |  | 			); | 
		
	
		
			
				|  |  |  |  |     Doppler_wippe_scalarProdGPUCPXxN_shifts_chips<<<blocksPerGrid, threadsPerBlock, 0, stream1>>>( | 
		
	
		
			
				|  |  |  |  |         d_corr_out, | 
		
	
		
			
				|  |  |  |  |         d_sig_in, | 
		
	
		
			
				|  |  |  |  |         d_sig_doppler_wiped, | 
		
	
		
			
				|  |  |  |  |         d_local_codes_in, | 
		
	
		
			
				|  |  |  |  |         d_shifts_chips, | 
		
	
		
			
				|  |  |  |  |         d_code_length_chips, | 
		
	
		
			
				|  |  |  |  |         code_phase_step_chips, | 
		
	
		
			
				|  |  |  |  |         rem_code_phase_chips, | 
		
	
		
			
				|  |  |  |  |         n_correlators, | 
		
	
		
			
				|  |  |  |  |         signal_length_samples, | 
		
	
		
			
				|  |  |  |  |         rem_carrier_phase_in_rad, | 
		
	
		
			
				|  |  |  |  |         phase_step_rad); | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  |     gpuErrchk( cudaPeekAtLastError() ); | 
		
	
		
			
				|  |  |  |  |     gpuErrchk( cudaStreamSynchronize(stream1)); | 
		
	
		
			
				|  |  |  |  |     gpuErrchk(cudaPeekAtLastError()); | 
		
	
		
			
				|  |  |  |  |     gpuErrchk(cudaStreamSynchronize(stream1)); | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  | 	// cudaMemCpy version | 
		
	
		
			
				|  |  |  |  |     // cudaMemCpy version | 
		
	
		
			
				|  |  |  |  |     // Copy the device result vector in device memory to the host result vector | 
		
	
		
			
				|  |  |  |  |     // in host memory. | 
		
	
		
			
				|  |  |  |  |     //scalar products (correlators outputs) | 
		
	
	
		
			
				
					
					|  |  |  | @@ -352,37 +356,38 @@ bool cuda_multicorrelator::Carrier_wipeoff_multicorrelator_resampler_cuda( | 
		
	
		
			
				|  |  |  |  |     return true; | 
		
	
		
			
				|  |  |  |  | } | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  | cuda_multicorrelator::cuda_multicorrelator() | 
		
	
		
			
				|  |  |  |  | { | 
		
	
		
			
				|  |  |  |  | 	d_sig_in=NULL; | 
		
	
		
			
				|  |  |  |  | 	d_nco_in=NULL; | 
		
	
		
			
				|  |  |  |  | 	d_sig_doppler_wiped=NULL; | 
		
	
		
			
				|  |  |  |  | 	d_local_codes_in=NULL; | 
		
	
		
			
				|  |  |  |  | 	d_shifts_samples=NULL; | 
		
	
		
			
				|  |  |  |  | 	d_shifts_chips=NULL; | 
		
	
		
			
				|  |  |  |  | 	d_corr_out=NULL; | 
		
	
		
			
				|  |  |  |  | 	threadsPerBlock=0; | 
		
	
		
			
				|  |  |  |  | 	blocksPerGrid=0; | 
		
	
		
			
				|  |  |  |  | 	d_code_length_chips=0; | 
		
	
		
			
				|  |  |  |  |     d_sig_in = NULL; | 
		
	
		
			
				|  |  |  |  |     d_nco_in = NULL; | 
		
	
		
			
				|  |  |  |  |     d_sig_doppler_wiped = NULL; | 
		
	
		
			
				|  |  |  |  |     d_local_codes_in = NULL; | 
		
	
		
			
				|  |  |  |  |     d_shifts_samples = NULL; | 
		
	
		
			
				|  |  |  |  |     d_shifts_chips = NULL; | 
		
	
		
			
				|  |  |  |  |     d_corr_out = NULL; | 
		
	
		
			
				|  |  |  |  |     threadsPerBlock = 0; | 
		
	
		
			
				|  |  |  |  |     blocksPerGrid = 0; | 
		
	
		
			
				|  |  |  |  |     d_code_length_chips = 0; | 
		
	
		
			
				|  |  |  |  | } | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  |  | 
		
	
		
			
				|  |  |  |  | bool cuda_multicorrelator::free_cuda() | 
		
	
		
			
				|  |  |  |  | { | 
		
	
		
			
				|  |  |  |  | 	// Free device global memory | 
		
	
		
			
				|  |  |  |  | 	if (d_sig_in!=NULL) cudaFree(d_sig_in); | 
		
	
		
			
				|  |  |  |  | 	if (d_nco_in!=NULL) cudaFree(d_nco_in); | 
		
	
		
			
				|  |  |  |  | 	if (d_sig_doppler_wiped!=NULL) cudaFree(d_sig_doppler_wiped); | 
		
	
		
			
				|  |  |  |  | 	if (d_local_codes_in!=NULL) cudaFree(d_local_codes_in); | 
		
	
		
			
				|  |  |  |  | 	if (d_corr_out!=NULL) cudaFree(d_corr_out); | 
		
	
		
			
				|  |  |  |  | 	if (d_shifts_samples!=NULL) cudaFree(d_shifts_samples); | 
		
	
		
			
				|  |  |  |  | 	if (d_shifts_chips!=NULL) cudaFree(d_shifts_chips); | 
		
	
		
			
				|  |  |  |  |     // Free device global memory | 
		
	
		
			
				|  |  |  |  |     if (d_sig_in != NULL) cudaFree(d_sig_in); | 
		
	
		
			
				|  |  |  |  |     if (d_nco_in != NULL) cudaFree(d_nco_in); | 
		
	
		
			
				|  |  |  |  |     if (d_sig_doppler_wiped != NULL) cudaFree(d_sig_doppler_wiped); | 
		
	
		
			
				|  |  |  |  |     if (d_local_codes_in != NULL) cudaFree(d_local_codes_in); | 
		
	
		
			
				|  |  |  |  |     if (d_corr_out != NULL) cudaFree(d_corr_out); | 
		
	
		
			
				|  |  |  |  |     if (d_shifts_samples != NULL) cudaFree(d_shifts_samples); | 
		
	
		
			
				|  |  |  |  |     if (d_shifts_chips != NULL) cudaFree(d_shifts_chips); | 
		
	
		
			
				|  |  |  |  |     // Reset the device and exit | 
		
	
		
			
				|  |  |  |  |     // cudaDeviceReset causes the driver to clean up all state. While | 
		
	
		
			
				|  |  |  |  |     // not mandatory in normal operation, it is good practice.  It is also | 
		
	
		
			
				|  |  |  |  |     // needed to ensure correct operation when the application is being | 
		
	
		
			
				|  |  |  |  |     // profiled. Calling cudaDeviceReset causes all profile data to be | 
		
	
		
			
				|  |  |  |  |     // flushed before the application exits | 
		
	
		
			
				|  |  |  |  | 	cudaDeviceReset(); | 
		
	
		
			
				|  |  |  |  | 	return true; | 
		
	
		
			
				|  |  |  |  |     cudaDeviceReset(); | 
		
	
		
			
				|  |  |  |  |     return true; | 
		
	
		
			
				|  |  |  |  | } | 
		
	
		
			
				|  |  |  |  |  | 
		
	
	
		
			
				
					
					| 
							
							
							
						 |  |  |   |