diff --git a/src/algorithms/tracking/libs/cpu_multicorrelator.cc b/src/algorithms/tracking/libs/cpu_multicorrelator.cc index c201a5df7..53ee5056c 100644 --- a/src/algorithms/tracking/libs/cpu_multicorrelator.cc +++ b/src/algorithms/tracking/libs/cpu_multicorrelator.cc @@ -1,11 +1,11 @@ /*! * \file cpu_multicorrelator.cc - * \brief High optimized CPU vector multiTAP correlator class + * \brief Highly optimized CPU vector multiTAP correlator class * \authors * - * Class that implements a high optimized vector multiTAP correlator class for CPUs + * Class that implements a highly optimized vector multiTAP correlator class for CPUs * * ------------------------------------------------------------------------- * diff --git a/src/algorithms/tracking/libs/cpu_multicorrelator.h b/src/algorithms/tracking/libs/cpu_multicorrelator.h index d0f73d616..17c1179b6 100644 --- a/src/algorithms/tracking/libs/cpu_multicorrelator.h +++ b/src/algorithms/tracking/libs/cpu_multicorrelator.h @@ -65,4 +65,4 @@ private: }; -#endif /* CPU_MULTICORRELATOR_H_ */ +#endif /* GNSS_SDR_CPU_MULTICORRELATOR_H_ */ diff --git a/src/algorithms/tracking/libs/cpu_multicorrelator_16sc.cc b/src/algorithms/tracking/libs/cpu_multicorrelator_16sc.cc index 107a38f7d..82eda568c 100644 --- a/src/algorithms/tracking/libs/cpu_multicorrelator_16sc.cc +++ b/src/algorithms/tracking/libs/cpu_multicorrelator_16sc.cc @@ -1,11 +1,11 @@ /*! * \file cpu_multicorrelator_16sc.cc - * \brief High optimized CPU vector multiTAP correlator class + * \brief Highly optimized CPU vector multiTAP correlator class * \authors * - * Class that implements a high optimized vector multiTAP correlator class for CPUs + * Class that implements a highly optimized vector multiTAP correlator class for CPUs * * ------------------------------------------------------------------------- * diff --git a/src/algorithms/tracking/libs/cpu_multicorrelator_16sc.h b/src/algorithms/tracking/libs/cpu_multicorrelator_16sc.h index 7ef9549e8..10c831462 100644 --- a/src/algorithms/tracking/libs/cpu_multicorrelator_16sc.h +++ b/src/algorithms/tracking/libs/cpu_multicorrelator_16sc.h @@ -1,11 +1,11 @@ /*! * \file cpu_multicorrelator_16sc.h - * \brief High optimized CPU vector multiTAP correlator class for lv_16sc_t (short int complex) + * \brief Highly optimized CPU vector multiTAP correlator class for lv_16sc_t (short int complex) * \authors * - * Class that implements a high optimized vector multiTAP correlator class for CPUs + * Class that implements a highly optimized vector multiTAP correlator class for CPUs * * ------------------------------------------------------------------------- * diff --git a/src/algorithms/tracking/libs/cpu_multicorrelator_real_codes.cc b/src/algorithms/tracking/libs/cpu_multicorrelator_real_codes.cc index 4bd70cde6..73219b4e2 100644 --- a/src/algorithms/tracking/libs/cpu_multicorrelator_real_codes.cc +++ b/src/algorithms/tracking/libs/cpu_multicorrelator_real_codes.cc @@ -6,7 +6,7 @@ *
  • Cillian O'Driscoll, 2017. cillian.odriscoll(at)gmail.com * * - * Class that implements a high optimized vector multiTAP correlator class for CPUs + * Class that implements a highly optimized vector multiTAP correlator class for CPUs * * ------------------------------------------------------------------------- * diff --git a/src/algorithms/tracking/libs/cpu_multicorrelator_real_codes.h b/src/algorithms/tracking/libs/cpu_multicorrelator_real_codes.h index 48a94506d..83f083da3 100644 --- a/src/algorithms/tracking/libs/cpu_multicorrelator_real_codes.h +++ b/src/algorithms/tracking/libs/cpu_multicorrelator_real_codes.h @@ -6,7 +6,7 @@ *
  • Cillian O'Driscoll, 2017, cillian.odriscoll(at)gmail.com * * - * Class that implements a high optimized vector multiTAP correlator class for CPUs + * Class that implements a highly optimized vector multiTAP correlator class for CPUs * * ------------------------------------------------------------------------- * diff --git a/src/algorithms/tracking/libs/cuda_multicorrelator.cu b/src/algorithms/tracking/libs/cuda_multicorrelator.cu index 38aefbcd5..27a46d973 100644 --- a/src/algorithms/tracking/libs/cuda_multicorrelator.cu +++ b/src/algorithms/tracking/libs/cuda_multicorrelator.cu @@ -1,11 +1,11 @@ /*! * \file cuda_multicorrelator.cu - * \brief High optimized CUDA GPU vector multiTAP correlator class + * \brief Highly optimized CUDA GPU vector multiTAP correlator class * \authors * - * Class that implements a high optimized vector multiTAP correlator class for NVIDIA CUDA GPUs + * Class that implements a highly optimized vector multiTAP correlator class for NVIDIA CUDA GPUs * * ------------------------------------------------------------------------- * @@ -33,9 +33,8 @@ */ #include "cuda_multicorrelator.h" - -#include #include +#include // For the CUDA runtime routines (prefixed with "cuda_") #include @@ -53,22 +52,21 @@ __global__ void Doppler_wippe_scalarProdGPUCPXxN_shifts_chips( int vectorN, int elementN, float rem_carrier_phase_in_rad, - float phase_step_rad -) + float phase_step_rad) { //Accumulators cache __shared__ GPU_Complex accumResult[ACCUM_N]; - // CUDA version of floating point NCO and vector dot product integrated + // CUDA version of floating point NCO and vector dot product integrated float sin; float cos; for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < elementN; i += blockDim.x * gridDim.x) - { - __sincosf(rem_carrier_phase_in_rad + i*phase_step_rad, &sin, &cos); - d_sig_wiped[i] = d_sig_in[i] * GPU_Complex(cos,-sin); - } + { + __sincosf(rem_carrier_phase_in_rad + i * phase_step_rad, &sin, &cos); + d_sig_wiped[i] = d_sig_in[i] * GPU_Complex(cos, -sin); + } __syncthreads(); //////////////////////////////////////////////////////////////////////////// @@ -77,273 +75,279 @@ __global__ void Doppler_wippe_scalarProdGPUCPXxN_shifts_chips( // from total number of thread blocks //////////////////////////////////////////////////////////////////////////// for (int vec = blockIdx.x; vec < vectorN; vec += gridDim.x) - { - //int vectorBase = IMUL(elementN, vec); - //int vectorEnd = elementN; - - //////////////////////////////////////////////////////////////////////// - // Each accumulator cycles through vectors with - // stride equal to number of total number of accumulators ACCUM_N - // At this stage ACCUM_N is only preferred be a multiple of warp size - // to meet memory coalescing alignment constraints. - //////////////////////////////////////////////////////////////////////// - for (int iAccum = threadIdx.x; iAccum < ACCUM_N; iAccum += blockDim.x) { - GPU_Complex sum = GPU_Complex(0,0); - float local_code_chip_index=0.0;; - //float code_phase; - for (int pos = iAccum; pos < elementN; pos += ACCUM_N) - { - //original sample code - //sum = sum + d_sig_in[pos-vectorBase] * d_nco_in[pos-vectorBase] * d_local_codes_in[pos]; - //sum = sum + d_sig_in[pos-vectorBase] * d_local_codes_in[pos]; - //sum.multiply_acc(d_sig_in[pos],d_local_codes_in[pos+d_shifts_samples[vec]]); + //int vectorBase = IMUL(elementN, vec); + //int vectorEnd = elementN; - //custom code for multitap correlator - // 1.resample local code for the current shift + //////////////////////////////////////////////////////////////////////// + // Each accumulator cycles through vectors with + // stride equal to number of total number of accumulators ACCUM_N + // At this stage ACCUM_N is only preferred be a multiple of warp size + // to meet memory coalescing alignment constraints. + //////////////////////////////////////////////////////////////////////// + for (int iAccum = threadIdx.x; iAccum < ACCUM_N; iAccum += blockDim.x) + { + GPU_Complex sum = GPU_Complex(0, 0); + float local_code_chip_index = 0.0; + ; + //float code_phase; + for (int pos = iAccum; pos < elementN; pos += ACCUM_N) + { + //original sample code + //sum = sum + d_sig_in[pos-vectorBase] * d_nco_in[pos-vectorBase] * d_local_codes_in[pos]; + //sum = sum + d_sig_in[pos-vectorBase] * d_local_codes_in[pos]; + //sum.multiply_acc(d_sig_in[pos],d_local_codes_in[pos+d_shifts_samples[vec]]); - local_code_chip_index= fmodf(code_phase_step_chips*__int2float_rd(pos)+ d_shifts_chips[vec] - rem_code_phase_chips, code_length_chips); + //custom code for multitap correlator + // 1.resample local code for the current shift - //Take into account that in multitap correlators, the shifts can be negative! - if (local_code_chip_index<0.0) local_code_chip_index+=(code_length_chips-1); - //printf("vec= %i, pos %i, chip_idx=%i chip_shift=%f \r\n",vec, pos,__float2int_rd(local_code_chip_index),local_code_chip_index); - // 2.correlate - sum.multiply_acc(d_sig_wiped[pos],d_local_code_in[__float2int_rd(local_code_chip_index)]); + local_code_chip_index = fmodf(code_phase_step_chips * __int2float_rd(pos) + d_shifts_chips[vec] - rem_code_phase_chips, code_length_chips); - } - accumResult[iAccum] = sum; + //Take into account that in multitap correlators, the shifts can be negative! + if (local_code_chip_index < 0.0) local_code_chip_index += (code_length_chips - 1); + //printf("vec= %i, pos %i, chip_idx=%i chip_shift=%f \r\n",vec, pos,__float2int_rd(local_code_chip_index),local_code_chip_index); + // 2.correlate + sum.multiply_acc(d_sig_wiped[pos], d_local_code_in[__float2int_rd(local_code_chip_index)]); + } + accumResult[iAccum] = sum; + } + + //////////////////////////////////////////////////////////////////////// + // Perform tree-like reduction of accumulators' results. + // ACCUM_N has to be power of two at this stage + //////////////////////////////////////////////////////////////////////// + for (int stride = ACCUM_N / 2; stride > 0; stride >>= 1) + { + __syncthreads(); + + for (int iAccum = threadIdx.x; iAccum < stride; iAccum += blockDim.x) + { + accumResult[iAccum] += accumResult[stride + iAccum]; + } + } + + if (threadIdx.x == 0) + { + d_corr_out[vec] = accumResult[0]; + } } - - //////////////////////////////////////////////////////////////////////// - // Perform tree-like reduction of accumulators' results. - // ACCUM_N has to be power of two at this stage - //////////////////////////////////////////////////////////////////////// - for (int stride = ACCUM_N / 2; stride > 0; stride >>= 1) - { - __syncthreads(); - - for (int iAccum = threadIdx.x; iAccum < stride; iAccum += blockDim.x) - { - accumResult[iAccum] += accumResult[stride + iAccum]; - } - } - - if (threadIdx.x == 0) - { - d_corr_out[vec] = accumResult[0]; - } - } } + bool cuda_multicorrelator::init_cuda_integrated_resampler( - int signal_length_samples, - int code_length_chips, - int n_correlators - ) + int signal_length_samples, + int code_length_chips, + int n_correlators) { - // use command-line specified CUDA device, otherwise use device with highest Gflops/s -// findCudaDevice(argc, (const char **)argv); - cudaDeviceProp prop; + // use command-line specified CUDA device, otherwise use device with highest Gflops/s + // findCudaDevice(argc, (const char **)argv); + cudaDeviceProp prop; int num_devices, device; cudaGetDeviceCount(&num_devices); - if (num_devices > 1) { - int max_multiprocessors = 0, max_device = 0; - for (device = 0; device < num_devices; device++) { - cudaDeviceProp properties; - cudaGetDeviceProperties(&properties, device); - if (max_multiprocessors < properties.multiProcessorCount) { - max_multiprocessors = properties.multiProcessorCount; - max_device = device; - } - printf("Found GPU device # %i\n",device); - } - //cudaSetDevice(max_device); + if (num_devices > 1) + { + int max_multiprocessors = 0, max_device = 0; + for (device = 0; device < num_devices; device++) + { + cudaDeviceProp properties; + cudaGetDeviceProperties(&properties, device); + if (max_multiprocessors < properties.multiProcessorCount) + { + max_multiprocessors = properties.multiProcessorCount; + max_device = device; + } + printf("Found GPU device # %i\n", device); + } + //cudaSetDevice(max_device); - //set random device! - selected_gps_device=rand() % num_devices;//generates a random number between 0 and num_devices to split the threads between GPUs - cudaSetDevice(selected_gps_device); + //set random device! + selected_gps_device = rand() % num_devices; //generates a random number between 0 and num_devices to split the threads between GPUs + cudaSetDevice(selected_gps_device); - cudaGetDeviceProperties( &prop, max_device ); - //debug code - if (prop.canMapHostMemory != 1) { - printf( "Device can not map memory.\n" ); - } - printf("L2 Cache size= %u \n",prop.l2CacheSize); - printf("maxThreadsPerBlock= %u \n",prop.maxThreadsPerBlock); - printf("maxGridSize= %i \n",prop.maxGridSize[0]); - printf("sharedMemPerBlock= %lu \n",prop.sharedMemPerBlock); - printf("deviceOverlap= %i \n",prop.deviceOverlap); - printf("multiProcessorCount= %i \n",prop.multiProcessorCount); - }else{ - cudaGetDevice( &selected_gps_device); - cudaGetDeviceProperties( &prop, selected_gps_device ); - //debug code - if (prop.canMapHostMemory != 1) { - printf( "Device can not map memory.\n" ); - } + cudaGetDeviceProperties(&prop, max_device); + //debug code + if (prop.canMapHostMemory != 1) + { + printf("Device can not map memory.\n"); + } + printf("L2 Cache size= %u \n", prop.l2CacheSize); + printf("maxThreadsPerBlock= %u \n", prop.maxThreadsPerBlock); + printf("maxGridSize= %i \n", prop.maxGridSize[0]); + printf("sharedMemPerBlock= %lu \n", prop.sharedMemPerBlock); + printf("deviceOverlap= %i \n", prop.deviceOverlap); + printf("multiProcessorCount= %i \n", prop.multiProcessorCount); + } + else + { + cudaGetDevice(&selected_gps_device); + cudaGetDeviceProperties(&prop, selected_gps_device); + //debug code + if (prop.canMapHostMemory != 1) + { + printf("Device can not map memory.\n"); + } - printf("L2 Cache size= %u \n",prop.l2CacheSize); - printf("maxThreadsPerBlock= %u \n",prop.maxThreadsPerBlock); - printf("maxGridSize= %i \n",prop.maxGridSize[0]); - printf("sharedMemPerBlock= %lu \n",prop.sharedMemPerBlock); - printf("deviceOverlap= %i \n",prop.deviceOverlap); - printf("multiProcessorCount= %i \n",prop.multiProcessorCount); - } + printf("L2 Cache size= %u \n", prop.l2CacheSize); + printf("maxThreadsPerBlock= %u \n", prop.maxThreadsPerBlock); + printf("maxGridSize= %i \n", prop.maxGridSize[0]); + printf("sharedMemPerBlock= %lu \n", prop.sharedMemPerBlock); + printf("deviceOverlap= %i \n", prop.deviceOverlap); + printf("multiProcessorCount= %i \n", prop.multiProcessorCount); + } - // (cudaFuncSetCacheConfig(CUDA_32fc_x2_multiply_x2_dot_prod_32fc_, cudaFuncCachePreferShared)); + // (cudaFuncSetCacheConfig(CUDA_32fc_x2_multiply_x2_dot_prod_32fc_, cudaFuncCachePreferShared)); // ALLOCATE GPU MEMORY FOR INPUT/OUTPUT and INTERNAL vectors size_t size = signal_length_samples * sizeof(GPU_Complex); - //********* ZERO COPY VERSION ************ - // Set flag to enable zero copy access + //********* ZERO COPY VERSION ************ + // Set flag to enable zero copy access // Optimal in shared memory devices (like Jetson K1) - //cudaSetDeviceFlags(cudaDeviceMapHost); + //cudaSetDeviceFlags(cudaDeviceMapHost); - //******** CudaMalloc version *********** + //******** CudaMalloc version *********** - // input signal GPU memory (can be mapped to CPU memory in shared memory devices!) - // cudaMalloc((void **)&d_sig_in, size); - // cudaMemset(d_sig_in,0,size); + // input signal GPU memory (can be mapped to CPU memory in shared memory devices!) + // cudaMalloc((void **)&d_sig_in, size); + // cudaMemset(d_sig_in,0,size); - // Doppler-free signal (internal GPU memory) - cudaMalloc((void **)&d_sig_doppler_wiped, size); - cudaMemset(d_sig_doppler_wiped,0,size); + // Doppler-free signal (internal GPU memory) + cudaMalloc((void **)&d_sig_doppler_wiped, size); + cudaMemset(d_sig_doppler_wiped, 0, size); - // Local code GPU memory (can be mapped to CPU memory in shared memory devices!) - cudaMalloc((void **)&d_local_codes_in, sizeof(std::complex)*code_length_chips); - cudaMemset(d_local_codes_in,0,sizeof(std::complex)*code_length_chips); + // Local code GPU memory (can be mapped to CPU memory in shared memory devices!) + cudaMalloc((void **)&d_local_codes_in, sizeof(std::complex) * code_length_chips); + cudaMemset(d_local_codes_in, 0, sizeof(std::complex) * code_length_chips); - d_code_length_chips=code_length_chips; + d_code_length_chips = code_length_chips; - // Vector with the chip shifts for each correlator tap + // Vector with the chip shifts for each correlator tap //GPU memory (can be mapped to CPU memory in shared memory devices!) - cudaMalloc((void **)&d_shifts_chips, sizeof(float)*n_correlators); - cudaMemset(d_shifts_chips,0,sizeof(float)*n_correlators); + cudaMalloc((void **)&d_shifts_chips, sizeof(float) * n_correlators); + cudaMemset(d_shifts_chips, 0, sizeof(float) * n_correlators); - //scalars - //cudaMalloc((void **)&d_corr_out, sizeof(std::complex)*n_correlators); - //cudaMemset(d_corr_out,0,sizeof(std::complex)*n_correlators); + //scalars + //cudaMalloc((void **)&d_corr_out, sizeof(std::complex)*n_correlators); + //cudaMemset(d_corr_out,0,sizeof(std::complex)*n_correlators); // Launch the Vector Add CUDA Kernel // TODO: write a smart load balance using device info! - threadsPerBlock = 64; - blocksPerGrid = 128;//(int)(signal_length_samples+threadsPerBlock-1)/threadsPerBlock; + threadsPerBlock = 64; + blocksPerGrid = 128; //(int)(signal_length_samples+threadsPerBlock-1)/threadsPerBlock; - cudaStreamCreate (&stream1) ; - //cudaStreamCreate (&stream2) ; - return true; + cudaStreamCreate(&stream1); + //cudaStreamCreate (&stream2) ; + return true; } + bool cuda_multicorrelator::set_local_code_and_taps( - int code_length_chips, - const std::complex* local_codes_in, - float *shifts_chips, - int n_correlators - ) + int code_length_chips, + const std::complex *local_codes_in, + float *shifts_chips, + int n_correlators) { + cudaSetDevice(selected_gps_device); + //********* ZERO COPY VERSION ************ + // // Get device pointer from host memory. No allocation or memcpy + // cudaError_t code; + // // local code CPU -> GPU copy memory + // code=cudaHostGetDevicePointer((void **)&d_local_codes_in, (void *) local_codes_in, 0); + // if (code!=cudaSuccess) + // { + // printf("cuda cudaHostGetDevicePointer error in set_local_code_and_taps \r\n"); + // } + // // Correlator shifts vector CPU -> GPU copy memory (fractional chip shifts are allowed!) + // code=cudaHostGetDevicePointer((void **)&d_shifts_chips, (void *) shifts_chips, 0); + // if (code!=cudaSuccess) + // { + // printf("cuda cudaHostGetDevicePointer error in set_local_code_and_taps \r\n"); + // } - cudaSetDevice(selected_gps_device); - //********* ZERO COPY VERSION ************ -// // Get device pointer from host memory. No allocation or memcpy -// cudaError_t code; -// // local code CPU -> GPU copy memory -// code=cudaHostGetDevicePointer((void **)&d_local_codes_in, (void *) local_codes_in, 0); -// if (code!=cudaSuccess) -// { -// printf("cuda cudaHostGetDevicePointer error in set_local_code_and_taps \r\n"); -// } -// // Correlator shifts vector CPU -> GPU copy memory (fractional chip shifts are allowed!) -// code=cudaHostGetDevicePointer((void **)&d_shifts_chips, (void *) shifts_chips, 0); -// if (code!=cudaSuccess) -// { -// printf("cuda cudaHostGetDevicePointer error in set_local_code_and_taps \r\n"); -// } - - //******** CudaMalloc version *********** + //******** CudaMalloc version *********** //local code CPU -> GPU copy memory - cudaMemcpyAsync(d_local_codes_in, local_codes_in, sizeof(GPU_Complex)*code_length_chips, cudaMemcpyHostToDevice,stream1); - d_code_length_chips=code_length_chips; + cudaMemcpyAsync(d_local_codes_in, local_codes_in, sizeof(GPU_Complex) * code_length_chips, cudaMemcpyHostToDevice, stream1); + d_code_length_chips = code_length_chips; //Correlator shifts vector CPU -> GPU copy memory (fractional chip shifts are allowed!) - cudaMemcpyAsync(d_shifts_chips, shifts_chips, sizeof(float)*n_correlators, - cudaMemcpyHostToDevice,stream1); + cudaMemcpyAsync(d_shifts_chips, shifts_chips, sizeof(float) * n_correlators, + cudaMemcpyHostToDevice, stream1); - return true; + return true; } + bool cuda_multicorrelator::set_input_output_vectors( - std::complex* corr_out, - std::complex* sig_in - ) + std::complex *corr_out, + std::complex *sig_in) { + cudaSetDevice(selected_gps_device); + // Save CPU pointers + d_sig_in_cpu = sig_in; + d_corr_out_cpu = corr_out; - cudaSetDevice(selected_gps_device); - // Save CPU pointers - d_sig_in_cpu =sig_in; - d_corr_out_cpu = corr_out; - - // Zero Copy version - // Get device pointer from host memory. No allocation or memcpy - cudaError_t code; - code=cudaHostGetDevicePointer((void **)&d_sig_in, (void *) sig_in, 0); - code=cudaHostGetDevicePointer((void **)&d_corr_out, (void *) corr_out, 0); - if (code!=cudaSuccess) - { - printf("cuda cudaHostGetDevicePointer error \r\n"); - } - return true; - + // Zero Copy version + // Get device pointer from host memory. No allocation or memcpy + cudaError_t code; + code = cudaHostGetDevicePointer((void **)&d_sig_in, (void *)sig_in, 0); + code = cudaHostGetDevicePointer((void **)&d_corr_out, (void *)corr_out, 0); + if (code != cudaSuccess) + { + printf("cuda cudaHostGetDevicePointer error \r\n"); + } + return true; } -#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); } -inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) +#define gpuErrchk(ans) \ + { \ + gpuAssert((ans), __FILE__, __LINE__); \ + } +inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true) { - if (code != cudaSuccess) - { - fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); - if (abort) exit(code); - } + if (code != cudaSuccess) + { + fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); + if (abort) exit(code); + } } + bool cuda_multicorrelator::Carrier_wipeoff_multicorrelator_resampler_cuda( - float rem_carrier_phase_in_rad, - float phase_step_rad, - float code_phase_step_chips, - float rem_code_phase_chips, - int signal_length_samples, - int n_correlators) - { - - cudaSetDevice(selected_gps_device); - // cudaMemCpy version - //size_t memSize = signal_length_samples * sizeof(std::complex); - // input signal CPU -> GPU copy memory + float rem_carrier_phase_in_rad, + float phase_step_rad, + float code_phase_step_chips, + float rem_code_phase_chips, + int signal_length_samples, + int n_correlators) +{ + cudaSetDevice(selected_gps_device); + // cudaMemCpy version + //size_t memSize = signal_length_samples * sizeof(std::complex); + // input signal CPU -> GPU copy memory //cudaMemcpyAsync(d_sig_in, d_sig_in_cpu, memSize, // cudaMemcpyHostToDevice, stream2); //***** NOTICE: NCO is computed on-the-fly, not need to copy NCO into GPU! **** //launch the multitap correlator with integrated local code resampler! - Doppler_wippe_scalarProdGPUCPXxN_shifts_chips<<>>( - d_corr_out, - d_sig_in, - d_sig_doppler_wiped, - d_local_codes_in, - d_shifts_chips, - d_code_length_chips, - code_phase_step_chips, - rem_code_phase_chips, - n_correlators, - signal_length_samples, - rem_carrier_phase_in_rad, - phase_step_rad - ); + Doppler_wippe_scalarProdGPUCPXxN_shifts_chips<<>>( + d_corr_out, + d_sig_in, + d_sig_doppler_wiped, + d_local_codes_in, + d_shifts_chips, + d_code_length_chips, + code_phase_step_chips, + rem_code_phase_chips, + n_correlators, + signal_length_samples, + rem_carrier_phase_in_rad, + phase_step_rad); - gpuErrchk( cudaPeekAtLastError() ); - gpuErrchk( cudaStreamSynchronize(stream1)); + gpuErrchk(cudaPeekAtLastError()); + gpuErrchk(cudaStreamSynchronize(stream1)); - // cudaMemCpy version + // cudaMemCpy version // Copy the device result vector in device memory to the host result vector // in host memory. //scalar products (correlators outputs) @@ -352,37 +356,38 @@ bool cuda_multicorrelator::Carrier_wipeoff_multicorrelator_resampler_cuda( return true; } + cuda_multicorrelator::cuda_multicorrelator() { - d_sig_in=NULL; - d_nco_in=NULL; - d_sig_doppler_wiped=NULL; - d_local_codes_in=NULL; - d_shifts_samples=NULL; - d_shifts_chips=NULL; - d_corr_out=NULL; - threadsPerBlock=0; - blocksPerGrid=0; - d_code_length_chips=0; + d_sig_in = NULL; + d_nco_in = NULL; + d_sig_doppler_wiped = NULL; + d_local_codes_in = NULL; + d_shifts_samples = NULL; + d_shifts_chips = NULL; + d_corr_out = NULL; + threadsPerBlock = 0; + blocksPerGrid = 0; + d_code_length_chips = 0; } + bool cuda_multicorrelator::free_cuda() { - // Free device global memory - if (d_sig_in!=NULL) cudaFree(d_sig_in); - if (d_nco_in!=NULL) cudaFree(d_nco_in); - if (d_sig_doppler_wiped!=NULL) cudaFree(d_sig_doppler_wiped); - if (d_local_codes_in!=NULL) cudaFree(d_local_codes_in); - if (d_corr_out!=NULL) cudaFree(d_corr_out); - if (d_shifts_samples!=NULL) cudaFree(d_shifts_samples); - if (d_shifts_chips!=NULL) cudaFree(d_shifts_chips); + // Free device global memory + if (d_sig_in != NULL) cudaFree(d_sig_in); + if (d_nco_in != NULL) cudaFree(d_nco_in); + if (d_sig_doppler_wiped != NULL) cudaFree(d_sig_doppler_wiped); + if (d_local_codes_in != NULL) cudaFree(d_local_codes_in); + if (d_corr_out != NULL) cudaFree(d_corr_out); + if (d_shifts_samples != NULL) cudaFree(d_shifts_samples); + if (d_shifts_chips != NULL) cudaFree(d_shifts_chips); // Reset the device and exit // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits - cudaDeviceReset(); - return true; + cudaDeviceReset(); + return true; } - diff --git a/src/algorithms/tracking/libs/cuda_multicorrelator.h b/src/algorithms/tracking/libs/cuda_multicorrelator.h index f75d72675..dad770a8a 100644 --- a/src/algorithms/tracking/libs/cuda_multicorrelator.h +++ b/src/algorithms/tracking/libs/cuda_multicorrelator.h @@ -1,11 +1,11 @@ /*! * \file cuda_multicorrelator.h - * \brief High optimized CUDA GPU vector multiTAP correlator class + * \brief Highly optimized CUDA GPU vector multiTAP correlator class * \authors
      *
    • Javier Arribas, 2015. jarribas(at)cttc.es *
    * - * Class that implements a high optimized vector multiTAP correlator class for NVIDIA CUDA GPUs + * Class that implements a highly optimized vector multiTAP correlator class for NVIDIA CUDA GPUs * * ------------------------------------------------------------------------- * @@ -92,6 +92,7 @@ struct GPU_Complex } }; + struct GPU_Complex_Short { float r; @@ -149,7 +150,6 @@ private: GPU_Complex* d_local_codes_in; GPU_Complex* d_corr_out; - // std::complex* d_sig_in_cpu; std::complex* d_corr_out_cpu; diff --git a/src/algorithms/tracking/libs/tracking_discriminators.cc b/src/algorithms/tracking/libs/tracking_discriminators.cc index f1944ddc5..b6ab1490e 100644 --- a/src/algorithms/tracking/libs/tracking_discriminators.cc +++ b/src/algorithms/tracking/libs/tracking_discriminators.cc @@ -36,6 +36,7 @@ #include // All the outputs are in RADIANS + /* * FLL four quadrant arctan discriminator: * \f{equation} @@ -45,7 +46,6 @@ * \f$I_{PS1},Q_{PS1}\f$ are the inphase and quadrature prompt correlator outputs respectively at sample time \f$t_1\f$, and * \f$I_{PS2},Q_{PS2}\f$ are the inphase and quadrature prompt correlator outputs respectively at sample time \f$t_2\f$. The output is in [radians/second]. */ - double fll_four_quadrant_atan(gr_complex prompt_s1, gr_complex prompt_s2, double t1, double t2) { double cross, dot; @@ -105,6 +105,7 @@ double dll_nc_e_minus_l_normalized(gr_complex early_s1, gr_complex late_s1) return 0.5 * (P_early - P_late) / (P_early + P_late); } + /* * DLL Noncoherent Very Early Minus Late Power (VEMLP) normalized discriminator, using the outputs * of four correlators, Very Early (VE), Early (E), Late (L) and Very Late (VL):