diff --git a/src/algorithms/tracking/gnuradio_blocks/CMakeLists.txt b/src/algorithms/tracking/gnuradio_blocks/CMakeLists.txt
index a018fe10e..6af7fb317 100644
--- a/src/algorithms/tracking/gnuradio_blocks/CMakeLists.txt
+++ b/src/algorithms/tracking/gnuradio_blocks/CMakeLists.txt
@@ -48,7 +48,7 @@ include_directories(
      ${GNURADIO_RUNTIME_INCLUDE_DIRS}
      ${VOLK_GNSSSDR_INCLUDE_DIRS}
      ${CUDA_INCLUDE_DIRS}
-     ${CMAKE_SOURCE_DIR}/src/algorithms/tracking/libs/cudahelpers
+     # ${CMAKE_SOURCE_DIR}/src/algorithms/tracking/libs/cudahelpers
 )
 
 if(ENABLE_GENERIC_ARCH)
diff --git a/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_gpu_cc.cc b/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_gpu_cc.cc
index 1cf5d038e..d16a0c6d1 100644
--- a/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_gpu_cc.cc
+++ b/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_gpu_cc.cc
@@ -50,8 +50,7 @@
 #include <volk/volk.h> //volk_alignement
 // includes
 #include <cuda_runtime.h>
-#include <helper_functions.h> // helper for shared functions common to CUDA Samples
-#include <helper_cuda.h> // helper functions for CUDA error checking and initialization
+
 
 /*!
  * \todo Include in definition header file
@@ -131,21 +130,21 @@ Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc(
     multicorrelator_gpu->init_cuda_integrated_resampler(0, NULL, 2 * d_vector_length , GPS_L1_CA_CODE_LENGTH_CHIPS , N_CORRELATORS);
 
 
     // Get space for the resampled early / prompt / late local replicas
-    checkCudaErrors(cudaHostAlloc((void**)&d_local_code_shift_chips, N_CORRELATORS * sizeof(float), cudaHostAllocMapped ));
+    cudaHostAlloc((void**)&d_local_code_shift_chips, N_CORRELATORS * sizeof(float), cudaHostAllocMapped );
 
     //allocate host memory
     //pinned memory mode - use special function to get OS-pinned memory
-    checkCudaErrors(cudaHostAlloc((void**)&in_gpu, 2 * d_vector_length * sizeof(gr_complex), cudaHostAllocMapped ));
+    cudaHostAlloc((void**)&in_gpu, 2 * d_vector_length * sizeof(gr_complex), cudaHostAllocMapped );
 
     //old local codes vector
-    //checkCudaErrors(cudaHostAlloc((void**)&d_local_codes_gpu, (V_LEN * sizeof(gr_complex))*N_CORRELATORS, cudaHostAllocWriteCombined ));
+    // (cudaHostAlloc((void**)&d_local_codes_gpu, (V_LEN * sizeof(gr_complex))*N_CORRELATORS, cudaHostAllocWriteCombined ));
 
     //new integrated shifts
-    //checkCudaErrors(cudaHostAlloc((void**)&d_local_codes_gpu, (2 * d_vector_length * sizeof(gr_complex)), cudaHostAllocWriteCombined ));
+    // (cudaHostAlloc((void**)&d_local_codes_gpu, (2 * d_vector_length * sizeof(gr_complex)), cudaHostAllocWriteCombined ));
 
     // correlator outputs (scalar)
-    checkCudaErrors(cudaHostAlloc((void**)&d_corr_outs_gpu ,sizeof(gr_complex)*N_CORRELATORS, cudaHostAllocWriteCombined ));
+    cudaHostAlloc((void**)&d_corr_outs_gpu ,sizeof(gr_complex)*N_CORRELATORS, cudaHostAllocWriteCombined );
     //map to EPL pointers
     d_Early = &d_corr_outs_gpu[0];
     d_Prompt = &d_corr_outs_gpu[1];
diff --git a/src/algorithms/tracking/libs/CMakeLists.txt b/src/algorithms/tracking/libs/CMakeLists.txt
index e6f66fa68..c6c8c8c99 100644
--- a/src/algorithms/tracking/libs/CMakeLists.txt
+++ b/src/algorithms/tracking/libs/CMakeLists.txt
@@ -28,7 +28,7 @@ if(ENABLE_CUDA)
     CUDA_INCLUDE_DIRECTORIES(
          ${CMAKE_CURRENT_SOURCE_DIR}
-         ${CMAKE_CURRENT_SOURCE_DIR}/cudahelpers
+         #${CMAKE_CURRENT_SOURCE_DIR}/cudahelpers
     )
 
 
     SET(LIB_TYPE STATIC) #set the lib type
diff --git a/src/algorithms/tracking/libs/cuda_multicorrelator.cu b/src/algorithms/tracking/libs/cuda_multicorrelator.cu
index 166bca3c9..43ffeed00 100644
--- a/src/algorithms/tracking/libs/cuda_multicorrelator.cu
+++ b/src/algorithms/tracking/libs/cuda_multicorrelator.cu
@@ -49,9 +49,6 @@
 // For the CUDA runtime routines (prefixed with "cuda_")
 #include <cuda_runtime.h>
-// helper functions and utilities to work with CUDA
-#include <helper_functions.h>
-#include <helper_cuda.h>
 
 
 #define ACCUM_N 256
@@ -224,7 +221,6 @@ __global__ void scalarProdGPUCPXxN(
     //int vectorBase = IMUL(elementN, vec);
     //int vectorEnd = vectorBase + elementN;
 
-    ////////////////////////////////////////////////////////////////////////
     // Each accumulator cycles through vectors with
     // stride equal to number of total number of accumulators ACCUM_N
@@ -392,28 +388,28 @@ bool cuda_multicorrelator::init_cuda(const int argc, const char **argv, int sign
 //	printf("multiProcessorCount= %i \n",prop.multiProcessorCount);
 //	}
 
-    //checkCudaErrors(cudaFuncSetCacheConfig(CUDA_32fc_x2_multiply_x2_dot_prod_32fc_, cudaFuncCachePreferShared));
+    // (cudaFuncSetCacheConfig(CUDA_32fc_x2_multiply_x2_dot_prod_32fc_, cudaFuncCachePreferShared));
 
     // ALLOCATE GPU MEMORY FOR INPUT/OUTPUT and INTERNAL vectors
     size_t size = signal_length_samples * sizeof(GPU_Complex);
 
-    checkCudaErrors(cudaMalloc((void **)&d_sig_in, size));
-    //checkCudaErrors(cudaMalloc((void **)&d_nco_in, size));
-    checkCudaErrors(cudaMalloc((void **)&d_sig_doppler_wiped, size));
+    cudaMalloc((void **)&d_sig_in, size);
+    // (cudaMalloc((void **)&d_nco_in, size));
+    cudaMalloc((void **)&d_sig_doppler_wiped, size);
 
     // old version: all local codes are independent vectors
-    //checkCudaErrors(cudaMalloc((void **)&d_local_codes_in, size*n_correlators));
+    // (cudaMalloc((void **)&d_local_codes_in, size*n_correlators));
 
     // new version: only one vector with extra samples to shift the local code for the correlator set
     // Required: The last correlator tap in d_shifts_samples has the largest sample shift
     size_t size_local_code_bytes = local_codes_length_samples * sizeof(GPU_Complex);
-    checkCudaErrors(cudaMalloc((void **)&d_local_codes_in, size_local_code_bytes));
-    checkCudaErrors(cudaMalloc((void **)&d_shifts_samples, sizeof(int)*n_correlators));
+    cudaMalloc((void **)&d_local_codes_in, size_local_code_bytes);
+    cudaMalloc((void **)&d_shifts_samples, sizeof(int)*n_correlators);
 
     //scalars
-    checkCudaErrors(cudaMalloc((void **)&d_corr_out, sizeof(std::complex<float>)*n_correlators));
+    cudaMalloc((void **)&d_corr_out, sizeof(std::complex<float>)*n_correlators);
 
     // Launch the Vector Add CUDA Kernel
     threadsPerBlock = 256;
@@ -481,30 +477,30 @@ bool cuda_multicorrelator::init_cuda_integrated_resampler(
 //	printf("multiProcessorCount= %i \n",prop.multiProcessorCount);
 //	}
 
-    //checkCudaErrors(cudaFuncSetCacheConfig(CUDA_32fc_x2_multiply_x2_dot_prod_32fc_, cudaFuncCachePreferShared));
+    // (cudaFuncSetCacheConfig(CUDA_32fc_x2_multiply_x2_dot_prod_32fc_, cudaFuncCachePreferShared));
 
     // ALLOCATE GPU MEMORY FOR INPUT/OUTPUT and INTERNAL vectors
     size_t size = signal_length_samples * sizeof(GPU_Complex);
 
-    checkCudaErrors(cudaMalloc((void **)&d_sig_in, size));
-    checkCudaErrors(cudaMemset(d_sig_in,0,size));
+    cudaMalloc((void **)&d_sig_in, size);
+    cudaMemset(d_sig_in,0,size);
 
-    //checkCudaErrors(cudaMalloc((void **)&d_nco_in, size));
-    checkCudaErrors(cudaMalloc((void **)&d_sig_doppler_wiped, size));
-    checkCudaErrors(cudaMemset(d_sig_doppler_wiped,0,size));
+    // (cudaMalloc((void **)&d_nco_in, size));
+    cudaMalloc((void **)&d_sig_doppler_wiped, size);
+    cudaMemset(d_sig_doppler_wiped,0,size);
 
-    checkCudaErrors(cudaMalloc((void **)&d_local_codes_in, sizeof(std::complex<float>)*code_length_chips));
-    checkCudaErrors(cudaMemset(d_local_codes_in,0,sizeof(std::complex<float>)*code_length_chips));
+    cudaMalloc((void **)&d_local_codes_in, sizeof(std::complex<float>)*code_length_chips);
+    cudaMemset(d_local_codes_in,0,sizeof(std::complex<float>)*code_length_chips);
 
     d_code_length_chips=code_length_chips;
 
-    checkCudaErrors(cudaMalloc((void **)&d_shifts_chips, sizeof(float)*n_correlators));
-    checkCudaErrors(cudaMemset(d_shifts_chips,0,sizeof(float)*n_correlators));
+    cudaMalloc((void **)&d_shifts_chips, sizeof(float)*n_correlators);
+    cudaMemset(d_shifts_chips,0,sizeof(float)*n_correlators);
 
     //scalars
-    checkCudaErrors(cudaMalloc((void **)&d_corr_out, sizeof(std::complex<float>)*n_correlators));
-    checkCudaErrors(cudaMemset(d_corr_out,0,sizeof(std::complex<float>)*n_correlators));
+    cudaMalloc((void **)&d_corr_out, sizeof(std::complex<float>)*n_correlators);
+    cudaMemset(d_corr_out,0,sizeof(std::complex<float>)*n_correlators);
 
     // Launch the Vector Add CUDA Kernel
     threadsPerBlock = 256;
@@ -523,12 +519,12 @@ bool cuda_multicorrelator::set_local_code_and_taps(
     )
 {
     // local code CPU -> GPU copy memory
-    checkCudaErrors(cudaMemcpyAsync(d_local_codes_in, local_codes_in, sizeof(GPU_Complex)*code_length_chips, cudaMemcpyHostToDevice,stream1));
+    cudaMemcpyAsync(d_local_codes_in, local_codes_in, sizeof(GPU_Complex)*code_length_chips, cudaMemcpyHostToDevice,stream1);
     d_code_length_chips=(float)code_length_chips;
 
     // Correlator shifts vector CPU -> GPU copy memory (fractional chip shifts are allowed!)
-    checkCudaErrors(cudaMemcpyAsync(d_shifts_chips, shifts_chips, sizeof(float)*n_correlators,
-            cudaMemcpyHostToDevice,stream1));
+    cudaMemcpyAsync(d_shifts_chips, shifts_chips, sizeof(float)*n_correlators,
+            cudaMemcpyHostToDevice,stream1);
     return true;
 }
@@ -550,40 +546,40 @@ bool cuda_multicorrelator::Carrier_wipeoff_multicorrelator_cuda(
 
     // input signal CPU -> GPU copy memory
-    checkCudaErrors(cudaMemcpyAsync(d_sig_in, sig_in, memSize,
-            cudaMemcpyHostToDevice, stream1));
+    cudaMemcpyAsync(d_sig_in, sig_in, memSize,
+            cudaMemcpyHostToDevice, stream1);
 
     //***** NOTICE: NCO is computed on-the-fly, not need to copy NCO into GPU! ****
-    //checkCudaErrors(cudaMemcpyAsync(d_nco_in, nco_in, memSize,
+    // (cudaMemcpyAsync(d_nco_in, nco_in, memSize,
     //        cudaMemcpyHostToDevice, stream1));
 
     // old version: all local codes are independent vectors
-    //checkCudaErrors(cudaMemcpyAsync(d_local_codes_in, local_codes_in, memSize*n_correlators,
+    // (cudaMemcpyAsync(d_local_codes_in, local_codes_in, memSize*n_correlators,
     //        cudaMemcpyHostToDevice, stream2));
 
     // new version: only one vector with extra samples to shift the local code for the correlator set
     // Required: The last correlator tap in d_shifts_samples has the largest sample shift
     // local code CPU -> GPU copy memory
-    checkCudaErrors(cudaMemcpyAsync(d_local_codes_in, local_codes_in, memSize+sizeof(std::complex<float>)*shifts_samples[n_correlators-1],
-            cudaMemcpyHostToDevice, stream2));
+    cudaMemcpyAsync(d_local_codes_in, local_codes_in, memSize+sizeof(std::complex<float>)*shifts_samples[n_correlators-1],
+            cudaMemcpyHostToDevice, stream2);
 
     // Correlator shifts vector CPU -> GPU copy memory
-    checkCudaErrors(cudaMemcpyAsync(d_shifts_samples, shifts_samples, sizeof(int)*n_correlators,
-            cudaMemcpyHostToDevice, stream2));
+    cudaMemcpyAsync(d_shifts_samples, shifts_samples, sizeof(int)*n_correlators,
+            cudaMemcpyHostToDevice, stream2);
 
     //Launch carrier wipe-off kernel here, while local codes are being copied to GPU!
-    checkCudaErrors(cudaStreamSynchronize(stream1));
+    cudaStreamSynchronize(stream1);
     CUDA_32fc_Doppler_wipeoff<<<blocksPerGrid, threadsPerBlock, 0, stream1>>>(d_sig_doppler_wiped, d_sig_in,rem_carrier_phase_in_rad,phase_step_rad, signal_length_samples);
 
     //printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
 
     //wait for Doppler wipeoff end...
-    checkCudaErrors(cudaStreamSynchronize(stream1));
-    checkCudaErrors(cudaStreamSynchronize(stream2));
-    //checkCudaErrors(cudaDeviceSynchronize());
+    cudaStreamSynchronize(stream1);
+    cudaStreamSynchronize(stream2);
+    // (cudaDeviceSynchronize());
 
     //old
     // scalarProdGPUCPXxN<<<blocksPerGrid, threadsPerBlock>>>(
@@ -604,15 +600,15 @@ bool cuda_multicorrelator::Carrier_wipeoff_multicorrelator_cuda(
             n_correlators,
             signal_length_samples
             );
-    checkCudaErrors(cudaGetLastError());
+    cudaGetLastError();
 
     //wait for correlators end...
-    checkCudaErrors(cudaStreamSynchronize(stream2));
+    cudaStreamSynchronize(stream2);
     // Copy the device result vector in device memory to the host result vector
     // in host memory.
 
     //scalar products (correlators outputs)
-    checkCudaErrors(cudaMemcpy(corr_out, d_corr_out, sizeof(std::complex<float>)*n_correlators,
-            cudaMemcpyDeviceToHost));
+    cudaMemcpy(corr_out, d_corr_out, sizeof(std::complex<float>)*n_correlators,
+            cudaMemcpyDeviceToHost);
     return true;
 }
@@ -629,19 +625,19 @@ bool cuda_multicorrelator::Carrier_wipeoff_multicorrelator_resampler_cuda(
     size_t memSize = signal_length_samples * sizeof(std::complex<float>);
 
     // input signal CPU -> GPU copy memory
-    checkCudaErrors(cudaMemcpyAsync(d_sig_in, sig_in, memSize,
-            cudaMemcpyHostToDevice, stream2));
+    cudaMemcpyAsync(d_sig_in, sig_in, memSize,
+            cudaMemcpyHostToDevice, stream2);
 
     //***** NOTICE: NCO is computed on-the-fly, not need to copy NCO into GPU! ****
 
     //Launch carrier wipe-off kernel here, while local codes are being copied to GPU!
-    checkCudaErrors(cudaStreamSynchronize(stream2));
+    cudaStreamSynchronize(stream2);
     CUDA_32fc_Doppler_wipeoff<<<blocksPerGrid, threadsPerBlock, 0, stream1>>>(d_sig_doppler_wiped, d_sig_in,rem_carrier_phase_in_rad,phase_step_rad, signal_length_samples);
 
     //wait for Doppler wipeoff end...
-    checkCudaErrors(cudaStreamSynchronize(stream1));
-    checkCudaErrors(cudaStreamSynchronize(stream2));
+    cudaStreamSynchronize(stream1);
+    cudaStreamSynchronize(stream2);
 
     //launch the multitap correlator with integrated local code resampler!
@@ -657,16 +653,16 @@ bool cuda_multicorrelator::Carrier_wipeoff_multicorrelator_resampler_cuda(
             signal_length_samples
             );
-    checkCudaErrors(cudaGetLastError());
+    cudaGetLastError();
 
     //wait for correlators end...
-    checkCudaErrors(cudaStreamSynchronize(stream1));
+    cudaStreamSynchronize(stream1);
     // Copy the device result vector in device memory to the host result vector
     // in host memory.
 
     //scalar products (correlators outputs)
-    checkCudaErrors(cudaMemcpyAsync(corr_out, d_corr_out, sizeof(std::complex<float>)*n_correlators,
-            cudaMemcpyDeviceToHost,stream1));
-    checkCudaErrors(cudaStreamSynchronize(stream1));
+    cudaMemcpyAsync(corr_out, d_corr_out, sizeof(std::complex<float>)*n_correlators,
+            cudaMemcpyDeviceToHost,stream1);
+    cudaStreamSynchronize(stream1);
     return true;
 }
@@ -708,7 +704,7 @@ bool cuda_multicorrelator::free_cuda()
     // needed to ensure correct operation when the application is being
     // profiled. Calling cudaDeviceReset causes all profile data to be
     // flushed before the application exits
-    //checkCudaErrors(cudaDeviceReset());
+    // (cudaDeviceReset());
 
     return true;
 }
diff --git a/src/algorithms/tracking/libs/cuda_multicorrelator.h b/src/algorithms/tracking/libs/cuda_multicorrelator.h
index 97594e5f4..72fa8db66 100644
--- a/src/algorithms/tracking/libs/cuda_multicorrelator.h
+++ b/src/algorithms/tracking/libs/cuda_multicorrelator.h
@@ -167,5 +167,4 @@ private:
 };
 
-
 #endif /* CUDA_MULTICORRELATOR_H_ */
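
Note on the removal of checkCudaErrors(): every CUDA runtime call in this patch now discards its cudaError_t return value, so a failed allocation, copy, or synchronization passes silently. If error checking is still wanted without depending on the CUDA Samples headers (helper_cuda.h / helper_functions.h) that this patch drops, a small local macro can stand in for checkCudaErrors(). The sketch below is illustrative only and not part of this patch; the macro name CHECK_CUDA is hypothetical, and nothing beyond the CUDA runtime API is assumed:

    // Hypothetical replacement for the removed checkCudaErrors() helper.
    // Wraps any expression returning cudaError_t and aborts with a
    // file/line diagnostic on failure.
    #include <cuda_runtime.h>
    #include <cstdio>
    #include <cstdlib>

    #define CHECK_CUDA(call)                                            \
        do {                                                            \
            cudaError_t err_ = (call);                                  \
            if (err_ != cudaSuccess) {                                  \
                std::fprintf(stderr, "CUDA error '%s' at %s:%d\n",      \
                    cudaGetErrorString(err_), __FILE__, __LINE__);      \
                std::exit(EXIT_FAILURE);                                \
            }                                                           \
        } while (0)

    // Usage, mirroring calls touched by this patch:
    //   CHECK_CUDA(cudaMalloc((void **)&d_sig_in, size));
    //   CHECK_CUDA(cudaMemset(d_sig_in, 0, size));

Kernel launches return void, so the nearest equivalent for them is CHECK_CUDA(cudaGetLastError()) immediately after the launch; the patched code already calls cudaGetLastError() at those points, but now without inspecting the result.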