Mirror of https://github.com/gnss-sdr/gnss-sdr, synced 2025-01-30 10:54:50 +00:00
Removed the checkCudaErrors() wrapper and the CUDA helpers dependencies in the CUDA accelerators
parent 7b57bd28f8
commit 6aef3478cf
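This commit strips the checkCudaErrors() wrapper from every CUDA runtime call and drops the CUDA Samples helper headers (helper_functions.h, helper_cuda.h) that provided it. For context, here is a minimal sketch of what a checkCudaErrors-style wrapper typically does; this is illustrative only, not the actual contents of helper_cuda.h:

// Hedged sketch of a checkCudaErrors-style wrapper, in the spirit of the
// CUDA Samples helper this commit removes. Illustrative, not repository code.
#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>

#define CHECK_CUDA_SKETCH(call)                                \
    do {                                                       \
        cudaError_t err_ = (call);                             \
        if (err_ != cudaSuccess) {                             \
            std::fprintf(stderr, "CUDA error '%s' at %s:%d\n", \
                cudaGetErrorString(err_), __FILE__, __LINE__); \
            std::exit(EXIT_FAILURE);                           \
        }                                                      \
    } while (0)

With the wrapper gone, the call sites below now discard the cudaError_t values that the CUDA runtime returns.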
@@ -48,7 +48,7 @@ include_directories(
     ${GNURADIO_RUNTIME_INCLUDE_DIRS}
     ${VOLK_GNSSSDR_INCLUDE_DIRS}
     ${CUDA_INCLUDE_DIRS}
-    ${CMAKE_SOURCE_DIR}/src/algorithms/tracking/libs/cudahelpers
+    # ${CMAKE_SOURCE_DIR}/src/algorithms/tracking/libs/cudahelpers
 )

 if(ENABLE_GENERIC_ARCH)
@@ -50,8 +50,7 @@
 #include <volk/volk.h> //volk_alignement
 // includes
 #include <cuda_profiler_api.h>
-#include <helper_functions.h> // helper for shared functions common to CUDA Samples
-#include <helper_cuda.h>      // helper functions for CUDA error checking and initialization

 /*!
  * \todo Include in definition header file
@@ -131,21 +130,21 @@ Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc(
     multicorrelator_gpu->init_cuda_integrated_resampler(0, NULL, 2 * d_vector_length , GPS_L1_CA_CODE_LENGTH_CHIPS , N_CORRELATORS);

     // Get space for the resampled early / prompt / late local replicas
-    checkCudaErrors(cudaHostAlloc((void**)&d_local_code_shift_chips, N_CORRELATORS * sizeof(float), cudaHostAllocMapped ));
+    cudaHostAlloc((void**)&d_local_code_shift_chips, N_CORRELATORS * sizeof(float), cudaHostAllocMapped );


     //allocate host memory
     //pinned memory mode - use special function to get OS-pinned memory
-    checkCudaErrors(cudaHostAlloc((void**)&in_gpu, 2 * d_vector_length * sizeof(gr_complex), cudaHostAllocMapped ));
+    cudaHostAlloc((void**)&in_gpu, 2 * d_vector_length * sizeof(gr_complex), cudaHostAllocMapped );

     //old local codes vector
-    //checkCudaErrors(cudaHostAlloc((void**)&d_local_codes_gpu, (V_LEN * sizeof(gr_complex))*N_CORRELATORS, cudaHostAllocWriteCombined ));
+    // (cudaHostAlloc((void**)&d_local_codes_gpu, (V_LEN * sizeof(gr_complex))*N_CORRELATORS, cudaHostAllocWriteCombined ));

     //new integrated shifts
-    //checkCudaErrors(cudaHostAlloc((void**)&d_local_codes_gpu, (2 * d_vector_length * sizeof(gr_complex)), cudaHostAllocWriteCombined ));
+    // (cudaHostAlloc((void**)&d_local_codes_gpu, (2 * d_vector_length * sizeof(gr_complex)), cudaHostAllocWriteCombined ));

     // correlator outputs (scalar)
-    checkCudaErrors(cudaHostAlloc((void**)&d_corr_outs_gpu ,sizeof(gr_complex)*N_CORRELATORS, cudaHostAllocWriteCombined ));
+    cudaHostAlloc((void**)&d_corr_outs_gpu ,sizeof(gr_complex)*N_CORRELATORS, cudaHostAllocWriteCombined );
     //map to EPL pointers
     d_Early = &d_corr_outs_gpu[0];
     d_Prompt = &d_corr_outs_gpu[1];
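Note that the pinned-memory allocations above now ignore the status returned by cudaHostAlloc. If error handling is still wanted without the helper header, the return code can be checked directly. A minimal sketch, assuming the variable names from the diff and a report-and-fail policy that is not part of this commit:

// Hedged sketch: checking cudaHostAlloc directly, without checkCudaErrors.
#include <cuda_runtime.h>
#include <iostream>

static bool allocate_shift_chips(float **d_local_code_shift_chips, int n_correlators)
{
    cudaError_t err = cudaHostAlloc((void **)d_local_code_shift_chips,
        n_correlators * sizeof(float), cudaHostAllocMapped);
    if (err != cudaSuccess)
        {
            std::cerr << "cudaHostAlloc failed: " << cudaGetErrorString(err) << std::endl;
            return false;
        }
    return true;
}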
@@ -28,7 +28,7 @@ if(ENABLE_CUDA)

     CUDA_INCLUDE_DIRECTORIES(
         ${CMAKE_CURRENT_SOURCE_DIR}
-        ${CMAKE_CURRENT_SOURCE_DIR}/cudahelpers
+        #${CMAKE_CURRENT_SOURCE_DIR}/cudahelpers
     )

     SET(LIB_TYPE STATIC) #set the lib type
@@ -49,9 +49,6 @@
 // For the CUDA runtime routines (prefixed with "cuda_")
 #include <cuda_runtime.h>

-// helper functions and utilities to work with CUDA
-#include <helper_cuda.h>
-#include <helper_functions.h>

 #define ACCUM_N 256

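Besides checkCudaErrors, helper_cuda.h also supplies device-selection utilities such as findCudaDevice. With the include removed, equivalent initialization can be written against the plain runtime API. A hedged sketch (the function and its policy are illustrative, not this file's code):

// Hedged sketch: plain runtime-API device selection standing in for the
// CUDA Samples' findCudaDevice(). Illustrative only.
#include <cuda_runtime.h>
#include <cstdio>

static bool select_device(int device_id)
{
    cudaDeviceProp prop;
    if (cudaGetDeviceProperties(&prop, device_id) != cudaSuccess) return false;
    std::printf("Using device %d: %s (multiProcessorCount = %d)\n",
        device_id, prop.name, prop.multiProcessorCount);
    return cudaSetDevice(device_id) == cudaSuccess;
}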
@@ -224,7 +221,6 @@ __global__ void scalarProdGPUCPXxN(
     //int vectorBase = IMUL(elementN, vec);
     //int vectorEnd = vectorBase + elementN;
-

     ////////////////////////////////////////////////////////////////////////
     // Each accumulator cycles through vectors with
     // stride equal to number of total number of accumulators ACCUM_N
@@ -392,28 +388,28 @@ bool cuda_multicorrelator::init_cuda(const int argc, const char **argv, int sign
     // printf("multiProcessorCount= %i \n",prop.multiProcessorCount);
     // }

-    //checkCudaErrors(cudaFuncSetCacheConfig(CUDA_32fc_x2_multiply_x2_dot_prod_32fc_, cudaFuncCachePreferShared));
+    // (cudaFuncSetCacheConfig(CUDA_32fc_x2_multiply_x2_dot_prod_32fc_, cudaFuncCachePreferShared));


     // ALLOCATE GPU MEMORY FOR INPUT/OUTPUT and INTERNAL vectors

     size_t size = signal_length_samples * sizeof(GPU_Complex);

-    checkCudaErrors(cudaMalloc((void **)&d_sig_in, size));
-    //checkCudaErrors(cudaMalloc((void **)&d_nco_in, size));
-    checkCudaErrors(cudaMalloc((void **)&d_sig_doppler_wiped, size));
+    cudaMalloc((void **)&d_sig_in, size);
+    // (cudaMalloc((void **)&d_nco_in, size));
+    cudaMalloc((void **)&d_sig_doppler_wiped, size);

     // old version: all local codes are independent vectors
-    //checkCudaErrors(cudaMalloc((void **)&d_local_codes_in, size*n_correlators));
+    // (cudaMalloc((void **)&d_local_codes_in, size*n_correlators));

     // new version: only one vector with extra samples to shift the local code for the correlator set
     // Required: The last correlator tap in d_shifts_samples has the largest sample shift
     size_t size_local_code_bytes = local_codes_length_samples * sizeof(GPU_Complex);
-    checkCudaErrors(cudaMalloc((void **)&d_local_codes_in, size_local_code_bytes));
-    checkCudaErrors(cudaMalloc((void **)&d_shifts_samples, sizeof(int)*n_correlators));
+    cudaMalloc((void **)&d_local_codes_in, size_local_code_bytes);
+    cudaMalloc((void **)&d_shifts_samples, sizeof(int)*n_correlators);

     //scalars
-    checkCudaErrors(cudaMalloc((void **)&d_corr_out, sizeof(std::complex<float>)*n_correlators));
+    cudaMalloc((void **)&d_corr_out, sizeof(std::complex<float>)*n_correlators);

     // Launch the Vector Add CUDA Kernel
     threadsPerBlock = 256;
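Since init_cuda() returns a bool, allocation failures could still be propagated without the macro by testing each cudaMalloc result. A minimal sketch, assuming the member names from the diff and a fail-fast policy that the commit itself does not adopt:

// Hedged sketch: fail-fast device allocation without checkCudaErrors.
#include <cuda_runtime.h>
#include <complex>

static bool alloc_device_buffers(void **d_sig_in, void **d_sig_doppler_wiped,
    void **d_corr_out, size_t size, int n_correlators)
{
    if (cudaMalloc(d_sig_in, size) != cudaSuccess) return false;
    if (cudaMalloc(d_sig_doppler_wiped, size) != cudaSuccess) return false;
    if (cudaMalloc(d_corr_out, sizeof(std::complex<float>) * n_correlators) != cudaSuccess) return false;
    return true;
}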
@@ -481,30 +477,30 @@ bool cuda_multicorrelator::init_cuda_integrated_resampler(
     // printf("multiProcessorCount= %i \n",prop.multiProcessorCount);
     // }

-    //checkCudaErrors(cudaFuncSetCacheConfig(CUDA_32fc_x2_multiply_x2_dot_prod_32fc_, cudaFuncCachePreferShared));
+    // (cudaFuncSetCacheConfig(CUDA_32fc_x2_multiply_x2_dot_prod_32fc_, cudaFuncCachePreferShared));

     // ALLOCATE GPU MEMORY FOR INPUT/OUTPUT and INTERNAL vectors

     size_t size = signal_length_samples * sizeof(GPU_Complex);

-    checkCudaErrors(cudaMalloc((void **)&d_sig_in, size));
-    checkCudaErrors(cudaMemset(d_sig_in,0,size));
+    cudaMalloc((void **)&d_sig_in, size);
+    cudaMemset(d_sig_in,0,size);

-    //checkCudaErrors(cudaMalloc((void **)&d_nco_in, size));
-    checkCudaErrors(cudaMalloc((void **)&d_sig_doppler_wiped, size));
-    checkCudaErrors(cudaMemset(d_sig_doppler_wiped,0,size));
+    // (cudaMalloc((void **)&d_nco_in, size));
+    cudaMalloc((void **)&d_sig_doppler_wiped, size);
+    cudaMemset(d_sig_doppler_wiped,0,size);

-    checkCudaErrors(cudaMalloc((void **)&d_local_codes_in, sizeof(std::complex<float>)*code_length_chips));
-    checkCudaErrors(cudaMemset(d_local_codes_in,0,sizeof(std::complex<float>)*code_length_chips));
+    cudaMalloc((void **)&d_local_codes_in, sizeof(std::complex<float>)*code_length_chips);
+    cudaMemset(d_local_codes_in,0,sizeof(std::complex<float>)*code_length_chips);

     d_code_length_chips=code_length_chips;

-    checkCudaErrors(cudaMalloc((void **)&d_shifts_chips, sizeof(float)*n_correlators));
-    checkCudaErrors(cudaMemset(d_shifts_chips,0,sizeof(float)*n_correlators));
+    cudaMalloc((void **)&d_shifts_chips, sizeof(float)*n_correlators);
+    cudaMemset(d_shifts_chips,0,sizeof(float)*n_correlators);

     //scalars
-    checkCudaErrors(cudaMalloc((void **)&d_corr_out, sizeof(std::complex<float>)*n_correlators));
-    checkCudaErrors(cudaMemset(d_corr_out,0,sizeof(std::complex<float>)*n_correlators));
+    cudaMalloc((void **)&d_corr_out, sizeof(std::complex<float>)*n_correlators);
+    cudaMemset(d_corr_out,0,sizeof(std::complex<float>)*n_correlators);

     // Launch the Vector Add CUDA Kernel
     threadsPerBlock = 256;
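The resampler variant pairs each cudaMalloc with a cudaMemset so the buffers start zeroed; cudaMemset writes bytes, which zeroes float and complex buffers correctly because the all-zero bit pattern is 0.0f. A hedged sketch of the allocate-and-zero pattern with the status codes checked (illustrative helper, not repository code):

// Hedged sketch: allocate-and-zero with error checks.
#include <cuda_runtime.h>

static bool alloc_zeroed(void **ptr, size_t bytes)
{
    if (cudaMalloc(ptr, bytes) != cudaSuccess) return false;
    if (cudaMemset(*ptr, 0, bytes) != cudaSuccess)
        {
            cudaFree(*ptr);
            return false;
        }
    return true;
}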
@@ -523,12 +519,12 @@ bool cuda_multicorrelator::set_local_code_and_taps(
     )
 {
     // local code CPU -> GPU copy memory
-    checkCudaErrors(cudaMemcpyAsync(d_local_codes_in, local_codes_in, sizeof(GPU_Complex)*code_length_chips, cudaMemcpyHostToDevice,stream1));
+    cudaMemcpyAsync(d_local_codes_in, local_codes_in, sizeof(GPU_Complex)*code_length_chips, cudaMemcpyHostToDevice,stream1);
     d_code_length_chips=(float)code_length_chips;

     // Correlator shifts vector CPU -> GPU copy memory (fractional chip shifts are allowed!)
-    checkCudaErrors(cudaMemcpyAsync(d_shifts_chips, shifts_chips, sizeof(float)*n_correlators,
-        cudaMemcpyHostToDevice,stream1));
+    cudaMemcpyAsync(d_shifts_chips, shifts_chips, sizeof(float)*n_correlators,
+        cudaMemcpyHostToDevice,stream1);

     return true;
 }
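A note on the asynchronous copies above: cudaMemcpyAsync only overlaps with other work when the host buffer is pinned (as with the cudaHostAlloc allocations in the tracking block); with pageable host memory it behaves synchronously with respect to the host. A hedged sketch of the copy-then-synchronize pattern with the status preserved (names are illustrative):

// Hedged sketch: async host-to-device copy of correlator taps, checked.
#include <cuda_runtime.h>

static bool copy_taps_async(float *d_shifts_chips, const float *shifts_chips,
    int n_correlators, cudaStream_t stream1)
{
    cudaError_t err = cudaMemcpyAsync(d_shifts_chips, shifts_chips,
        sizeof(float) * n_correlators, cudaMemcpyHostToDevice, stream1);
    if (err != cudaSuccess) return false;
    // Block until the copy lands before the host buffer may be reused.
    return cudaStreamSynchronize(stream1) == cudaSuccess;
}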
@@ -550,40 +546,40 @@ bool cuda_multicorrelator::Carrier_wipeoff_multicorrelator_cuda(

     // input signal CPU -> GPU copy memory

-    checkCudaErrors(cudaMemcpyAsync(d_sig_in, sig_in, memSize,
-        cudaMemcpyHostToDevice, stream1));
+    cudaMemcpyAsync(d_sig_in, sig_in, memSize,
+        cudaMemcpyHostToDevice, stream1);

     //***** NOTICE: NCO is computed on-the-fly, not need to copy NCO into GPU! ****
-    //checkCudaErrors(cudaMemcpyAsync(d_nco_in, nco_in, memSize,
+    // (cudaMemcpyAsync(d_nco_in, nco_in, memSize,
     //    cudaMemcpyHostToDevice, stream1));


     // old version: all local codes are independent vectors
-    //checkCudaErrors(cudaMemcpyAsync(d_local_codes_in, local_codes_in, memSize*n_correlators,
+    // (cudaMemcpyAsync(d_local_codes_in, local_codes_in, memSize*n_correlators,
     //    cudaMemcpyHostToDevice, stream2));

     // new version: only one vector with extra samples to shift the local code for the correlator set
     // Required: The last correlator tap in d_shifts_samples has the largest sample shift

     // local code CPU -> GPU copy memory
-    checkCudaErrors(cudaMemcpyAsync(d_local_codes_in, local_codes_in, memSize+sizeof(std::complex<float>)*shifts_samples[n_correlators-1],
-        cudaMemcpyHostToDevice, stream2));
+    cudaMemcpyAsync(d_local_codes_in, local_codes_in, memSize+sizeof(std::complex<float>)*shifts_samples[n_correlators-1],
+        cudaMemcpyHostToDevice, stream2);
     // Correlator shifts vector CPU -> GPU copy memory
-    checkCudaErrors(cudaMemcpyAsync(d_shifts_samples, shifts_samples, sizeof(int)*n_correlators,
-        cudaMemcpyHostToDevice, stream2));
+    cudaMemcpyAsync(d_shifts_samples, shifts_samples, sizeof(int)*n_correlators,
+        cudaMemcpyHostToDevice, stream2);


     //Launch carrier wipe-off kernel here, while local codes are being copied to GPU!
-    checkCudaErrors(cudaStreamSynchronize(stream1));
+    cudaStreamSynchronize(stream1);
     CUDA_32fc_Doppler_wipeoff<<<blocksPerGrid, threadsPerBlock,0, stream1>>>(d_sig_doppler_wiped, d_sig_in,rem_carrier_phase_in_rad,phase_step_rad, signal_length_samples);


     //printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);

     //wait for Doppler wipeoff end...
-    checkCudaErrors(cudaStreamSynchronize(stream1));
-    checkCudaErrors(cudaStreamSynchronize(stream2));
-    //checkCudaErrors(cudaDeviceSynchronize());
+    cudaStreamSynchronize(stream1);
+    cudaStreamSynchronize(stream2);
+    // (cudaDeviceSynchronize());

     //old
     // scalarProdGPUCPXxN<<<blocksPerGrid, threadsPerBlock,0 ,stream2>>>(
@@ -604,15 +600,15 @@ bool cuda_multicorrelator::Carrier_wipeoff_multicorrelator_cuda(
         n_correlators,
         signal_length_samples
         );
-    checkCudaErrors(cudaGetLastError());
+    cudaGetLastError();
     //wait for correlators end...
-    checkCudaErrors(cudaStreamSynchronize(stream2));
+    cudaStreamSynchronize(stream2);
     // Copy the device result vector in device memory to the host result vector
     // in host memory.

     //scalar products (correlators outputs)
-    checkCudaErrors(cudaMemcpy(corr_out, d_corr_out, sizeof(std::complex<float>)*n_correlators,
-        cudaMemcpyDeviceToHost));
+    cudaMemcpy(corr_out, d_corr_out, sizeof(std::complex<float>)*n_correlators,
+        cudaMemcpyDeviceToHost);
     return true;
 }

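A bare cudaGetLastError() call, as in the hunk above, clears the sticky kernel-launch error but throws the status away. Capturing the value preserves the check the removed wrapper performed. A minimal sketch (illustrative only):

// Hedged sketch: keeping the kernel-launch check without checkCudaErrors.
#include <cuda_runtime.h>
#include <cstdio>

static bool check_kernel_launch(const char *label)
{
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
        {
            std::fprintf(stderr, "%s: kernel launch failed: %s\n",
                label, cudaGetErrorString(err));
            return false;
        }
    return true;
}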
@@ -629,19 +625,19 @@ bool cuda_multicorrelator::Carrier_wipeoff_multicorrelator_resampler_cuda(

     size_t memSize = signal_length_samples * sizeof(std::complex<float>);
     // input signal CPU -> GPU copy memory
-    checkCudaErrors(cudaMemcpyAsync(d_sig_in, sig_in, memSize,
-        cudaMemcpyHostToDevice, stream2));
+    cudaMemcpyAsync(d_sig_in, sig_in, memSize,
+        cudaMemcpyHostToDevice, stream2);

     //***** NOTICE: NCO is computed on-the-fly, not need to copy NCO into GPU! ****

     //Launch carrier wipe-off kernel here, while local codes are being copied to GPU!
-    checkCudaErrors(cudaStreamSynchronize(stream2));
+    cudaStreamSynchronize(stream2);

     CUDA_32fc_Doppler_wipeoff<<<blocksPerGrid, threadsPerBlock,0, stream2>>>(d_sig_doppler_wiped, d_sig_in,rem_carrier_phase_in_rad,phase_step_rad, signal_length_samples);

     //wait for Doppler wipeoff end...
-    checkCudaErrors(cudaStreamSynchronize(stream1));
-    checkCudaErrors(cudaStreamSynchronize(stream2));
+    cudaStreamSynchronize(stream1);
+    cudaStreamSynchronize(stream2);

     //launch the multitap correlator with integrated local code resampler!

@@ -657,16 +653,16 @@ bool cuda_multicorrelator::Carrier_wipeoff_multicorrelator_resampler_cuda(
         signal_length_samples
         );

-    checkCudaErrors(cudaGetLastError());
+    cudaGetLastError();
     //wait for correlators end...
-    checkCudaErrors(cudaStreamSynchronize(stream1));
+    cudaStreamSynchronize(stream1);
     // Copy the device result vector in device memory to the host result vector
     // in host memory.

     //scalar products (correlators outputs)
-    checkCudaErrors(cudaMemcpyAsync(corr_out, d_corr_out, sizeof(std::complex<float>)*n_correlators,
-        cudaMemcpyDeviceToHost,stream1));
-    checkCudaErrors(cudaStreamSynchronize(stream1));
+    cudaMemcpyAsync(corr_out, d_corr_out, sizeof(std::complex<float>)*n_correlators,
+        cudaMemcpyDeviceToHost,stream1);
+    cudaStreamSynchronize(stream1);
     return true;
 }

@@ -708,7 +704,7 @@ bool cuda_multicorrelator::free_cuda()
     // needed to ensure correct operation when the application is being
     // profiled. Calling cudaDeviceReset causes all profile data to be
     // flushed before the application exits
-    //checkCudaErrors(cudaDeviceReset());
+    // (cudaDeviceReset());
     return true;
 }

@@ -167,5 +167,4 @@ private:

 };

-
 #endif /* CUDA_MULTICORRELATOR_H_ */