diff --git a/src/algorithms/tracking/libs/cpu_multicorrelator.cc b/src/algorithms/tracking/libs/cpu_multicorrelator.cc
index c201a5df7..53ee5056c 100644
--- a/src/algorithms/tracking/libs/cpu_multicorrelator.cc
+++ b/src/algorithms/tracking/libs/cpu_multicorrelator.cc
@@ -1,11 +1,11 @@
/*!
* \file cpu_multicorrelator.cc
- * \brief High optimized CPU vector multiTAP correlator class
+ * \brief Highly optimized CPU vector multiTAP correlator class
* \authors
* - Javier Arribas, 2015. jarribas(at)cttc.es
*
*
- * Class that implements a high optimized vector multiTAP correlator class for CPUs
+ * Class that implements a highly optimized vector multiTAP correlator class for CPUs
*
* -------------------------------------------------------------------------
*
diff --git a/src/algorithms/tracking/libs/cpu_multicorrelator.h b/src/algorithms/tracking/libs/cpu_multicorrelator.h
index d0f73d616..17c1179b6 100644
--- a/src/algorithms/tracking/libs/cpu_multicorrelator.h
+++ b/src/algorithms/tracking/libs/cpu_multicorrelator.h
@@ -65,4 +65,4 @@ private:
};
-#endif /* CPU_MULTICORRELATOR_H_ */
+#endif /* GNSS_SDR_CPU_MULTICORRELATOR_H_ */
diff --git a/src/algorithms/tracking/libs/cpu_multicorrelator_16sc.cc b/src/algorithms/tracking/libs/cpu_multicorrelator_16sc.cc
index 107a38f7d..82eda568c 100644
--- a/src/algorithms/tracking/libs/cpu_multicorrelator_16sc.cc
+++ b/src/algorithms/tracking/libs/cpu_multicorrelator_16sc.cc
@@ -1,11 +1,11 @@
/*!
* \file cpu_multicorrelator_16sc.cc
- * \brief High optimized CPU vector multiTAP correlator class
+ * \brief Highly optimized CPU vector multiTAP correlator class
* \authors
* - Javier Arribas, 2015. jarribas(at)cttc.es
*
*
- * Class that implements a high optimized vector multiTAP correlator class for CPUs
+ * Class that implements a highly optimized vector multiTAP correlator class for CPUs
*
* -------------------------------------------------------------------------
*
diff --git a/src/algorithms/tracking/libs/cpu_multicorrelator_16sc.h b/src/algorithms/tracking/libs/cpu_multicorrelator_16sc.h
index 7ef9549e8..10c831462 100644
--- a/src/algorithms/tracking/libs/cpu_multicorrelator_16sc.h
+++ b/src/algorithms/tracking/libs/cpu_multicorrelator_16sc.h
@@ -1,11 +1,11 @@
/*!
* \file cpu_multicorrelator_16sc.h
- * \brief High optimized CPU vector multiTAP correlator class for lv_16sc_t (short int complex)
+ * \brief Highly optimized CPU vector multiTAP correlator class for lv_16sc_t (short int complex)
* \authors
* - Javier Arribas, 2016. jarribas(at)cttc.es
*
*
- * Class that implements a high optimized vector multiTAP correlator class for CPUs
+ * Class that implements a highly optimized vector multiTAP correlator class for CPUs
*
* -------------------------------------------------------------------------
*
diff --git a/src/algorithms/tracking/libs/cpu_multicorrelator_real_codes.cc b/src/algorithms/tracking/libs/cpu_multicorrelator_real_codes.cc
index 4bd70cde6..73219b4e2 100644
--- a/src/algorithms/tracking/libs/cpu_multicorrelator_real_codes.cc
+++ b/src/algorithms/tracking/libs/cpu_multicorrelator_real_codes.cc
@@ -6,7 +6,7 @@
* Cillian O'Driscoll, 2017. cillian.odriscoll(at)gmail.com
*
*
- * Class that implements a high optimized vector multiTAP correlator class for CPUs
+ * Class that implements a highly optimized vector multiTAP correlator class for CPUs
*
* -------------------------------------------------------------------------
*
diff --git a/src/algorithms/tracking/libs/cpu_multicorrelator_real_codes.h b/src/algorithms/tracking/libs/cpu_multicorrelator_real_codes.h
index 48a94506d..83f083da3 100644
--- a/src/algorithms/tracking/libs/cpu_multicorrelator_real_codes.h
+++ b/src/algorithms/tracking/libs/cpu_multicorrelator_real_codes.h
@@ -6,7 +6,7 @@
* Cillian O'Driscoll, 2017, cillian.odriscoll(at)gmail.com
*
*
- * Class that implements a high optimized vector multiTAP correlator class for CPUs
+ * Class that implements a highly optimized vector multiTAP correlator class for CPUs
*
* -------------------------------------------------------------------------
*
diff --git a/src/algorithms/tracking/libs/cuda_multicorrelator.cu b/src/algorithms/tracking/libs/cuda_multicorrelator.cu
index 38aefbcd5..27a46d973 100644
--- a/src/algorithms/tracking/libs/cuda_multicorrelator.cu
+++ b/src/algorithms/tracking/libs/cuda_multicorrelator.cu
@@ -1,11 +1,11 @@
/*!
* \file cuda_multicorrelator.cu
- * \brief High optimized CUDA GPU vector multiTAP correlator class
+ * \brief Highly optimized CUDA GPU vector multiTAP correlator class
* \authors
* - Javier Arribas, 2015. jarribas(at)cttc.es
*
*
- * Class that implements a high optimized vector multiTAP correlator class for NVIDIA CUDA GPUs
+ * Class that implements a highly optimized vector multiTAP correlator class for NVIDIA CUDA GPUs
*
* -------------------------------------------------------------------------
*
@@ -33,9 +33,8 @@
*/
#include "cuda_multicorrelator.h"
-
-#include
#include
+#include
// For the CUDA runtime routines (prefixed with "cuda_")
#include
@@ -53,22 +52,21 @@ __global__ void Doppler_wippe_scalarProdGPUCPXxN_shifts_chips(
int vectorN,
int elementN,
float rem_carrier_phase_in_rad,
- float phase_step_rad
-)
+ float phase_step_rad)
{
//Accumulators cache
__shared__ GPU_Complex accumResult[ACCUM_N];
- // CUDA version of floating point NCO and vector dot product integrated
+ // CUDA version of floating point NCO and vector dot product integrated
float sin;
float cos;
for (int i = blockIdx.x * blockDim.x + threadIdx.x;
i < elementN;
i += blockDim.x * gridDim.x)
- {
- __sincosf(rem_carrier_phase_in_rad + i*phase_step_rad, &sin, &cos);
- d_sig_wiped[i] = d_sig_in[i] * GPU_Complex(cos,-sin);
- }
+ {
+ __sincosf(rem_carrier_phase_in_rad + i * phase_step_rad, &sin, &cos);
+ d_sig_wiped[i] = d_sig_in[i] * GPU_Complex(cos, -sin);
+ }
__syncthreads();
////////////////////////////////////////////////////////////////////////////
@@ -77,273 +75,279 @@ __global__ void Doppler_wippe_scalarProdGPUCPXxN_shifts_chips(
// from total number of thread blocks
////////////////////////////////////////////////////////////////////////////
for (int vec = blockIdx.x; vec < vectorN; vec += gridDim.x)
- {
- //int vectorBase = IMUL(elementN, vec);
- //int vectorEnd = elementN;
-
- ////////////////////////////////////////////////////////////////////////
- // Each accumulator cycles through vectors with
- // stride equal to number of total number of accumulators ACCUM_N
- // At this stage ACCUM_N is only preferred be a multiple of warp size
- // to meet memory coalescing alignment constraints.
- ////////////////////////////////////////////////////////////////////////
- for (int iAccum = threadIdx.x; iAccum < ACCUM_N; iAccum += blockDim.x)
{
- GPU_Complex sum = GPU_Complex(0,0);
- float local_code_chip_index=0.0;;
- //float code_phase;
- for (int pos = iAccum; pos < elementN; pos += ACCUM_N)
- {
- //original sample code
- //sum = sum + d_sig_in[pos-vectorBase] * d_nco_in[pos-vectorBase] * d_local_codes_in[pos];
- //sum = sum + d_sig_in[pos-vectorBase] * d_local_codes_in[pos];
- //sum.multiply_acc(d_sig_in[pos],d_local_codes_in[pos+d_shifts_samples[vec]]);
+ //int vectorBase = IMUL(elementN, vec);
+ //int vectorEnd = elementN;
- //custom code for multitap correlator
- // 1.resample local code for the current shift
+ ////////////////////////////////////////////////////////////////////////
+ // Each accumulator cycles through vectors with
+ // stride equal to number of total number of accumulators ACCUM_N
+ // At this stage ACCUM_N is only preferred be a multiple of warp size
+ // to meet memory coalescing alignment constraints.
+ ////////////////////////////////////////////////////////////////////////
+ for (int iAccum = threadIdx.x; iAccum < ACCUM_N; iAccum += blockDim.x)
+ {
+ GPU_Complex sum = GPU_Complex(0, 0);
+ float local_code_chip_index = 0.0;
+ ;
+ //float code_phase;
+ for (int pos = iAccum; pos < elementN; pos += ACCUM_N)
+ {
+ //original sample code
+ //sum = sum + d_sig_in[pos-vectorBase] * d_nco_in[pos-vectorBase] * d_local_codes_in[pos];
+ //sum = sum + d_sig_in[pos-vectorBase] * d_local_codes_in[pos];
+ //sum.multiply_acc(d_sig_in[pos],d_local_codes_in[pos+d_shifts_samples[vec]]);
- local_code_chip_index= fmodf(code_phase_step_chips*__int2float_rd(pos)+ d_shifts_chips[vec] - rem_code_phase_chips, code_length_chips);
+ //custom code for multitap correlator
+ // 1.resample local code for the current shift
- //Take into account that in multitap correlators, the shifts can be negative!
- if (local_code_chip_index<0.0) local_code_chip_index+=(code_length_chips-1);
- //printf("vec= %i, pos %i, chip_idx=%i chip_shift=%f \r\n",vec, pos,__float2int_rd(local_code_chip_index),local_code_chip_index);
- // 2.correlate
- sum.multiply_acc(d_sig_wiped[pos],d_local_code_in[__float2int_rd(local_code_chip_index)]);
+ local_code_chip_index = fmodf(code_phase_step_chips * __int2float_rd(pos) + d_shifts_chips[vec] - rem_code_phase_chips, code_length_chips);
- }
- accumResult[iAccum] = sum;
+ //Take into account that in multitap correlators, the shifts can be negative!
+ if (local_code_chip_index < 0.0) local_code_chip_index += (code_length_chips - 1);
+ //printf("vec= %i, pos %i, chip_idx=%i chip_shift=%f \r\n",vec, pos,__float2int_rd(local_code_chip_index),local_code_chip_index);
+ // 2.correlate
+ sum.multiply_acc(d_sig_wiped[pos], d_local_code_in[__float2int_rd(local_code_chip_index)]);
+ }
+ accumResult[iAccum] = sum;
+ }
+
+ ////////////////////////////////////////////////////////////////////////
+ // Perform tree-like reduction of accumulators' results.
+ // ACCUM_N has to be power of two at this stage
+ ////////////////////////////////////////////////////////////////////////
+ for (int stride = ACCUM_N / 2; stride > 0; stride >>= 1)
+ {
+ __syncthreads();
+
+ for (int iAccum = threadIdx.x; iAccum < stride; iAccum += blockDim.x)
+ {
+ accumResult[iAccum] += accumResult[stride + iAccum];
+ }
+ }
+
+ if (threadIdx.x == 0)
+ {
+ d_corr_out[vec] = accumResult[0];
+ }
}
-
- ////////////////////////////////////////////////////////////////////////
- // Perform tree-like reduction of accumulators' results.
- // ACCUM_N has to be power of two at this stage
- ////////////////////////////////////////////////////////////////////////
- for (int stride = ACCUM_N / 2; stride > 0; stride >>= 1)
- {
- __syncthreads();
-
- for (int iAccum = threadIdx.x; iAccum < stride; iAccum += blockDim.x)
- {
- accumResult[iAccum] += accumResult[stride + iAccum];
- }
- }
-
- if (threadIdx.x == 0)
- {
- d_corr_out[vec] = accumResult[0];
- }
- }
}
+
bool cuda_multicorrelator::init_cuda_integrated_resampler(
- int signal_length_samples,
- int code_length_chips,
- int n_correlators
- )
+ int signal_length_samples,
+ int code_length_chips,
+ int n_correlators)
{
- // use command-line specified CUDA device, otherwise use device with highest Gflops/s
-// findCudaDevice(argc, (const char **)argv);
- cudaDeviceProp prop;
+ // use command-line specified CUDA device, otherwise use device with highest Gflops/s
+ // findCudaDevice(argc, (const char **)argv);
+ cudaDeviceProp prop;
int num_devices, device;
cudaGetDeviceCount(&num_devices);
- if (num_devices > 1) {
- int max_multiprocessors = 0, max_device = 0;
- for (device = 0; device < num_devices; device++) {
- cudaDeviceProp properties;
- cudaGetDeviceProperties(&properties, device);
- if (max_multiprocessors < properties.multiProcessorCount) {
- max_multiprocessors = properties.multiProcessorCount;
- max_device = device;
- }
- printf("Found GPU device # %i\n",device);
- }
- //cudaSetDevice(max_device);
+ if (num_devices > 1)
+ {
+ int max_multiprocessors = 0, max_device = 0;
+ for (device = 0; device < num_devices; device++)
+ {
+ cudaDeviceProp properties;
+ cudaGetDeviceProperties(&properties, device);
+ if (max_multiprocessors < properties.multiProcessorCount)
+ {
+ max_multiprocessors = properties.multiProcessorCount;
+ max_device = device;
+ }
+ printf("Found GPU device # %i\n", device);
+ }
+ //cudaSetDevice(max_device);
- //set random device!
- selected_gps_device=rand() % num_devices;//generates a random number between 0 and num_devices to split the threads between GPUs
- cudaSetDevice(selected_gps_device);
+ //set random device!
+ selected_gps_device = rand() % num_devices; //generates a random number between 0 and num_devices to split the threads between GPUs
+ cudaSetDevice(selected_gps_device);
- cudaGetDeviceProperties( &prop, max_device );
- //debug code
- if (prop.canMapHostMemory != 1) {
- printf( "Device can not map memory.\n" );
- }
- printf("L2 Cache size= %u \n",prop.l2CacheSize);
- printf("maxThreadsPerBlock= %u \n",prop.maxThreadsPerBlock);
- printf("maxGridSize= %i \n",prop.maxGridSize[0]);
- printf("sharedMemPerBlock= %lu \n",prop.sharedMemPerBlock);
- printf("deviceOverlap= %i \n",prop.deviceOverlap);
- printf("multiProcessorCount= %i \n",prop.multiProcessorCount);
- }else{
- cudaGetDevice( &selected_gps_device);
- cudaGetDeviceProperties( &prop, selected_gps_device );
- //debug code
- if (prop.canMapHostMemory != 1) {
- printf( "Device can not map memory.\n" );
- }
+ cudaGetDeviceProperties(&prop, max_device);
+ //debug code
+ if (prop.canMapHostMemory != 1)
+ {
+ printf("Device can not map memory.\n");
+ }
+ printf("L2 Cache size= %u \n", prop.l2CacheSize);
+ printf("maxThreadsPerBlock= %u \n", prop.maxThreadsPerBlock);
+ printf("maxGridSize= %i \n", prop.maxGridSize[0]);
+ printf("sharedMemPerBlock= %lu \n", prop.sharedMemPerBlock);
+ printf("deviceOverlap= %i \n", prop.deviceOverlap);
+ printf("multiProcessorCount= %i \n", prop.multiProcessorCount);
+ }
+ else
+ {
+ cudaGetDevice(&selected_gps_device);
+ cudaGetDeviceProperties(&prop, selected_gps_device);
+ //debug code
+ if (prop.canMapHostMemory != 1)
+ {
+ printf("Device can not map memory.\n");
+ }
- printf("L2 Cache size= %u \n",prop.l2CacheSize);
- printf("maxThreadsPerBlock= %u \n",prop.maxThreadsPerBlock);
- printf("maxGridSize= %i \n",prop.maxGridSize[0]);
- printf("sharedMemPerBlock= %lu \n",prop.sharedMemPerBlock);
- printf("deviceOverlap= %i \n",prop.deviceOverlap);
- printf("multiProcessorCount= %i \n",prop.multiProcessorCount);
- }
+ printf("L2 Cache size= %u \n", prop.l2CacheSize);
+ printf("maxThreadsPerBlock= %u \n", prop.maxThreadsPerBlock);
+ printf("maxGridSize= %i \n", prop.maxGridSize[0]);
+ printf("sharedMemPerBlock= %lu \n", prop.sharedMemPerBlock);
+ printf("deviceOverlap= %i \n", prop.deviceOverlap);
+ printf("multiProcessorCount= %i \n", prop.multiProcessorCount);
+ }
- // (cudaFuncSetCacheConfig(CUDA_32fc_x2_multiply_x2_dot_prod_32fc_, cudaFuncCachePreferShared));
+ // (cudaFuncSetCacheConfig(CUDA_32fc_x2_multiply_x2_dot_prod_32fc_, cudaFuncCachePreferShared));
// ALLOCATE GPU MEMORY FOR INPUT/OUTPUT and INTERNAL vectors
size_t size = signal_length_samples * sizeof(GPU_Complex);
- //********* ZERO COPY VERSION ************
- // Set flag to enable zero copy access
+ //********* ZERO COPY VERSION ************
+ // Set flag to enable zero copy access
// Optimal in shared memory devices (like Jetson K1)
- //cudaSetDeviceFlags(cudaDeviceMapHost);
+ //cudaSetDeviceFlags(cudaDeviceMapHost);
- //******** CudaMalloc version ***********
+ //******** CudaMalloc version ***********
- // input signal GPU memory (can be mapped to CPU memory in shared memory devices!)
- // cudaMalloc((void **)&d_sig_in, size);
- // cudaMemset(d_sig_in,0,size);
+ // input signal GPU memory (can be mapped to CPU memory in shared memory devices!)
+ // cudaMalloc((void **)&d_sig_in, size);
+ // cudaMemset(d_sig_in,0,size);
- // Doppler-free signal (internal GPU memory)
- cudaMalloc((void **)&d_sig_doppler_wiped, size);
- cudaMemset(d_sig_doppler_wiped,0,size);
+ // Doppler-free signal (internal GPU memory)
+ cudaMalloc((void **)&d_sig_doppler_wiped, size);
+ cudaMemset(d_sig_doppler_wiped, 0, size);
- // Local code GPU memory (can be mapped to CPU memory in shared memory devices!)
- cudaMalloc((void **)&d_local_codes_in, sizeof(std::complex)*code_length_chips);
- cudaMemset(d_local_codes_in,0,sizeof(std::complex)*code_length_chips);
+ // Local code GPU memory (can be mapped to CPU memory in shared memory devices!)
+ cudaMalloc((void **)&d_local_codes_in, sizeof(std::complex) * code_length_chips);
+ cudaMemset(d_local_codes_in, 0, sizeof(std::complex) * code_length_chips);
- d_code_length_chips=code_length_chips;
+ d_code_length_chips = code_length_chips;
- // Vector with the chip shifts for each correlator tap
+ // Vector with the chip shifts for each correlator tap
//GPU memory (can be mapped to CPU memory in shared memory devices!)
- cudaMalloc((void **)&d_shifts_chips, sizeof(float)*n_correlators);
- cudaMemset(d_shifts_chips,0,sizeof(float)*n_correlators);
+ cudaMalloc((void **)&d_shifts_chips, sizeof(float) * n_correlators);
+ cudaMemset(d_shifts_chips, 0, sizeof(float) * n_correlators);
- //scalars
- //cudaMalloc((void **)&d_corr_out, sizeof(std::complex)*n_correlators);
- //cudaMemset(d_corr_out,0,sizeof(std::complex)*n_correlators);
+ //scalars
+ //cudaMalloc((void **)&d_corr_out, sizeof(std::complex)*n_correlators);
+ //cudaMemset(d_corr_out,0,sizeof(std::complex)*n_correlators);
// Launch the Vector Add CUDA Kernel
// TODO: write a smart load balance using device info!
- threadsPerBlock = 64;
- blocksPerGrid = 128;//(int)(signal_length_samples+threadsPerBlock-1)/threadsPerBlock;
+ threadsPerBlock = 64;
+ blocksPerGrid = 128; //(int)(signal_length_samples+threadsPerBlock-1)/threadsPerBlock;
- cudaStreamCreate (&stream1) ;
- //cudaStreamCreate (&stream2) ;
- return true;
+ cudaStreamCreate(&stream1);
+ //cudaStreamCreate (&stream2) ;
+ return true;
}
+
bool cuda_multicorrelator::set_local_code_and_taps(
- int code_length_chips,
- const std::complex* local_codes_in,
- float *shifts_chips,
- int n_correlators
- )
+ int code_length_chips,
+ const std::complex *local_codes_in,
+ float *shifts_chips,
+ int n_correlators)
{
+ cudaSetDevice(selected_gps_device);
+ //********* ZERO COPY VERSION ************
+ // // Get device pointer from host memory. No allocation or memcpy
+ // cudaError_t code;
+ // // local code CPU -> GPU copy memory
+ // code=cudaHostGetDevicePointer((void **)&d_local_codes_in, (void *) local_codes_in, 0);
+ // if (code!=cudaSuccess)
+ // {
+ // printf("cuda cudaHostGetDevicePointer error in set_local_code_and_taps \r\n");
+ // }
+ // // Correlator shifts vector CPU -> GPU copy memory (fractional chip shifts are allowed!)
+ // code=cudaHostGetDevicePointer((void **)&d_shifts_chips, (void *) shifts_chips, 0);
+ // if (code!=cudaSuccess)
+ // {
+ // printf("cuda cudaHostGetDevicePointer error in set_local_code_and_taps \r\n");
+ // }
- cudaSetDevice(selected_gps_device);
- //********* ZERO COPY VERSION ************
-// // Get device pointer from host memory. No allocation or memcpy
-// cudaError_t code;
-// // local code CPU -> GPU copy memory
-// code=cudaHostGetDevicePointer((void **)&d_local_codes_in, (void *) local_codes_in, 0);
-// if (code!=cudaSuccess)
-// {
-// printf("cuda cudaHostGetDevicePointer error in set_local_code_and_taps \r\n");
-// }
-// // Correlator shifts vector CPU -> GPU copy memory (fractional chip shifts are allowed!)
-// code=cudaHostGetDevicePointer((void **)&d_shifts_chips, (void *) shifts_chips, 0);
-// if (code!=cudaSuccess)
-// {
-// printf("cuda cudaHostGetDevicePointer error in set_local_code_and_taps \r\n");
-// }
-
- //******** CudaMalloc version ***********
+ //******** CudaMalloc version ***********
//local code CPU -> GPU copy memory
- cudaMemcpyAsync(d_local_codes_in, local_codes_in, sizeof(GPU_Complex)*code_length_chips, cudaMemcpyHostToDevice,stream1);
- d_code_length_chips=code_length_chips;
+ cudaMemcpyAsync(d_local_codes_in, local_codes_in, sizeof(GPU_Complex) * code_length_chips, cudaMemcpyHostToDevice, stream1);
+ d_code_length_chips = code_length_chips;
//Correlator shifts vector CPU -> GPU copy memory (fractional chip shifts are allowed!)
- cudaMemcpyAsync(d_shifts_chips, shifts_chips, sizeof(float)*n_correlators,
- cudaMemcpyHostToDevice,stream1);
+ cudaMemcpyAsync(d_shifts_chips, shifts_chips, sizeof(float) * n_correlators,
+ cudaMemcpyHostToDevice, stream1);
- return true;
+ return true;
}
+
bool cuda_multicorrelator::set_input_output_vectors(
- std::complex* corr_out,
- std::complex* sig_in
- )
+ std::complex *corr_out,
+ std::complex *sig_in)
{
+ cudaSetDevice(selected_gps_device);
+ // Save CPU pointers
+ d_sig_in_cpu = sig_in;
+ d_corr_out_cpu = corr_out;
- cudaSetDevice(selected_gps_device);
- // Save CPU pointers
- d_sig_in_cpu =sig_in;
- d_corr_out_cpu = corr_out;
-
- // Zero Copy version
- // Get device pointer from host memory. No allocation or memcpy
- cudaError_t code;
- code=cudaHostGetDevicePointer((void **)&d_sig_in, (void *) sig_in, 0);
- code=cudaHostGetDevicePointer((void **)&d_corr_out, (void *) corr_out, 0);
- if (code!=cudaSuccess)
- {
- printf("cuda cudaHostGetDevicePointer error \r\n");
- }
- return true;
-
+ // Zero Copy version
+ // Get device pointer from host memory. No allocation or memcpy
+ cudaError_t code;
+ code = cudaHostGetDevicePointer((void **)&d_sig_in, (void *)sig_in, 0);
+ code = cudaHostGetDevicePointer((void **)&d_corr_out, (void *)corr_out, 0);
+ if (code != cudaSuccess)
+ {
+ printf("cuda cudaHostGetDevicePointer error \r\n");
+ }
+ return true;
}
-#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
-inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
+#define gpuErrchk(ans) \
+ { \
+ gpuAssert((ans), __FILE__, __LINE__); \
+ }
+inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
- if (code != cudaSuccess)
- {
- fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
- if (abort) exit(code);
- }
+ if (code != cudaSuccess)
+ {
+ fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
+ if (abort) exit(code);
+ }
}
+
bool cuda_multicorrelator::Carrier_wipeoff_multicorrelator_resampler_cuda(
- float rem_carrier_phase_in_rad,
- float phase_step_rad,
- float code_phase_step_chips,
- float rem_code_phase_chips,
- int signal_length_samples,
- int n_correlators)
- {
-
- cudaSetDevice(selected_gps_device);
- // cudaMemCpy version
- //size_t memSize = signal_length_samples * sizeof(std::complex);
- // input signal CPU -> GPU copy memory
+ float rem_carrier_phase_in_rad,
+ float phase_step_rad,
+ float code_phase_step_chips,
+ float rem_code_phase_chips,
+ int signal_length_samples,
+ int n_correlators)
+{
+ cudaSetDevice(selected_gps_device);
+ // cudaMemCpy version
+ //size_t memSize = signal_length_samples * sizeof(std::complex);
+ // input signal CPU -> GPU copy memory
//cudaMemcpyAsync(d_sig_in, d_sig_in_cpu, memSize,
// cudaMemcpyHostToDevice, stream2);
//***** NOTICE: NCO is computed on-the-fly, not need to copy NCO into GPU! ****
//launch the multitap correlator with integrated local code resampler!
- Doppler_wippe_scalarProdGPUCPXxN_shifts_chips<<>>(
- d_corr_out,
- d_sig_in,
- d_sig_doppler_wiped,
- d_local_codes_in,
- d_shifts_chips,
- d_code_length_chips,
- code_phase_step_chips,
- rem_code_phase_chips,
- n_correlators,
- signal_length_samples,
- rem_carrier_phase_in_rad,
- phase_step_rad
- );
+ Doppler_wippe_scalarProdGPUCPXxN_shifts_chips<<>>(
+ d_corr_out,
+ d_sig_in,
+ d_sig_doppler_wiped,
+ d_local_codes_in,
+ d_shifts_chips,
+ d_code_length_chips,
+ code_phase_step_chips,
+ rem_code_phase_chips,
+ n_correlators,
+ signal_length_samples,
+ rem_carrier_phase_in_rad,
+ phase_step_rad);
- gpuErrchk( cudaPeekAtLastError() );
- gpuErrchk( cudaStreamSynchronize(stream1));
+ gpuErrchk(cudaPeekAtLastError());
+ gpuErrchk(cudaStreamSynchronize(stream1));
- // cudaMemCpy version
+ // cudaMemCpy version
// Copy the device result vector in device memory to the host result vector
// in host memory.
//scalar products (correlators outputs)
@@ -352,37 +356,38 @@ bool cuda_multicorrelator::Carrier_wipeoff_multicorrelator_resampler_cuda(
return true;
}
+
cuda_multicorrelator::cuda_multicorrelator()
{
- d_sig_in=NULL;
- d_nco_in=NULL;
- d_sig_doppler_wiped=NULL;
- d_local_codes_in=NULL;
- d_shifts_samples=NULL;
- d_shifts_chips=NULL;
- d_corr_out=NULL;
- threadsPerBlock=0;
- blocksPerGrid=0;
- d_code_length_chips=0;
+ d_sig_in = NULL;
+ d_nco_in = NULL;
+ d_sig_doppler_wiped = NULL;
+ d_local_codes_in = NULL;
+ d_shifts_samples = NULL;
+ d_shifts_chips = NULL;
+ d_corr_out = NULL;
+ threadsPerBlock = 0;
+ blocksPerGrid = 0;
+ d_code_length_chips = 0;
}
+
bool cuda_multicorrelator::free_cuda()
{
- // Free device global memory
- if (d_sig_in!=NULL) cudaFree(d_sig_in);
- if (d_nco_in!=NULL) cudaFree(d_nco_in);
- if (d_sig_doppler_wiped!=NULL) cudaFree(d_sig_doppler_wiped);
- if (d_local_codes_in!=NULL) cudaFree(d_local_codes_in);
- if (d_corr_out!=NULL) cudaFree(d_corr_out);
- if (d_shifts_samples!=NULL) cudaFree(d_shifts_samples);
- if (d_shifts_chips!=NULL) cudaFree(d_shifts_chips);
+ // Free device global memory
+ if (d_sig_in != NULL) cudaFree(d_sig_in);
+ if (d_nco_in != NULL) cudaFree(d_nco_in);
+ if (d_sig_doppler_wiped != NULL) cudaFree(d_sig_doppler_wiped);
+ if (d_local_codes_in != NULL) cudaFree(d_local_codes_in);
+ if (d_corr_out != NULL) cudaFree(d_corr_out);
+ if (d_shifts_samples != NULL) cudaFree(d_shifts_samples);
+ if (d_shifts_chips != NULL) cudaFree(d_shifts_chips);
// Reset the device and exit
// cudaDeviceReset causes the driver to clean up all state. While
// not mandatory in normal operation, it is good practice. It is also
// needed to ensure correct operation when the application is being
// profiled. Calling cudaDeviceReset causes all profile data to be
// flushed before the application exits
- cudaDeviceReset();
- return true;
+ cudaDeviceReset();
+ return true;
}
-
diff --git a/src/algorithms/tracking/libs/cuda_multicorrelator.h b/src/algorithms/tracking/libs/cuda_multicorrelator.h
index f75d72675..dad770a8a 100644
--- a/src/algorithms/tracking/libs/cuda_multicorrelator.h
+++ b/src/algorithms/tracking/libs/cuda_multicorrelator.h
@@ -1,11 +1,11 @@
/*!
* \file cuda_multicorrelator.h
- * \brief High optimized CUDA GPU vector multiTAP correlator class
+ * \brief Highly optimized CUDA GPU vector multiTAP correlator class
* \authors
* - Javier Arribas, 2015. jarribas(at)cttc.es
*
*
- * Class that implements a high optimized vector multiTAP correlator class for NVIDIA CUDA GPUs
+ * Class that implements a highly optimized vector multiTAP correlator class for NVIDIA CUDA GPUs
*
* -------------------------------------------------------------------------
*
@@ -92,6 +92,7 @@ struct GPU_Complex
}
};
+
struct GPU_Complex_Short
{
float r;
@@ -149,7 +150,6 @@ private:
GPU_Complex* d_local_codes_in;
GPU_Complex* d_corr_out;
- //
std::complex* d_sig_in_cpu;
std::complex* d_corr_out_cpu;
diff --git a/src/algorithms/tracking/libs/tracking_discriminators.cc b/src/algorithms/tracking/libs/tracking_discriminators.cc
index f1944ddc5..b6ab1490e 100644
--- a/src/algorithms/tracking/libs/tracking_discriminators.cc
+++ b/src/algorithms/tracking/libs/tracking_discriminators.cc
@@ -36,6 +36,7 @@
#include
// All the outputs are in RADIANS
+
/*
* FLL four quadrant arctan discriminator:
* \f{equation}
@@ -45,7 +46,6 @@
* \f$I_{PS1},Q_{PS1}\f$ are the inphase and quadrature prompt correlator outputs respectively at sample time \f$t_1\f$, and
* \f$I_{PS2},Q_{PS2}\f$ are the inphase and quadrature prompt correlator outputs respectively at sample time \f$t_2\f$. The output is in [radians/second].
*/
-
double fll_four_quadrant_atan(gr_complex prompt_s1, gr_complex prompt_s2, double t1, double t2)
{
double cross, dot;
@@ -105,6 +105,7 @@ double dll_nc_e_minus_l_normalized(gr_complex early_s1, gr_complex late_s1)
return 0.5 * (P_early - P_late) / (P_early + P_late);
}
+
/*
* DLL Noncoherent Very Early Minus Late Power (VEMLP) normalized discriminator, using the outputs
* of four correlators, Very Early (VE), Early (E), Late (L) and Very Late (VL):