mirror of https://github.com/gnss-sdr/gnss-sdr

Commit 10d73da839 by Carles Fernandez, 2019-03-19 20:39:23 +01:00
Parent commit: c8d27eb97c
No known key found for this signature in database. GPG Key ID: 4C583C52B0C3877D
9 changed files with 257 additions and 251 deletions

cpu_multicorrelator.cc

@@ -1,11 +1,11 @@
 /*!
  * \file cpu_multicorrelator.cc
- * \brief High optimized CPU vector multiTAP correlator class
+ * \brief Highly optimized CPU vector multiTAP correlator class
  * \authors <ul>
  *          <li> Javier Arribas, 2015. jarribas(at)cttc.es
  *          </ul>
  *
- * Class that implements a high optimized vector multiTAP correlator class for CPUs
+ * Class that implements a highly optimized vector multiTAP correlator class for CPUs
  *
  * -------------------------------------------------------------------------
  *

cpu_multicorrelator.h

@@ -65,4 +65,4 @@ private:
 };

-#endif /* CPU_MULTICORRELATOR_H_ */
+#endif /* GNSS_SDR_CPU_MULTICORRELATOR_H_ */

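The guard rename in this hunk follows the project's convention of prefixing include guards with GNSS_SDR_. A minimal sketch of the pattern, with a hypothetical header name:

    /* my_component.h (hypothetical file illustrating the guard convention) */
    #ifndef GNSS_SDR_MY_COMPONENT_H_
    #define GNSS_SDR_MY_COMPONENT_H_

    class MyComponent
    {
    };

    #endif /* GNSS_SDR_MY_COMPONENT_H_ */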
cpu_multicorrelator_16sc.cc

@@ -1,11 +1,11 @@
 /*!
  * \file cpu_multicorrelator_16sc.cc
- * \brief High optimized CPU vector multiTAP correlator class
+ * \brief Highly optimized CPU vector multiTAP correlator class
  * \authors <ul>
  *          <li> Javier Arribas, 2015. jarribas(at)cttc.es
  *          </ul>
  *
- * Class that implements a high optimized vector multiTAP correlator class for CPUs
+ * Class that implements a highly optimized vector multiTAP correlator class for CPUs
  *
  * -------------------------------------------------------------------------
  *

cpu_multicorrelator_16sc.h

@@ -1,11 +1,11 @@
 /*!
  * \file cpu_multicorrelator_16sc.h
- * \brief High optimized CPU vector multiTAP correlator class for lv_16sc_t (short int complex)
+ * \brief Highly optimized CPU vector multiTAP correlator class for lv_16sc_t (short int complex)
  * \authors <ul>
  *          <li> Javier Arribas, 2016. jarribas(at)cttc.es
  *          </ul>
  *
- * Class that implements a high optimized vector multiTAP correlator class for CPUs
+ * Class that implements a highly optimized vector multiTAP correlator class for CPUs
  *
  * -------------------------------------------------------------------------
  *

View File

@@ -6,7 +6,7 @@
  *          <li> Cillian O'Driscoll, 2017. cillian.odriscoll(at)gmail.com
  *          </ul>
  *
- * Class that implements a high optimized vector multiTAP correlator class for CPUs
+ * Class that implements a highly optimized vector multiTAP correlator class for CPUs
  *
  * -------------------------------------------------------------------------
  *

View File

@@ -6,7 +6,7 @@
  *          <li> Cillian O'Driscoll, 2017, cillian.odriscoll(at)gmail.com
  *          </ul>
  *
- * Class that implements a high optimized vector multiTAP correlator class for CPUs
+ * Class that implements a highly optimized vector multiTAP correlator class for CPUs
  *
  * -------------------------------------------------------------------------
  *

cuda_multicorrelator.cu

@@ -1,11 +1,11 @@
 /*!
  * \file cuda_multicorrelator.cu
- * \brief High optimized CUDA GPU vector multiTAP correlator class
+ * \brief Highly optimized CUDA GPU vector multiTAP correlator class
  * \authors <ul>
  *          <li> Javier Arribas, 2015. jarribas(at)cttc.es
  *          </ul>
  *
- * Class that implements a high optimized vector multiTAP correlator class for NVIDIA CUDA GPUs
+ * Class that implements a highly optimized vector multiTAP correlator class for NVIDIA CUDA GPUs
  *
  * -------------------------------------------------------------------------
  *
@@ -33,9 +33,8 @@
  */
 #include "cuda_multicorrelator.h"
-#include <stdio.h>
 #include <iostream>
+#include <stdio.h>
 // For the CUDA runtime routines (prefixed with "cuda_")
 #include <cuda_runtime.h>
@@ -53,22 +52,21 @@ __global__ void Doppler_wippe_scalarProdGPUCPXxN_shifts_chips(
     int vectorN,
     int elementN,
     float rem_carrier_phase_in_rad,
-    float phase_step_rad
-    )
+    float phase_step_rad)
 {
     //Accumulators cache
     __shared__ GPU_Complex accumResult[ACCUM_N];
     // CUDA version of floating point NCO and vector dot product integrated
     float sin;
     float cos;
     for (int i = blockIdx.x * blockDim.x + threadIdx.x;
         i < elementN;
         i += blockDim.x * gridDim.x)
         {
-            __sincosf(rem_carrier_phase_in_rad + i*phase_step_rad, &sin, &cos);
-            d_sig_wiped[i] = d_sig_in[i] * GPU_Complex(cos,-sin);
+            __sincosf(rem_carrier_phase_in_rad + i * phase_step_rad, &sin, &cos);
+            d_sig_wiped[i] = d_sig_in[i] * GPU_Complex(cos, -sin);
         }
     __syncthreads();

 ////////////////////////////////////////////////////////////////////////////
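The wipeoff loop above is the canonical CUDA grid-stride loop: each thread starts at its global index and advances by the total thread count in the grid, so any launch geometry covers all elementN samples. A minimal standalone sketch of the same pattern, with a hypothetical scaling kernel:

    #include <cuda_runtime.h>

    // Hypothetical kernel showing the grid-stride loop used by the carrier
    // wipeoff: threads stride by blockDim.x * gridDim.x until n is covered.
    __global__ void scale_kernel(float *data, float factor, int n)
    {
        for (int i = blockIdx.x * blockDim.x + threadIdx.x;
            i < n;
            i += blockDim.x * gridDim.x)
            {
                data[i] *= factor;
            }
    }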
@@ -77,273 +75,279 @@ __global__ void Doppler_wippe_scalarProdGPUCPXxN_shifts_chips(
 // from total number of thread blocks
 ////////////////////////////////////////////////////////////////////////////
 for (int vec = blockIdx.x; vec < vectorN; vec += gridDim.x)
     {
         //int vectorBase = IMUL(elementN, vec);
         //int vectorEnd = elementN;
         ////////////////////////////////////////////////////////////////////////
         // Each accumulator cycles through vectors with
         // stride equal to the total number of accumulators ACCUM_N
         // At this stage ACCUM_N is only preferred to be a multiple of warp size
         // to meet memory coalescing alignment constraints.
         ////////////////////////////////////////////////////////////////////////
         for (int iAccum = threadIdx.x; iAccum < ACCUM_N; iAccum += blockDim.x)
             {
                 GPU_Complex sum = GPU_Complex(0, 0);
                 float local_code_chip_index = 0.0;
                 //float code_phase;
                 for (int pos = iAccum; pos < elementN; pos += ACCUM_N)
                     {
                         //original sample code
                         //sum = sum + d_sig_in[pos-vectorBase] * d_nco_in[pos-vectorBase] * d_local_codes_in[pos];
                         //sum = sum + d_sig_in[pos-vectorBase] * d_local_codes_in[pos];
                         //sum.multiply_acc(d_sig_in[pos],d_local_codes_in[pos+d_shifts_samples[vec]]);
                         //custom code for multitap correlator
                         // 1.resample local code for the current shift
                         local_code_chip_index = fmodf(code_phase_step_chips * __int2float_rd(pos) + d_shifts_chips[vec] - rem_code_phase_chips, code_length_chips);
                         //Take into account that in multitap correlators, the shifts can be negative!
                         if (local_code_chip_index < 0.0) local_code_chip_index += (code_length_chips - 1);
                         //printf("vec= %i, pos %i, chip_idx=%i chip_shift=%f \r\n",vec, pos,__float2int_rd(local_code_chip_index),local_code_chip_index);
                         // 2.correlate
                         sum.multiply_acc(d_sig_wiped[pos], d_local_code_in[__float2int_rd(local_code_chip_index)]);
                     }
                 accumResult[iAccum] = sum;
             }
         ////////////////////////////////////////////////////////////////////////
         // Perform tree-like reduction of accumulators' results.
         // ACCUM_N has to be a power of two at this stage
         ////////////////////////////////////////////////////////////////////////
         for (int stride = ACCUM_N / 2; stride > 0; stride >>= 1)
             {
                 __syncthreads();
                 for (int iAccum = threadIdx.x; iAccum < stride; iAccum += blockDim.x)
                     {
                         accumResult[iAccum] += accumResult[stride + iAccum];
                     }
             }
         if (threadIdx.x == 0)
             {
                 d_corr_out[vec] = accumResult[0];
             }
     }
 }
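The "1.resample" step maps each sample index pos to a fractional chip index of the local code, fmodf(code_phase_step_chips * pos + d_shifts_chips[vec] - rem_code_phase_chips, code_length_chips), wrapping negative results back into range so that correlator taps with negative shifts work. A host-side C++ sketch of the same index arithmetic (function name hypothetical):

    #include <cmath>

    // Hypothetical host-side mirror of the kernel's code resampler: maps a
    // sample index to a local code chip index, allowing fractional code
    // phase steps and negative tap shifts, as in the kernel above.
    int resampled_code_index(int pos, float code_phase_step_chips,
        float shift_chips, float rem_code_phase_chips, int code_length_chips)
    {
        float idx = std::fmod(code_phase_step_chips * static_cast<float>(pos) +
                shift_chips - rem_code_phase_chips,
            static_cast<float>(code_length_chips));
        if (idx < 0.0f) idx += static_cast<float>(code_length_chips - 1);
        return static_cast<int>(std::floor(idx));  // matches __float2int_rd
    }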
 bool cuda_multicorrelator::init_cuda_integrated_resampler(
     int signal_length_samples,
     int code_length_chips,
-    int n_correlators
-    )
+    int n_correlators)
 {
     // use command-line specified CUDA device, otherwise use device with highest Gflops/s
     // findCudaDevice(argc, (const char **)argv);
     cudaDeviceProp prop;
     int num_devices, device;
     cudaGetDeviceCount(&num_devices);
     if (num_devices > 1)
         {
             int max_multiprocessors = 0, max_device = 0;
             for (device = 0; device < num_devices; device++)
                 {
                     cudaDeviceProp properties;
                     cudaGetDeviceProperties(&properties, device);
                     if (max_multiprocessors < properties.multiProcessorCount)
                         {
                             max_multiprocessors = properties.multiProcessorCount;
                             max_device = device;
                         }
                     printf("Found GPU device # %i\n", device);
                 }
             //cudaSetDevice(max_device);
             //set random device!
             selected_gps_device = rand() % num_devices;  //generates a random number between 0 and num_devices-1 to split the threads among GPUs
             cudaSetDevice(selected_gps_device);
             cudaGetDeviceProperties(&prop, max_device);
             //debug code
             if (prop.canMapHostMemory != 1)
                 {
                     printf("Device cannot map memory.\n");
                 }
             printf("L2 Cache size= %u \n", prop.l2CacheSize);
             printf("maxThreadsPerBlock= %u \n", prop.maxThreadsPerBlock);
             printf("maxGridSize= %i \n", prop.maxGridSize[0]);
             printf("sharedMemPerBlock= %lu \n", prop.sharedMemPerBlock);
             printf("deviceOverlap= %i \n", prop.deviceOverlap);
             printf("multiProcessorCount= %i \n", prop.multiProcessorCount);
         }
     else
         {
             cudaGetDevice(&selected_gps_device);
             cudaGetDeviceProperties(&prop, selected_gps_device);
             //debug code
             if (prop.canMapHostMemory != 1)
                 {
                     printf("Device cannot map memory.\n");
                 }
             printf("L2 Cache size= %u \n", prop.l2CacheSize);
             printf("maxThreadsPerBlock= %u \n", prop.maxThreadsPerBlock);
             printf("maxGridSize= %i \n", prop.maxGridSize[0]);
             printf("sharedMemPerBlock= %lu \n", prop.sharedMemPerBlock);
             printf("deviceOverlap= %i \n", prop.deviceOverlap);
             printf("multiProcessorCount= %i \n", prop.multiProcessorCount);
         }

     // (cudaFuncSetCacheConfig(CUDA_32fc_x2_multiply_x2_dot_prod_32fc_, cudaFuncCachePreferShared));

     // ALLOCATE GPU MEMORY FOR INPUT/OUTPUT and INTERNAL vectors
     size_t size = signal_length_samples * sizeof(GPU_Complex);

     //********* ZERO COPY VERSION ************
     // Set flag to enable zero copy access
     // Optimal in shared memory devices (like Jetson K1)
     //cudaSetDeviceFlags(cudaDeviceMapHost);

     //******** CudaMalloc version ***********
     // input signal GPU memory (can be mapped to CPU memory in shared memory devices!)
     // cudaMalloc((void **)&d_sig_in, size);
     // cudaMemset(d_sig_in,0,size);

     // Doppler-free signal (internal GPU memory)
     cudaMalloc((void **)&d_sig_doppler_wiped, size);
     cudaMemset(d_sig_doppler_wiped, 0, size);

     // Local code GPU memory (can be mapped to CPU memory in shared memory devices!)
     cudaMalloc((void **)&d_local_codes_in, sizeof(std::complex<float>) * code_length_chips);
     cudaMemset(d_local_codes_in, 0, sizeof(std::complex<float>) * code_length_chips);
     d_code_length_chips = code_length_chips;

     // Vector with the chip shifts for each correlator tap
     //GPU memory (can be mapped to CPU memory in shared memory devices!)
     cudaMalloc((void **)&d_shifts_chips, sizeof(float) * n_correlators);
     cudaMemset(d_shifts_chips, 0, sizeof(float) * n_correlators);

     //scalars
     //cudaMalloc((void **)&d_corr_out, sizeof(std::complex<float>)*n_correlators);
     //cudaMemset(d_corr_out,0,sizeof(std::complex<float>)*n_correlators);

     // Launch the Vector Add CUDA Kernel
     // TODO: write a smart load balance using device info!
     threadsPerBlock = 64;
     blocksPerGrid = 128;  //(int)(signal_length_samples+threadsPerBlock-1)/threadsPerBlock;
     cudaStreamCreate(&stream1);
     //cudaStreamCreate (&stream2) ;
     return true;
 }
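The TODO above leaves threadsPerBlock and blocksPerGrid hard-coded at 64 and 128; the commented-out expression shows the usual data-driven sizing. A hedged sketch of deriving the launch configuration from device properties (helper name hypothetical):

    #include <cuda_runtime.h>

    // Hypothetical load balancer for the TODO above: size the grid from the
    // signal length and cap it so a few blocks per multiprocessor remain.
    void pick_launch_config(int signal_length_samples, int device,
        int &threadsPerBlock, int &blocksPerGrid)
    {
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, device);
        threadsPerBlock = 64;
        blocksPerGrid = (signal_length_samples + threadsPerBlock - 1) / threadsPerBlock;
        int cap = prop.multiProcessorCount * 4;  // heuristic residency cap
        if (blocksPerGrid > cap) blocksPerGrid = cap;
    }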
 bool cuda_multicorrelator::set_local_code_and_taps(
     int code_length_chips,
     const std::complex<float> *local_codes_in,
     float *shifts_chips,
-    int n_correlators
-    )
+    int n_correlators)
 {
     cudaSetDevice(selected_gps_device);
     //********* ZERO COPY VERSION ************
     //    // Get device pointer from host memory. No allocation or memcpy
     //    cudaError_t code;
     //    // local code CPU -> GPU copy memory
     //    code=cudaHostGetDevicePointer((void **)&d_local_codes_in, (void *) local_codes_in, 0);
     //    if (code!=cudaSuccess)
     //        {
     //            printf("cuda cudaHostGetDevicePointer error in set_local_code_and_taps \r\n");
     //        }
     //    // Correlator shifts vector CPU -> GPU copy memory (fractional chip shifts are allowed!)
     //    code=cudaHostGetDevicePointer((void **)&d_shifts_chips, (void *) shifts_chips, 0);
     //    if (code!=cudaSuccess)
     //        {
     //            printf("cuda cudaHostGetDevicePointer error in set_local_code_and_taps \r\n");
     //        }

     //******** CudaMalloc version ***********
     //local code CPU -> GPU copy memory
     cudaMemcpyAsync(d_local_codes_in, local_codes_in, sizeof(GPU_Complex) * code_length_chips, cudaMemcpyHostToDevice, stream1);
     d_code_length_chips = code_length_chips;
     //Correlator shifts vector CPU -> GPU copy memory (fractional chip shifts are allowed!)
     cudaMemcpyAsync(d_shifts_chips, shifts_chips, sizeof(float) * n_correlators,
         cudaMemcpyHostToDevice, stream1);
     return true;
 }
 bool cuda_multicorrelator::set_input_output_vectors(
     std::complex<float> *corr_out,
-    std::complex<float>* sig_in
-    )
+    std::complex<float> *sig_in)
 {
     cudaSetDevice(selected_gps_device);
     // Save CPU pointers
     d_sig_in_cpu = sig_in;
     d_corr_out_cpu = corr_out;

     // Zero Copy version
     // Get device pointer from host memory. No allocation or memcpy
     cudaError_t code;
     code = cudaHostGetDevicePointer((void **)&d_sig_in, (void *)sig_in, 0);
     code = cudaHostGetDevicePointer((void **)&d_corr_out, (void *)corr_out, 0);
     if (code != cudaSuccess)
         {
             printf("cuda cudaHostGetDevicePointer error \r\n");
         }
     return true;
 }
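cudaHostGetDevicePointer only succeeds when the host buffers were allocated as mapped pinned memory, which is why init_cuda_integrated_resampler carries the commented-out cudaSetDeviceFlags(cudaDeviceMapHost) line. A minimal allocation sketch under that assumption (helper name hypothetical):

    #include <cuda_runtime.h>
    #include <complex>

    // Hypothetical zero-copy allocation: pinned, mapped host memory whose
    // device pointer can later be fetched with cudaHostGetDevicePointer.
    std::complex<float> *alloc_mapped_buffer(size_t n_elements)
    {
        std::complex<float> *host_ptr = NULL;
        cudaSetDeviceFlags(cudaDeviceMapHost);  // must run before the CUDA context is created
        cudaHostAlloc((void **)&host_ptr,
            n_elements * sizeof(std::complex<float>), cudaHostAllocMapped);
        return host_ptr;
    }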
 #define gpuErrchk(ans)                        \
     {                                         \
         gpuAssert((ans), __FILE__, __LINE__); \
     }
 inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
 {
     if (code != cudaSuccess)
         {
             fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
             if (abort) exit(code);
         }
 }
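The reformatted gpuErrchk macro wraps any call returning cudaError_t and reports file and line on failure. For instance, the unchecked allocations in init_cuda_integrated_resampler could be guarded as:

    // Usage sketch: the same cudaMalloc/cudaMemset calls, error-checked.
    gpuErrchk(cudaMalloc((void **)&d_sig_doppler_wiped, size));
    gpuErrchk(cudaMemset(d_sig_doppler_wiped, 0, size));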
 bool cuda_multicorrelator::Carrier_wipeoff_multicorrelator_resampler_cuda(
     float rem_carrier_phase_in_rad,
     float phase_step_rad,
     float code_phase_step_chips,
     float rem_code_phase_chips,
     int signal_length_samples,
     int n_correlators)
 {
     cudaSetDevice(selected_gps_device);
     // cudaMemCpy version
     //size_t memSize = signal_length_samples * sizeof(std::complex<float>);
     // input signal CPU -> GPU copy memory
     //cudaMemcpyAsync(d_sig_in, d_sig_in_cpu, memSize,
     //    cudaMemcpyHostToDevice, stream2);

     //***** NOTICE: NCO is computed on-the-fly, no need to copy the NCO into the GPU! ****
     //launch the multitap correlator with integrated local code resampler!
     Doppler_wippe_scalarProdGPUCPXxN_shifts_chips<<<blocksPerGrid, threadsPerBlock, 0, stream1>>>(
         d_corr_out,
         d_sig_in,
         d_sig_doppler_wiped,
         d_local_codes_in,
         d_shifts_chips,
         d_code_length_chips,
         code_phase_step_chips,
         rem_code_phase_chips,
         n_correlators,
         signal_length_samples,
         rem_carrier_phase_in_rad,
-        phase_step_rad
-        )
+        phase_step_rad);

     gpuErrchk(cudaPeekAtLastError());
     gpuErrchk(cudaStreamSynchronize(stream1));

     // cudaMemCpy version
     // Copy the device result vector in device memory to the host result vector
     // in host memory.
     //scalar products (correlators outputs)
@@ -352,37 +356,38 @@ bool cuda_multicorrelator::Carrier_wipeoff_multicorrelator_resampler_cuda(
     return true;
 }

 cuda_multicorrelator::cuda_multicorrelator()
 {
     d_sig_in = NULL;
     d_nco_in = NULL;
     d_sig_doppler_wiped = NULL;
     d_local_codes_in = NULL;
     d_shifts_samples = NULL;
     d_shifts_chips = NULL;
     d_corr_out = NULL;
     threadsPerBlock = 0;
     blocksPerGrid = 0;
     d_code_length_chips = 0;
 }

 bool cuda_multicorrelator::free_cuda()
 {
     // Free device global memory
     if (d_sig_in != NULL) cudaFree(d_sig_in);
     if (d_nco_in != NULL) cudaFree(d_nco_in);
     if (d_sig_doppler_wiped != NULL) cudaFree(d_sig_doppler_wiped);
     if (d_local_codes_in != NULL) cudaFree(d_local_codes_in);
     if (d_corr_out != NULL) cudaFree(d_corr_out);
     if (d_shifts_samples != NULL) cudaFree(d_shifts_samples);
     if (d_shifts_chips != NULL) cudaFree(d_shifts_chips);

     // Reset the device and exit
     // cudaDeviceReset causes the driver to clean up all state. While
     // not mandatory in normal operation, it is good practice. It is also
     // needed to ensure correct operation when the application is being
     // profiled. Calling cudaDeviceReset causes all profile data to be
     // flushed before the application exits
     cudaDeviceReset();
     return true;
 }
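Taken together, the class is driven in this order: init_cuda_integrated_resampler, set_local_code_and_taps, set_input_output_vectors, then Carrier_wipeoff_multicorrelator_resampler_cuda once per integration period, and free_cuda at teardown. A hedged usage sketch, with buffer setup assumed done elsewhere:

    // Hypothetical driver; sig_in/corr_out are assumed to be mapped pinned
    // buffers and local_code/shifts_chips prepared by the caller.
    cuda_multicorrelator correlator;
    correlator.init_cuda_integrated_resampler(n_samples, code_length_chips, n_correlators);
    correlator.set_local_code_and_taps(code_length_chips, local_code, shifts_chips, n_correlators);
    correlator.set_input_output_vectors(corr_out, sig_in);
    correlator.Carrier_wipeoff_multicorrelator_resampler_cuda(rem_carrier_phase_rad,
        phase_step_rad, code_phase_step_chips, rem_code_phase_chips,
        n_samples, n_correlators);
    correlator.free_cuda();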

cuda_multicorrelator.h

@@ -1,11 +1,11 @@
 /*!
  * \file cuda_multicorrelator.h
- * \brief High optimized CUDA GPU vector multiTAP correlator class
+ * \brief Highly optimized CUDA GPU vector multiTAP correlator class
  * \authors <ul>
  *          <li> Javier Arribas, 2015. jarribas(at)cttc.es
  *          </ul>
  *
- * Class that implements a high optimized vector multiTAP correlator class for NVIDIA CUDA GPUs
+ * Class that implements a highly optimized vector multiTAP correlator class for NVIDIA CUDA GPUs
  *
  * -------------------------------------------------------------------------
  *
@@ -92,6 +92,7 @@ struct GPU_Complex
         }
 };

+
 struct GPU_Complex_Short
 {
     float r;
@@ -149,7 +150,6 @@ private:
     GPU_Complex* d_local_codes_in;
     GPU_Complex* d_corr_out;
-    //
     std::complex<float>* d_sig_in_cpu;
     std::complex<float>* d_corr_out_cpu;

View File

@@ -36,6 +36,7 @@
 #include <cmath>

 // All the outputs are in RADIANS
+
 /*
  * FLL four quadrant arctan discriminator:
  * \f{equation}
@@ -45,7 +46,6 @@
  * \f$I_{PS1},Q_{PS1}\f$ are the inphase and quadrature prompt correlator outputs respectively at sample time \f$t_1\f$, and
  * \f$I_{PS2},Q_{PS2}\f$ are the inphase and quadrature prompt correlator outputs respectively at sample time \f$t_2\f$. The output is in [radians/second].
  */
-
 double fll_four_quadrant_atan(gr_complex prompt_s1, gr_complex prompt_s2, double t1, double t2)
 {
     double cross, dot;
@@ -105,6 +105,7 @@ double dll_nc_e_minus_l_normalized(gr_complex early_s1, gr_complex late_s1)
     return 0.5 * (P_early - P_late) / (P_early + P_late);
 }

+
 /*
  * DLL Noncoherent Very Early Minus Late Power (VEMLP) normalized discriminator, using the outputs
  * of four correlators, Very Early (VE), Early (E), Late (L) and Very Late (VL):
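The four-quadrant FLL discriminator documented above combines the cross and dot products of two consecutive prompt outputs into a frequency error. A sketch of such a body, assuming the standard form atan2(cross, dot) / (t2 - t1):

    #include <cmath>
    #include <gnuradio/gr_complex.h>

    // Sketch of a four-quadrant arctan FLL discriminator, assuming the
    // standard form: the rotation between consecutive prompt outputs,
    // divided by the time between them, yields radians/second.
    double fll_four_quadrant_atan_sketch(gr_complex prompt_s1, gr_complex prompt_s2,
        double t1, double t2)
    {
        double dot = prompt_s1.real() * prompt_s2.real() + prompt_s1.imag() * prompt_s2.imag();
        double cross = prompt_s1.real() * prompt_s2.imag() - prompt_s2.real() * prompt_s1.imag();
        return std::atan2(cross, dot) / (t2 - t1);
    }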