Mirror of https://github.com/gnss-sdr/gnss-sdr (synced 2024-12-12 19:20:32 +00:00)

Commit 2039e998ff (parent a4655e2b03)

    Pinned memory (zero copy): a large performance improvement for GPU tracking.
@@ -17,7 +17,7 @@ ControlThread.wait_for_flowgraph=false
 SignalSource.implementation=File_Signal_Source
 
 ;#filename: path to file with the captured GNSS signal samples to be processed
-SignalSource.filename=/home/javier/signals/4msps.dat
+SignalSource.filename=/media/javier/SISTEMA/signals/New York/4msps.dat
 
 ;#item_type: Type and resolution for each of the signal samples. Use only gr_complex in this version.
 SignalSource.item_type=gr_complex
@@ -165,7 +165,7 @@ Resampler.sample_freq_out=4000000
 
 ;######### CHANNELS GLOBAL CONFIG ############
 ;#count: Number of available GPS satellite channels.
-Channels_GPS.count=1
+Channels_GPS.count=8
 ;#count: Number of available Galileo satellite channels.
 Channels_Galileo.count=0
 ;#in_acquisition: Number of channels simultaneously acquiring for the whole receiver
@@ -128,13 +128,9 @@ private:
 
     //GPU HOST PINNED MEMORY IN/OUT VECTORS
     gr_complex* in_gpu;
-    gr_complex* d_carr_sign_gpu;
-    gr_complex* d_local_codes_gpu;
     float* d_local_code_shift_chips;
     gr_complex* d_corr_outs_gpu;
    cuda_multicorrelator *multicorrelator_gpu;
-
-
     gr_complex* d_ca_code;
 
     gr_complex *d_Early;
@@ -32,26 +32,14 @@
  * -------------------------------------------------------------------------
  */
 
-///////////////////////////////////////////////////////////////////////////////
-// On G80-class hardware 24-bit multiplication takes 4 clocks per warp
-// (the same as for floating point multiplication and addition),
-// whereas full 32-bit multiplication takes 16 clocks per warp.
-// So if integer multiplication operands are guaranteed to fit into 24 bits
-// (always lie withtin [-8M, 8M - 1] range in signed case),
-// explicit 24-bit multiplication is preferred for performance.
-///////////////////////////////////////////////////////////////////////////////
-#define IMUL(a, b) __mul24(a, b)
-
 #include "cuda_multicorrelator.h"
 
 #include <stdio.h>
+#include <iostream>
 // For the CUDA runtime routines (prefixed with "cuda_")
 #include <cuda_runtime.h>
 
-#define ACCUM_N 256
+#define ACCUM_N 128
 
 
 __global__ void scalarProdGPUCPXxN_shifts_chips(
     GPU_Complex *d_corr_out,
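Note on the defines above: the G80-era IMUL/__mul24 macro is dropped together with the kernels that used it (on Fermi-class and later GPUs a full 32-bit integer multiply no longer pays the old penalty), and ACCUM_N, the number of per-block partial accumulators used by the kernels below, is halved from 256 to 128. A minimal sketch of the constraint the tree reduction places on this constant (the static_assert is illustrative, not part of the commit):

    #define ACCUM_N 128
    // The reduction halves the active range each step (ACCUM_N/2, /4, ...),
    // so it visits every accumulator only if ACCUM_N is a power of two.
    static_assert((ACCUM_N & (ACCUM_N - 1)) == 0, "ACCUM_N must be a power of two");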
@@ -90,15 +78,17 @@ __global__ void scalarProdGPUCPXxN_shifts_chips(
 
     for (int pos = iAccum; pos < elementN; pos += ACCUM_N)
     {
+        //original sample code
         //sum = sum + d_sig_in[pos-vectorBase] * d_nco_in[pos-vectorBase] * d_local_codes_in[pos];
         //sum = sum + d_sig_in[pos-vectorBase] * d_local_codes_in[pos];
         //sum.multiply_acc(d_sig_in[pos],d_local_codes_in[pos+d_shifts_samples[vec]]);
 
+        //custom code for multitap correlator
         // 1.resample local code for the current shift
         float local_code_chip_index= fmod(code_phase_step_chips*(float)pos + d_shifts_chips[vec] - rem_code_phase_chips, code_length_chips);
-        //TODO: Take into account that in multitap correlators, the shifts can be negative!
+        //Take into account that in multitap correlators, the shifts can be negative!
         if (local_code_chip_index<0.0) local_code_chip_index+=code_length_chips;
+        //printf("vec= %i, pos %i, chip_idx=%i chip_shift=%f \r\n",vec, pos,__float2int_rd(local_code_chip_index),local_code_chip_index);
         // 2.correlate
         sum.multiply_acc(d_sig_in[pos],d_local_code_in[__float2int_rd(local_code_chip_index)]);
 
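Note: the kernel resamples the local code on the fly. For sample pos and tap vec the code-phase index is code_phase_step_chips*pos + d_shifts_chips[vec] - rem_code_phase_chips, wrapped into [0, code_length_chips). Because fmod keeps the sign of its first argument, a negative tap shift (e.g. an Early tap) can produce a negative index, hence the explicit wrap that follows. A host-side sketch of the same index arithmetic (function name and layout are illustrative, not commit code):

    // Reference implementation of the per-sample code index used in the kernel.
    static inline int code_index(float code_phase_step_chips, int pos,
                                 float shift_chips, float rem_code_phase_chips,
                                 float code_length_chips)
    {
        float idx = fmodf(code_phase_step_chips * (float)pos + shift_chips
                          - rem_code_phase_chips, code_length_chips);
        if (idx < 0.0f) idx += code_length_chips;  // fmodf keeps the dividend's sign
        return (int)floorf(idx);                   // the kernel uses __float2int_rd
    }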
@@ -127,163 +117,6 @@ __global__ void scalarProdGPUCPXxN_shifts_chips(
     }
 }
 
 
-///////////////////////////////////////////////////////////////////////////////
-// Calculate scalar products of VectorN vectors of ElementN elements on GPU
-// Parameters restrictions:
-// 1) ElementN is strongly preferred to be a multiple of warp size to
-//    meet alignment constraints of memory coalescing.
-// 2) ACCUM_N must be a power of two.
-///////////////////////////////////////////////////////////////////////////////
-
-
-__global__ void scalarProdGPUCPXxN_shifts(
-    GPU_Complex *d_corr_out,
-    GPU_Complex *d_sig_in,
-    GPU_Complex *d_local_codes_in,
-    int *d_shifts_samples,
-    int vectorN,
-    int elementN
-)
-{
-    //Accumulators cache
-    __shared__ GPU_Complex accumResult[ACCUM_N];
-
-    ////////////////////////////////////////////////////////////////////////////
-    // Cycle through every pair of vectors,
-    // taking into account that vector counts can be different
-    // from total number of thread blocks
-    ////////////////////////////////////////////////////////////////////////////
-    for (int vec = blockIdx.x; vec < vectorN; vec += gridDim.x)
-    {
-        int vectorBase = IMUL(elementN, vec);
-        int vectorEnd = vectorBase + elementN;
-
-        ////////////////////////////////////////////////////////////////////////
-        // Each accumulator cycles through vectors with
-        // stride equal to number of total number of accumulators ACCUM_N
-        // At this stage ACCUM_N is only preferred be a multiple of warp size
-        // to meet memory coalescing alignment constraints.
-        ////////////////////////////////////////////////////////////////////////
-        for (int iAccum = threadIdx.x; iAccum < ACCUM_N; iAccum += blockDim.x)
-        {
-            GPU_Complex sum = GPU_Complex(0,0);
-
-            for (int pos = vectorBase + iAccum; pos < vectorEnd; pos += ACCUM_N)
-            {
-                //sum = sum + d_sig_in[pos-vectorBase] * d_nco_in[pos-vectorBase] * d_local_codes_in[pos];
-                //sum = sum + d_sig_in[pos-vectorBase] * d_local_codes_in[pos];
-                sum.multiply_acc(d_sig_in[pos-vectorBase],d_local_codes_in[pos-vectorBase+d_shifts_samples[vec]]);
-            }
-            accumResult[iAccum] = sum;
-        }
-
-        ////////////////////////////////////////////////////////////////////////
-        // Perform tree-like reduction of accumulators' results.
-        // ACCUM_N has to be power of two at this stage
-        ////////////////////////////////////////////////////////////////////////
-        for (int stride = ACCUM_N / 2; stride > 0; stride >>= 1)
-        {
-            __syncthreads();
-
-            for (int iAccum = threadIdx.x; iAccum < stride; iAccum += blockDim.x)
-            {
-                accumResult[iAccum] += accumResult[stride + iAccum];
-            }
-        }
-
-        if (threadIdx.x == 0)
-        {
-            d_corr_out[vec] = accumResult[0];
-        }
-    }
-}
-
-
-__global__ void scalarProdGPUCPXxN(
-    GPU_Complex *d_corr_out,
-    GPU_Complex *d_sig_in,
-    GPU_Complex *d_local_codes_in,
-    int vectorN,
-    int elementN
-)
-{
-    //Accumulators cache
-    __shared__ GPU_Complex accumResult[ACCUM_N];
-
-    ////////////////////////////////////////////////////////////////////////////
-    // Cycle through every pair of vectors,
-    // taking into account that vector counts can be different
-    // from total number of thread blocks
-    ////////////////////////////////////////////////////////////////////////////
-    for (int vec = blockIdx.x; vec < vectorN; vec += gridDim.x)
-    {
-        //int vectorBase = IMUL(elementN, vec);
-        //int vectorEnd = vectorBase + elementN;
-
-        ////////////////////////////////////////////////////////////////////////
-        // Each accumulator cycles through vectors with
-        // stride equal to number of total number of accumulators ACCUM_N
-        // At this stage ACCUM_N is only preferred be a multiple of warp size
-        // to meet memory coalescing alignment constraints.
-        ////////////////////////////////////////////////////////////////////////
-        for (int iAccum = threadIdx.x; iAccum < ACCUM_N; iAccum += blockDim.x)
-        {
-            GPU_Complex sum = GPU_Complex(0,0);
-
-            //for (int pos = vectorBase + iAccum; pos < vectorEnd; pos += ACCUM_N)
-            for (int pos = iAccum; pos < elementN; pos += ACCUM_N)
-            {
-                //sum = sum + d_sig_in[pos-vectorBase] * d_nco_in[pos-vectorBase] * d_local_codes_in[pos];
-                //sum = sum + d_sig_in[pos-vectorBase] * d_local_codes_in[pos];
-                //sum.multiply_acc(d_sig_in[pos-vectorBase],d_local_codes_in[pos]);
-                sum.multiply_acc(d_sig_in[pos],d_local_codes_in[pos]);
-            }
-            accumResult[iAccum] = sum;
-        }
-
-        ////////////////////////////////////////////////////////////////////////
-        // Perform tree-like reduction of accumulators' results.
-        // ACCUM_N has to be power of two at this stage
-        ////////////////////////////////////////////////////////////////////////
-        for (int stride = ACCUM_N / 2; stride > 0; stride >>= 1)
-        {
-            __syncthreads();
-
-            for (int iAccum = threadIdx.x; iAccum < stride; iAccum += blockDim.x)
-            {
-                accumResult[iAccum] += accumResult[stride + iAccum];
-            }
-        }
-
-        if (threadIdx.x == 0)
-        {
-            d_corr_out[vec] = accumResult[0];
-        }
-    }
-}
-
-
-//*********** CUDA processing **************
-// Treads: a minimal parallel execution code on GPU
-// Blocks: a set of N threads
-/**
- * CUDA Kernel Device code
- *
- * Computes the vectorial product of A and B into C. The 3 vectors have the same
- * number of elements numElements.
- */
-__global__ void CUDA_32fc_x2_multiply_32fc( GPU_Complex *A, GPU_Complex *B, GPU_Complex *C, int numElements)
-{
-    for (int i = blockIdx.x * blockDim.x + threadIdx.x;
-         i < numElements;
-         i += blockDim.x * gridDim.x)
-    {
-        C[i] = A[i] * B[i];
-    }
-}
-
-
 /**
  * CUDA Kernel Device code
  *
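Note: the standalone scalarProdGPUCPXxN_shifts, scalarProdGPUCPXxN and CUDA_32fc_x2_multiply_32fc kernels are deleted here; their accumulate-then-reduce pattern survives inside the fused kernel added further down. For reference, the shared pattern, sketched with plain floats and a single-block launch (illustrative, not commit code):

    __global__ void dot_reduce_sketch(float *d_out, const float *d_a, const float *d_b, int n)
    {
        __shared__ float acc[ACCUM_N];
        // Phase 1: each of the ACCUM_N accumulators strides over the input.
        for (int i = threadIdx.x; i < ACCUM_N; i += blockDim.x)
        {
            float sum = 0.0f;
            for (int pos = i; pos < n; pos += ACCUM_N)
                sum += d_a[pos] * d_b[pos];
            acc[i] = sum;
        }
        // Phase 2: tree-like reduction; ACCUM_N must be a power of two.
        for (int stride = ACCUM_N / 2; stride > 0; stride >>= 1)
        {
            __syncthreads();
            for (int i = threadIdx.x; i < stride; i += blockDim.x)
                acc[i] += acc[stride + i];
        }
        if (threadIdx.x == 0)
            *d_out = acc[0];
    }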
@@ -292,21 +125,7 @@ __global__ void CUDA_32fc_x2_multiply_32fc( GPU_Complex *A, GPU_Complex *B,
 __global__ void
 CUDA_32fc_Doppler_wipeoff( GPU_Complex *sig_out, GPU_Complex *sig_in, float rem_carrier_phase_in_rad, float phase_step_rad, int numElements)
 {
-    //*** NCO CPU code (GNURadio FXP NCO)
-    //float sin_f, cos_f;
-    //float phase_step_rad = static_cast<float>(2 * GALILEO_PI) * d_carrier_doppler_hz / static_cast<float>(d_fs_in);
-    //int phase_step_rad_i = gr::fxpt::float_to_fixed(phase_step_rad);
-    //int phase_rad_i = gr::fxpt::float_to_fixed(d_rem_carr_phase_rad);
-    //
-    //for(int i = 0; i < d_current_prn_length_samples; i++)
-    //    {
-    //        gr::fxpt::sincos(phase_rad_i, &sin_f, &cos_f);
-    //        d_carr_sign[i] = std::complex<float>(cos_f, -sin_f);
-    //        phase_rad_i += phase_step_rad_i;
-    //    }
-
     // CUDA version of floating point NCO and vector dot product integrated
 
     float sin;
     float cos;
     for (int i = blockIdx.x * blockDim.x + threadIdx.x;
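Note: the deleted block above was the old fixed-point CPU NCO (gr::fxpt); the carrier replica is now generated on the fly on the GPU. Each thread computes sig_out[i] = sig_in[i] * (cos(theta_i) - j*sin(theta_i)) with theta_i = rem_carrier_phase_in_rad + i*phase_step_rad. The loop body, mirroring the kernel around this hunk (__sincosf is CUDA's fast single-precision sine/cosine intrinsic):

    // Grid-stride carrier wipe-off: one __sincosf and one complex multiply per sample.
    float sin;
    float cos;
    for (int i = blockIdx.x * blockDim.x + threadIdx.x;
         i < numElements;
         i += blockDim.x * gridDim.x)
    {
        __sincosf(rem_carrier_phase_in_rad + i * phase_step_rad, &sin, &cos);
        sig_out[i] = sig_in[i] * GPU_Complex(cos, -sin);   // multiply by e^{-j*theta_i}
    }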
@@ -319,110 +138,101 @@ CUDA_32fc_Doppler_wipeoff( GPU_Complex *sig_out, GPU_Complex *sig_in, float rem
 }
 
 
-/**
- * CUDA Kernel Device code
- *
- * Computes the vectorial product of A and B into C. The 3 vectors have the same
- * number of elements numElements.
- */
-__global__ void
-CUDA_32fc_x2_add_32fc( GPU_Complex *A, GPU_Complex *B, GPU_Complex *C, int numElements)
+__global__ void Doppler_wippe_scalarProdGPUCPXxN_shifts_chips(
+    GPU_Complex *d_corr_out,
+    GPU_Complex *d_sig_in,
+    GPU_Complex *d_sig_wiped,
+    GPU_Complex *d_local_code_in,
+    float *d_shifts_chips,
+    float code_length_chips,
+    float code_phase_step_chips,
+    float rem_code_phase_chips,
+    int vectorN,
+    int elementN,
+    float rem_carrier_phase_in_rad,
+    float phase_step_rad
+)
 {
+    //Accumulators cache
+    __shared__ GPU_Complex accumResult[ACCUM_N];
+
+    // CUDA version of floating point NCO and vector dot product integrated
+    float sin;
+    float cos;
     for (int i = blockIdx.x * blockDim.x + threadIdx.x;
-         i < numElements;
+         i < elementN;
          i += blockDim.x * gridDim.x)
     {
-        C[i] = A[i] + B[i];
+        __sincosf(rem_carrier_phase_in_rad + i*phase_step_rad, &sin, &cos);
+        d_sig_wiped[i] = d_sig_in[i] * GPU_Complex(cos,-sin);
+    }
+
+    __syncthreads();
+
+    ////////////////////////////////////////////////////////////////////////////
+    // Cycle through every pair of vectors,
+    // taking into account that vector counts can be different
+    // from total number of thread blocks
+    ////////////////////////////////////////////////////////////////////////////
+    for (int vec = blockIdx.x; vec < vectorN; vec += gridDim.x)
+    {
+        //int vectorBase = IMUL(elementN, vec);
+        //int vectorEnd = elementN;
+
+        ////////////////////////////////////////////////////////////////////////
+        // Each accumulator cycles through vectors with
+        // stride equal to number of total number of accumulators ACCUM_N
+        // At this stage ACCUM_N is only preferred be a multiple of warp size
+        // to meet memory coalescing alignment constraints.
+        ////////////////////////////////////////////////////////////////////////
+        for (int iAccum = threadIdx.x; iAccum < ACCUM_N; iAccum += blockDim.x)
+        {
+            GPU_Complex sum = GPU_Complex(0,0);
+            float local_code_chip_index;
+            //float code_phase;
+            for (int pos = iAccum; pos < elementN; pos += ACCUM_N)
+            {
+                //original sample code
+                //sum = sum + d_sig_in[pos-vectorBase] * d_nco_in[pos-vectorBase] * d_local_codes_in[pos];
+                //sum = sum + d_sig_in[pos-vectorBase] * d_local_codes_in[pos];
+                //sum.multiply_acc(d_sig_in[pos],d_local_codes_in[pos+d_shifts_samples[vec]]);
+
+                //custom code for multitap correlator
+                // 1.resample local code for the current shift
+                local_code_chip_index= fmodf(code_phase_step_chips*__int2float_rd(pos)+ d_shifts_chips[vec] - rem_code_phase_chips, code_length_chips);
+
+                //Take into account that in multitap correlators, the shifts can be negative!
+                if (local_code_chip_index<0.0) local_code_chip_index+=code_length_chips;
+                //printf("vec= %i, pos %i, chip_idx=%i chip_shift=%f \r\n",vec, pos,__float2int_rd(local_code_chip_index),local_code_chip_index);
+                // 2.correlate
+                sum.multiply_acc(d_sig_wiped[pos],d_local_code_in[__float2int_rd(local_code_chip_index)]);
+            }
+            accumResult[iAccum] = sum;
+        }
+
+        ////////////////////////////////////////////////////////////////////////
+        // Perform tree-like reduction of accumulators' results.
+        // ACCUM_N has to be power of two at this stage
+        ////////////////////////////////////////////////////////////////////////
+        for (int stride = ACCUM_N / 2; stride > 0; stride >>= 1)
+        {
+            __syncthreads();
+
+            for (int iAccum = threadIdx.x; iAccum < stride; iAccum += blockDim.x)
+            {
+                accumResult[iAccum] += accumResult[stride + iAccum];
+            }
+        }
+
+        if (threadIdx.x == 0)
+        {
+            d_corr_out[vec] = accumResult[0];
+        }
     }
 }
 
-
-bool cuda_multicorrelator::init_cuda(const int argc, const char **argv, int signal_length_samples, int local_codes_length_samples, int n_correlators)
-{
-    // use command-line specified CUDA device, otherwise use device with highest Gflops/s
-    // findCudaDevice(argc, (const char **)argv);
-    // cudaDeviceProp prop;
-    // int num_devices, device;
-    // cudaGetDeviceCount(&num_devices);
-    // if (num_devices > 1) {
-    //     int max_multiprocessors = 0, max_device = 0;
-    //     for (device = 0; device < num_devices; device++) {
-    //         cudaDeviceProp properties;
-    //         cudaGetDeviceProperties(&properties, device);
-    //         if (max_multiprocessors < properties.multiProcessorCount) {
-    //             max_multiprocessors = properties.multiProcessorCount;
-    //             max_device = device;
-    //         }
-    //         printf("Found GPU device # %i\n",device);
-    //     }
-    //     //cudaSetDevice(max_device);
-    //
-    //     //set random device!
-    //     cudaSetDevice(rand() % num_devices); //generates a random number between 0 and num_devices to split the threads between GPUs
-    //
-    //     cudaGetDeviceProperties( &prop, max_device );
-    //     //debug code
-    //     if (prop.canMapHostMemory != 1) {
-    //         printf( "Device can not map memory.\n" );
-    //     }
-    //     printf("L2 Cache size= %u \n",prop.l2CacheSize);
-    //     printf("maxThreadsPerBlock= %u \n",prop.maxThreadsPerBlock);
-    //     printf("maxGridSize= %i \n",prop.maxGridSize[0]);
-    //     printf("sharedMemPerBlock= %lu \n",prop.sharedMemPerBlock);
-    //     printf("deviceOverlap= %i \n",prop.deviceOverlap);
-    //     printf("multiProcessorCount= %i \n",prop.multiProcessorCount);
-    // }else{
-    //     int whichDevice;
-    //     cudaGetDevice( &whichDevice );
-    //     cudaGetDeviceProperties( &prop, whichDevice );
-    //     //debug code
-    //     if (prop.canMapHostMemory != 1) {
-    //         printf( "Device can not map memory.\n" );
-    //     }
-    //
-    //     printf("L2 Cache size= %u \n",prop.l2CacheSize);
-    //     printf("maxThreadsPerBlock= %u \n",prop.maxThreadsPerBlock);
-    //     printf("maxGridSize= %i \n",prop.maxGridSize[0]);
-    //     printf("sharedMemPerBlock= %lu \n",prop.sharedMemPerBlock);
-    //     printf("deviceOverlap= %i \n",prop.deviceOverlap);
-    //     printf("multiProcessorCount= %i \n",prop.multiProcessorCount);
-    // }
-
-    // (cudaFuncSetCacheConfig(CUDA_32fc_x2_multiply_x2_dot_prod_32fc_, cudaFuncCachePreferShared));
-
-    // ALLOCATE GPU MEMORY FOR INPUT/OUTPUT and INTERNAL vectors
-    size_t size = signal_length_samples * sizeof(GPU_Complex);
-
-    cudaMalloc((void **)&d_sig_in, size);
-    // (cudaMalloc((void **)&d_nco_in, size));
-    cudaMalloc((void **)&d_sig_doppler_wiped, size);
-
-    // old version: all local codes are independent vectors
-    // (cudaMalloc((void **)&d_local_codes_in, size*n_correlators));
-
-    // new version: only one vector with extra samples to shift the local code for the correlator set
-    // Required: The last correlator tap in d_shifts_samples has the largest sample shift
-    size_t size_local_code_bytes = local_codes_length_samples * sizeof(GPU_Complex);
-    cudaMalloc((void **)&d_local_codes_in, size_local_code_bytes);
-    cudaMalloc((void **)&d_shifts_samples, sizeof(int)*n_correlators);
-
-    //scalars
-    cudaMalloc((void **)&d_corr_out, sizeof(std::complex<float>)*n_correlators);
-
-    // Launch the Vector Add CUDA Kernel
-    threadsPerBlock = 256;
-    blocksPerGrid =(int)(signal_length_samples+threadsPerBlock-1)/threadsPerBlock;
-
-    cudaStreamCreate (&stream1) ;
-    cudaStreamCreate (&stream2) ;
-    return true;
-}
-
 
 bool cuda_multicorrelator::init_cuda_integrated_resampler(
-    const int argc, const char **argv,
     int signal_length_samples,
     int code_length_chips,
     int n_correlators
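Note: the new Doppler_wippe_scalarProdGPUCPXxN_shifts_chips kernel fuses the carrier wipe-off and the multi-tap correlation into a single launch, and the old argc/argv-based init_cuda path is removed. One detail worth flagging: the __syncthreads() between the two phases synchronizes only within a block, so correctness across blocks depends on the launch geometry. A sketch of the fused launch as it appears later in this diff (setup and stream creation elided):

    Doppler_wippe_scalarProdGPUCPXxN_shifts_chips<<<blocksPerGrid, threadsPerBlock, 0, stream1>>>(
        d_corr_out,              // one complex accumulator per correlator tap (zero-copy mapped)
        d_sig_in,                // raw input samples (zero-copy mapped host memory)
        d_sig_doppler_wiped,     // internal scratch buffer for the wiped signal
        d_local_codes_in,        // one period of the local code
        d_shifts_chips,          // per-tap chip shifts (fractional shifts allowed)
        d_code_length_chips,
        code_phase_step_chips,
        rem_code_phase_chips,
        n_correlators,
        signal_length_samples,
        rem_carrier_phase_in_rad,
        phase_step_rad);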
@@ -480,34 +290,45 @@ bool cuda_multicorrelator::init_cuda_integrated_resampler(
     // (cudaFuncSetCacheConfig(CUDA_32fc_x2_multiply_x2_dot_prod_32fc_, cudaFuncCachePreferShared));
 
     // ALLOCATE GPU MEMORY FOR INPUT/OUTPUT and INTERNAL vectors
 
     size_t size = signal_length_samples * sizeof(GPU_Complex);
 
-    cudaMalloc((void **)&d_sig_in, size);
-    cudaMemset(d_sig_in,0,size);
+    //********* ZERO COPY VERSION ************
+    // Set flag to enable zero copy access
+    // Optimal in shared memory devices (like Jetson K1)
+    cudaSetDeviceFlags(cudaDeviceMapHost);
 
-    // (cudaMalloc((void **)&d_nco_in, size));
+    //******** CudaMalloc version ***********
+
+    // input signal GPU memory (can be mapped to CPU memory in shared memory devices!)
+    // cudaMalloc((void **)&d_sig_in, size);
+    // cudaMemset(d_sig_in,0,size);
+
+    // Doppler-free signal (internal GPU memory)
     cudaMalloc((void **)&d_sig_doppler_wiped, size);
     cudaMemset(d_sig_doppler_wiped,0,size);
 
+    // Local code GPU memory (can be mapped to CPU memory in shared memory devices!)
     cudaMalloc((void **)&d_local_codes_in, sizeof(std::complex<float>)*code_length_chips);
     cudaMemset(d_local_codes_in,0,sizeof(std::complex<float>)*code_length_chips);
 
     d_code_length_chips=code_length_chips;
 
+    // Vector with the chip shifts for each correlator tap
+    //GPU memory (can be mapped to CPU memory in shared memory devices!)
     cudaMalloc((void **)&d_shifts_chips, sizeof(float)*n_correlators);
     cudaMemset(d_shifts_chips,0,sizeof(float)*n_correlators);
 
     //scalars
-    cudaMalloc((void **)&d_corr_out, sizeof(std::complex<float>)*n_correlators);
-    cudaMemset(d_corr_out,0,sizeof(std::complex<float>)*n_correlators);
+    //cudaMalloc((void **)&d_corr_out, sizeof(std::complex<float>)*n_correlators);
+    //cudaMemset(d_corr_out,0,sizeof(std::complex<float>)*n_correlators);
 
     // Launch the Vector Add CUDA Kernel
-    threadsPerBlock = 256;
+    // TODO: write a smart load balance using device info!
+    threadsPerBlock = 64;
     blocksPerGrid =(int)(signal_length_samples+threadsPerBlock-1)/threadsPerBlock;
 
     cudaStreamCreate (&stream1) ;
-    cudaStreamCreate (&stream2) ;
+    //cudaStreamCreate (&stream2) ;
     return true;
 }
 
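Note: cudaSetDeviceFlags(cudaDeviceMapHost) must run before the CUDA context is created (i.e. before the first allocation), otherwise mapped host memory cannot be used. On devices where CPU and GPU share physical memory, such as the Jetson K1 named in the comment, zero copy removes the host-to-device copies entirely. The host-side counterpart the caller is expected to provide (a sketch under that assumption; buffer names are illustrative):

    // Allocate pinned, mapped host buffers so the GPU can read/write them in place.
    cudaSetDeviceFlags(cudaDeviceMapHost);            // before any CUDA allocation
    std::complex<float> *in_cpu = NULL;
    cudaHostAlloc((void **)&in_cpu,
                  signal_length_samples * sizeof(std::complex<float>),
                  cudaHostAllocMapped);               // pinned + GPU-visible
    std::complex<float> *in_dev = NULL;
    cudaHostGetDevicePointer((void **)&in_dev, (void *)in_cpu, 0);  // alias, no copy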
@@ -518,103 +339,57 @@ bool cuda_multicorrelator::set_local_code_and_taps(
     int n_correlators
 )
 {
-    // local code CPU -> GPU copy memory
+    //********* ZERO COPY VERSION ************
+    // // Get device pointer from host memory. No allocation or memcpy
+    // cudaError_t code;
+    // // local code CPU -> GPU copy memory
+    // code=cudaHostGetDevicePointer((void **)&d_local_codes_in, (void *) local_codes_in, 0);
+    // if (code!=cudaSuccess)
+    // {
+    //     printf("cuda cudaHostGetDevicePointer error in set_local_code_and_taps \r\n");
+    // }
+    // // Correlator shifts vector CPU -> GPU copy memory (fractional chip shifts are allowed!)
+    // code=cudaHostGetDevicePointer((void **)&d_shifts_chips, (void *) shifts_chips, 0);
+    // if (code!=cudaSuccess)
+    // {
+    //     printf("cuda cudaHostGetDevicePointer error in set_local_code_and_taps \r\n");
+    // }
+
+    //******** CudaMalloc version ***********
+    //local code CPU -> GPU copy memory
     cudaMemcpyAsync(d_local_codes_in, local_codes_in, sizeof(GPU_Complex)*code_length_chips, cudaMemcpyHostToDevice,stream1);
     d_code_length_chips=(float)code_length_chips;
 
-    // Correlator shifts vector CPU -> GPU copy memory (fractional chip shifts are allowed!)
+    //Correlator shifts vector CPU -> GPU copy memory (fractional chip shifts are allowed!)
     cudaMemcpyAsync(d_shifts_chips, shifts_chips, sizeof(float)*n_correlators,
                     cudaMemcpyHostToDevice,stream1);
 
     return true;
 }
 
-bool cuda_multicorrelator::Carrier_wipeoff_multicorrelator_cuda(
-    std::complex<float>* corr_out,
-    const std::complex<float>* sig_in,
-    const std::complex<float>* local_codes_in,
-    float rem_carrier_phase_in_rad,
-    float phase_step_rad,
-    const int *shifts_samples,
-    int signal_length_samples,
-    int n_correlators)
-{
-    size_t memSize = signal_length_samples * sizeof(std::complex<float>);
-
-    // input signal CPU -> GPU copy memory
-    cudaMemcpyAsync(d_sig_in, sig_in, memSize,
-                    cudaMemcpyHostToDevice, stream1);
-
-    //***** NOTICE: NCO is computed on-the-fly, not need to copy NCO into GPU! ****
-    // (cudaMemcpyAsync(d_nco_in, nco_in, memSize,
-    //                  cudaMemcpyHostToDevice, stream1));
-
-    // old version: all local codes are independent vectors
-    // (cudaMemcpyAsync(d_local_codes_in, local_codes_in, memSize*n_correlators,
-    //                  cudaMemcpyHostToDevice, stream2));
-
-    // new version: only one vector with extra samples to shift the local code for the correlator set
-    // Required: The last correlator tap in d_shifts_samples has the largest sample shift
-
-    // local code CPU -> GPU copy memory
-    cudaMemcpyAsync(d_local_codes_in, local_codes_in, memSize+sizeof(std::complex<float>)*shifts_samples[n_correlators-1],
-                    cudaMemcpyHostToDevice, stream2);
-    // Correlator shifts vector CPU -> GPU copy memory
-    cudaMemcpyAsync(d_shifts_samples, shifts_samples, sizeof(int)*n_correlators,
-                    cudaMemcpyHostToDevice, stream2);
-
-    //Launch carrier wipe-off kernel here, while local codes are being copied to GPU!
-    cudaStreamSynchronize(stream1);
-    CUDA_32fc_Doppler_wipeoff<<<blocksPerGrid, threadsPerBlock,0, stream1>>>(d_sig_doppler_wiped, d_sig_in,rem_carrier_phase_in_rad,phase_step_rad, signal_length_samples);
-
-    //printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
-
-    //wait for Doppler wipeoff end...
-    cudaStreamSynchronize(stream1);
-    cudaStreamSynchronize(stream2);
-    // (cudaDeviceSynchronize());
-
-    //old
-    // scalarProdGPUCPXxN<<<blocksPerGrid, threadsPerBlock,0 ,stream2>>>(
-    //     d_corr_out,
-    //     d_sig_doppler_wiped,
-    //     d_local_codes_in,
-    //     3,
-    //     signal_length_samples
-    // );
-
-    //new
-    //launch the multitap correlator
-    scalarProdGPUCPXxN_shifts<<<blocksPerGrid, threadsPerBlock,0 ,stream2>>>(
-        d_corr_out,
-        d_sig_doppler_wiped,
-        d_local_codes_in,
-        d_shifts_samples,
-        n_correlators,
-        signal_length_samples
-    );
-    cudaGetLastError();
-    //wait for correlators end...
-    cudaStreamSynchronize(stream2);
-    // Copy the device result vector in device memory to the host result vector
-    // in host memory.
-
-    //scalar products (correlators outputs)
-    cudaMemcpy(corr_out, d_corr_out, sizeof(std::complex<float>)*n_correlators,
-               cudaMemcpyDeviceToHost);
-    return true;
-}
+bool cuda_multicorrelator::set_input_output_vectors(
+    std::complex<float>* corr_out,
+    std::complex<float>* sig_in
+)
+{
+    // Save CPU pointers
+    d_sig_in_cpu =sig_in;
+    d_corr_out_cpu = corr_out;
+
+    // Zero Copy version
+    // Get device pointer from host memory. No allocation or memcpy
+    cudaError_t code;
+    code=cudaHostGetDevicePointer((void **)&d_sig_in, (void *) sig_in, 0);
+    code=cudaHostGetDevicePointer((void **)&d_corr_out, (void *) corr_out, 0);
+    if (code!=cudaSuccess)
+    {
+        printf("cuda cudaHostGetDevicePointer error \r\n");
+    }
+    return true;
+}
 
 bool cuda_multicorrelator::Carrier_wipeoff_multicorrelator_resampler_cuda(
-    std::complex<float>* corr_out,
-    const std::complex<float>* sig_in,
     float rem_carrier_phase_in_rad,
     float phase_step_rad,
     float code_phase_step_chips,
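Note: set_input_output_vectors caches the caller's host pointers and resolves their device aliases with cudaHostGetDevicePointer. This only succeeds when those buffers are mapped pinned memory, i.e. allocated with cudaHostAlloc(..., cudaHostAllocMapped) or page-locked afterwards with cudaHostRegister(..., cudaHostRegisterMapped). A caller-side usage sketch (variable names illustrative):

    std::complex<float> *corr_out = NULL, *sig_in = NULL;
    cudaHostAlloc((void **)&corr_out, sizeof(std::complex<float>) * n_correlators, cudaHostAllocMapped);
    cudaHostAlloc((void **)&sig_in, sizeof(std::complex<float>) * signal_length_samples, cudaHostAllocMapped);
    correlator.set_input_output_vectors(corr_out, sig_in);  // register once per buffer pair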
@@ -623,26 +398,40 @@ bool cuda_multicorrelator::Carrier_wipeoff_multicorrelator_resampler_cuda(
     int n_correlators)
 {
 
-    size_t memSize = signal_length_samples * sizeof(std::complex<float>);
+    // cudaMemCpy version
+    //size_t memSize = signal_length_samples * sizeof(std::complex<float>);
     // input signal CPU -> GPU copy memory
-    cudaMemcpyAsync(d_sig_in, sig_in, memSize,
-                    cudaMemcpyHostToDevice, stream2);
+    //cudaMemcpyAsync(d_sig_in, d_sig_in_cpu, memSize,
+    //                cudaMemcpyHostToDevice, stream2);
 
     //***** NOTICE: NCO is computed on-the-fly, not need to copy NCO into GPU! ****
 
     //Launch carrier wipe-off kernel here, while local codes are being copied to GPU!
-    cudaStreamSynchronize(stream2);
+    //cudaStreamSynchronize(stream2);
 
-    CUDA_32fc_Doppler_wipeoff<<<blocksPerGrid, threadsPerBlock,0, stream2>>>(d_sig_doppler_wiped, d_sig_in,rem_carrier_phase_in_rad,phase_step_rad, signal_length_samples);
+    //CUDA_32fc_Doppler_wipeoff<<<blocksPerGrid, threadsPerBlock,0, stream1>>>(d_sig_doppler_wiped, d_sig_in,rem_carrier_phase_in_rad,phase_step_rad, signal_length_samples);
 
     //wait for Doppler wipeoff end...
-    cudaStreamSynchronize(stream1);
-    cudaStreamSynchronize(stream2);
+    //cudaStreamSynchronize(stream1);
+    //cudaStreamSynchronize(stream2);
 
     //launch the multitap correlator with integrated local code resampler!
 
-    scalarProdGPUCPXxN_shifts_chips<<<blocksPerGrid, threadsPerBlock,0 ,stream1>>>(
+    // scalarProdGPUCPXxN_shifts_chips<<<blocksPerGrid, threadsPerBlock,0 ,stream1>>>(
+    //     d_corr_out,
+    //     d_sig_doppler_wiped,
+    //     d_local_codes_in,
+    //     d_shifts_chips,
+    //     d_code_length_chips,
+    //     code_phase_step_chips,
+    //     rem_code_phase_chips,
+    //     n_correlators,
+    //     signal_length_samples
+    // );
+
+    Doppler_wippe_scalarProdGPUCPXxN_shifts_chips<<<blocksPerGrid, threadsPerBlock,0 ,stream1>>>(
         d_corr_out,
+        d_sig_in,
         d_sig_doppler_wiped,
         d_local_codes_in,
         d_shifts_chips,
@@ -650,23 +439,33 @@ bool cuda_multicorrelator::Carrier_wipeoff_multicorrelator_resampler_cuda(
         code_phase_step_chips,
         rem_code_phase_chips,
         n_correlators,
-        signal_length_samples
+        signal_length_samples,
+        rem_carrier_phase_in_rad,
+        phase_step_rad
     );
 
-    cudaGetLastError();
+    //debug
+    // std::complex<float>* debug_signal;
+    // debug_signal=static_cast<std::complex<float>*>(malloc(memSize));
+    // cudaMemcpyAsync(debug_signal, d_sig_doppler_wiped, memSize,
+    //                 cudaMemcpyDeviceToHost,stream1);
+    // cudaStreamSynchronize(stream1);
+    // std::cout<<"d_sig_doppler_wiped GPU="<<debug_signal[456]<<","<<debug_signal[1]<<","<<debug_signal[2]<<","<<debug_signal[3]<<std::endl;
+
+    //cudaGetLastError();
     //wait for correlators end...
-    cudaStreamSynchronize(stream1);
+    //cudaStreamSynchronize(stream1);
     // Copy the device result vector in device memory to the host result vector
     // in host memory.
 
     //scalar products (correlators outputs)
-    cudaMemcpyAsync(corr_out, d_corr_out, sizeof(std::complex<float>)*n_correlators,
-                    cudaMemcpyDeviceToHost,stream1);
+    //cudaMemcpyAsync(corr_out, d_corr_out, sizeof(std::complex<float>)*n_correlators,
+    //                cudaMemcpyDeviceToHost,stream1);
 
     cudaStreamSynchronize(stream1);
     return true;
 }
 
 
 cuda_multicorrelator::cuda_multicorrelator()
 {
     d_sig_in=NULL;
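Note on the Carrier_wipeoff_multicorrelator_resampler_cuda changes above: with the pointers registered once, the per-integration call no longer issues any explicit cudaMemcpy. The kernel reads d_sig_in (the mapped input) and writes d_corr_out (the mapped correlator outputs) directly, and the final cudaStreamSynchronize(stream1) is what makes the results visible to the CPU. A sketch of the per-integration flow under the new API (parameter list inferred from this diff):

    // Once per buffer pair:
    correlator.set_input_output_vectors(corr_outs, in_buffer);
    // Every integration period: no host<->device copies, one fused kernel launch.
    correlator.Carrier_wipeoff_multicorrelator_resampler_cuda(
        rem_carrier_phase_in_rad, phase_step_rad,
        code_phase_step_chips, rem_code_phase_chips,
        signal_length_samples, n_correlators);
    // corr_outs is valid here: the call synchronizes stream1 before returning.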
@@ -689,22 +488,16 @@ bool cuda_multicorrelator::free_cuda()
     if (d_sig_doppler_wiped!=NULL) cudaFree(d_sig_doppler_wiped);
     if (d_local_codes_in!=NULL) cudaFree(d_local_codes_in);
     if (d_corr_out!=NULL) cudaFree(d_corr_out);
 
     if (d_shifts_samples!=NULL) cudaFree(d_shifts_samples);
     if (d_shifts_chips!=NULL) cudaFree(d_shifts_chips);
 
-    cudaStreamDestroy(stream1) ;
-    cudaStreamDestroy(stream2) ;
-
     // Reset the device and exit
     // cudaDeviceReset causes the driver to clean up all state. While
     // not mandatory in normal operation, it is good practice. It is also
     // needed to ensure correct operation when the application is being
     // profiled. Calling cudaDeviceReset causes all profile data to be
     // flushed before the application exits
-    // (cudaDeviceReset());
+    cudaDeviceReset();
     return true;
 }
 
@@ -114,9 +114,7 @@ class cuda_multicorrelator
 {
 public:
     cuda_multicorrelator();
-    bool init_cuda(const int argc, const char **argv, int signal_length_samples, int local_codes_length_samples, int n_correlators);
     bool init_cuda_integrated_resampler(
-        const int argc, const char **argv,
         int signal_length_samples,
         int code_length_chips,
         int n_correlators
@@ -127,19 +125,12 @@ public:
         float *shifts_chips,
         int n_correlators
     );
+    bool set_input_output_vectors(
+        std::complex<float>* corr_out,
+        std::complex<float>* sig_in
+    );
     bool free_cuda();
-    bool Carrier_wipeoff_multicorrelator_cuda(
-        std::complex<float>* corr_out,
-        const std::complex<float>* sig_in,
-        const std::complex<float>* local_codes_in,
-        float rem_carrier_phase_in_rad,
-        float phase_step_rad,
-        const int *shifts_samples,
-        int signal_length_samples,
-        int n_correlators);
     bool Carrier_wipeoff_multicorrelator_resampler_cuda(
-        std::complex<float>* corr_out,
-        const std::complex<float>* sig_in,
         float rem_carrier_phase_in_rad,
         float phase_step_rad,
         float code_phase_step_chips,
@@ -154,6 +145,11 @@ private:
     GPU_Complex *d_sig_doppler_wiped;
     GPU_Complex *d_local_codes_in;
     GPU_Complex *d_corr_out;
+
+    //
+    std::complex<float> *d_sig_in_cpu;
+    std::complex<float> *d_corr_out_cpu;
+
     int *d_shifts_samples;
     float *d_shifts_chips;
     float d_code_length_chips;
@@ -162,7 +158,7 @@ private:
     int blocksPerGrid;
 
     cudaStream_t stream1;
-    cudaStream_t stream2;
+    //cudaStream_t stream2;
     int num_gpu_devices;
     int selected_device;
 };