mirror of https://github.com/gnss-sdr/gnss-sdr
419 lines
15 KiB
Plaintext
419 lines
15 KiB
Plaintext
/*!
|
|
* \file cuda_multicorrelator.cu
|
|
* \brief High optimized CUDA GPU vector multiTAP correlator class
|
|
* \authors <ul>
|
|
* <li> Javier Arribas, 2015. jarribas(at)cttc.es
|
|
* </ul>
|
|
*
|
|
* Class that implements a high optimized vector multiTAP correlator class for NVIDIA CUDA GPUs
|
|
*
|
|
* -------------------------------------------------------------------------
|
|
*
|
|
* Copyright (C) 2010-2015 (see AUTHORS file for a list of contributors)
|
|
*
|
|
* GNSS-SDR is a software defined Global Navigation
|
|
* Satellite Systems receiver
|
|
*
|
|
* This file is part of GNSS-SDR.
|
|
*
|
|
* GNSS-SDR is free software: you can redistribute it and/or modify
|
|
* it under the terms of the GNU General Public License as published by
|
|
* the Free Software Foundation, either version 3 of the License, or
|
|
* (at your option) any later version.
|
|
*
|
|
* GNSS-SDR is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
* GNU General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU General Public License
|
|
* along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
|
|
*
|
|
* -------------------------------------------------------------------------
|
|
*/
|
|
|
|
///////////////////////////////////////////////////////////////////////////////
|
|
// On G80-class hardware 24-bit multiplication takes 4 clocks per warp
|
|
// (the same as for floating point multiplication and addition),
|
|
// whereas full 32-bit multiplication takes 16 clocks per warp.
|
|
// So if integer multiplication operands are guaranteed to fit into 24 bits
|
|
// (always lie withtin [-8M, 8M - 1] range in signed case),
|
|
// explicit 24-bit multiplication is preferred for performance.
|
|
///////////////////////////////////////////////////////////////////////////////
|
|
#define IMUL(a, b) __mul24(a, b)
|
|
|
|
#include "cuda_multicorrelator.h"
|
|
|
|
#include <stdio.h>
|
|
|
|
// For the CUDA runtime routines (prefixed with "cuda_")
|
|
#include <cuda_runtime.h>
|
|
|
|
// helper functions and utilities to work with CUDA
|
|
#include <helper_cuda.h>
|
|
#include <helper_functions.h>
|
|
|
|
#define ACCUM_N 1024
|
|
|
|
///////////////////////////////////////////////////////////////////////////////
|
|
// Calculate scalar products of VectorN vectors of ElementN elements on GPU
|
|
// Parameters restrictions:
|
|
// 1) ElementN is strongly preferred to be a multiple of warp size to
|
|
// meet alignment constraints of memory coalescing.
|
|
// 2) ACCUM_N must be a power of two.
|
|
///////////////////////////////////////////////////////////////////////////////
|
|
|
|
|
|
__global__ void scalarProdGPUCPXxN_shifts(
|
|
GPU_Complex *d_corr_out,
|
|
GPU_Complex *d_sig_in,
|
|
GPU_Complex *d_local_codes_in,
|
|
int *d_shifts_samples,
|
|
int vectorN,
|
|
int elementN
|
|
)
|
|
{
|
|
//Accumulators cache
|
|
__shared__ GPU_Complex accumResult[ACCUM_N];
|
|
|
|
////////////////////////////////////////////////////////////////////////////
|
|
// Cycle through every pair of vectors,
|
|
// taking into account that vector counts can be different
|
|
// from total number of thread blocks
|
|
////////////////////////////////////////////////////////////////////////////
|
|
for (int vec = blockIdx.x; vec < vectorN; vec += gridDim.x)
|
|
{
|
|
int vectorBase = IMUL(elementN, vec);
|
|
int vectorEnd = vectorBase + elementN;
|
|
|
|
////////////////////////////////////////////////////////////////////////
|
|
// Each accumulator cycles through vectors with
|
|
// stride equal to number of total number of accumulators ACCUM_N
|
|
// At this stage ACCUM_N is only preferred be a multiple of warp size
|
|
// to meet memory coalescing alignment constraints.
|
|
////////////////////////////////////////////////////////////////////////
|
|
for (int iAccum = threadIdx.x; iAccum < ACCUM_N; iAccum += blockDim.x)
|
|
{
|
|
GPU_Complex sum = GPU_Complex(0,0);
|
|
|
|
for (int pos = vectorBase + iAccum; pos < vectorEnd; pos += ACCUM_N)
|
|
{
|
|
//sum = sum + d_sig_in[pos-vectorBase] * d_nco_in[pos-vectorBase] * d_local_codes_in[pos];
|
|
//sum = sum + d_sig_in[pos-vectorBase] * d_local_codes_in[pos];
|
|
sum.multiply_acc(d_sig_in[pos-vectorBase],d_local_codes_in[pos-vectorBase+d_shifts_samples[vec]]);
|
|
}
|
|
accumResult[iAccum] = sum;
|
|
}
|
|
|
|
////////////////////////////////////////////////////////////////////////
|
|
// Perform tree-like reduction of accumulators' results.
|
|
// ACCUM_N has to be power of two at this stage
|
|
////////////////////////////////////////////////////////////////////////
|
|
for (int stride = ACCUM_N / 2; stride > 0; stride >>= 1)
|
|
{
|
|
__syncthreads();
|
|
|
|
for (int iAccum = threadIdx.x; iAccum < stride; iAccum += blockDim.x)
|
|
{
|
|
accumResult[iAccum] += accumResult[stride + iAccum];
|
|
}
|
|
}
|
|
|
|
if (threadIdx.x == 0)
|
|
{
|
|
d_corr_out[vec] = accumResult[0];
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
__global__ void scalarProdGPUCPXxN(
|
|
GPU_Complex *d_corr_out,
|
|
GPU_Complex *d_sig_in,
|
|
GPU_Complex *d_local_codes_in,
|
|
int vectorN,
|
|
int elementN
|
|
)
|
|
{
|
|
//Accumulators cache
|
|
__shared__ GPU_Complex accumResult[ACCUM_N];
|
|
|
|
////////////////////////////////////////////////////////////////////////////
|
|
// Cycle through every pair of vectors,
|
|
// taking into account that vector counts can be different
|
|
// from total number of thread blocks
|
|
////////////////////////////////////////////////////////////////////////////
|
|
for (int vec = blockIdx.x; vec < vectorN; vec += gridDim.x)
|
|
{
|
|
int vectorBase = IMUL(elementN, vec);
|
|
int vectorEnd = vectorBase + elementN;
|
|
|
|
////////////////////////////////////////////////////////////////////////
|
|
// Each accumulator cycles through vectors with
|
|
// stride equal to number of total number of accumulators ACCUM_N
|
|
// At this stage ACCUM_N is only preferred be a multiple of warp size
|
|
// to meet memory coalescing alignment constraints.
|
|
////////////////////////////////////////////////////////////////////////
|
|
for (int iAccum = threadIdx.x; iAccum < ACCUM_N; iAccum += blockDim.x)
|
|
{
|
|
GPU_Complex sum = GPU_Complex(0,0);
|
|
|
|
for (int pos = vectorBase + iAccum; pos < vectorEnd; pos += ACCUM_N)
|
|
{
|
|
//sum = sum + d_sig_in[pos-vectorBase] * d_nco_in[pos-vectorBase] * d_local_codes_in[pos];
|
|
//sum = sum + d_sig_in[pos-vectorBase] * d_local_codes_in[pos];
|
|
sum.multiply_acc(d_sig_in[pos-vectorBase],d_local_codes_in[pos]);
|
|
}
|
|
accumResult[iAccum] = sum;
|
|
}
|
|
|
|
////////////////////////////////////////////////////////////////////////
|
|
// Perform tree-like reduction of accumulators' results.
|
|
// ACCUM_N has to be power of two at this stage
|
|
////////////////////////////////////////////////////////////////////////
|
|
for (int stride = ACCUM_N / 2; stride > 0; stride >>= 1)
|
|
{
|
|
__syncthreads();
|
|
|
|
for (int iAccum = threadIdx.x; iAccum < stride; iAccum += blockDim.x)
|
|
{
|
|
accumResult[iAccum] += accumResult[stride + iAccum];
|
|
}
|
|
}
|
|
|
|
if (threadIdx.x == 0)
|
|
{
|
|
d_corr_out[vec] = accumResult[0];
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
//*********** CUDA processing **************
|
|
// Treads: a minimal parallel execution code on GPU
|
|
// Blocks: a set of N threads
|
|
/**
|
|
* CUDA Kernel Device code
|
|
*
|
|
* Computes the vectorial product of A and B into C. The 3 vectors have the same
|
|
* number of elements numElements.
|
|
*/
|
|
__global__ void CUDA_32fc_x2_multiply_32fc( GPU_Complex *A, GPU_Complex *B, GPU_Complex *C, int numElements)
|
|
{
|
|
int i = blockDim.x * blockIdx.x + threadIdx.x;
|
|
|
|
if (i < numElements)
|
|
{
|
|
C[i] = A[i] * B[i];
|
|
}
|
|
}
|
|
|
|
|
|
/**
|
|
* CUDA Kernel Device code
|
|
*
|
|
* Computes the carrier Doppler wipe-off by integrating the NCO in the CUDA kernel
|
|
*/
|
|
__global__ void
|
|
CUDA_32fc_Doppler_wipeoff( GPU_Complex *sig_out, GPU_Complex *sig_in, float rem_carrier_phase_in_rad, float phase_step_rad, int numElements)
|
|
{
|
|
//*** NCO CPU code (GNURadio FXP NCO)
|
|
//float sin_f, cos_f;
|
|
//float phase_step_rad = static_cast<float>(2 * GALILEO_PI) * d_carrier_doppler_hz / static_cast<float>(d_fs_in);
|
|
//int phase_step_rad_i = gr::fxpt::float_to_fixed(phase_step_rad);
|
|
//int phase_rad_i = gr::fxpt::float_to_fixed(d_rem_carr_phase_rad);
|
|
//
|
|
//for(int i = 0; i < d_current_prn_length_samples; i++)
|
|
// {
|
|
// gr::fxpt::sincos(phase_rad_i, &sin_f, &cos_f);
|
|
// d_carr_sign[i] = std::complex<float>(cos_f, -sin_f);
|
|
// phase_rad_i += phase_step_rad_i;
|
|
// }
|
|
|
|
// CUDA version of floating point NCO and vector dot product integrated
|
|
|
|
int i = blockDim.x * blockIdx.x + threadIdx.x;
|
|
float sin;
|
|
float cos;
|
|
if (i < numElements)
|
|
{
|
|
__sincosf(rem_carrier_phase_in_rad + i*phase_step_rad, &sin, &cos);
|
|
sig_out[i] = sig_in[i] * GPU_Complex(cos,-sin);
|
|
}
|
|
}
|
|
|
|
|
|
/**
|
|
* CUDA Kernel Device code
|
|
*
|
|
* Computes the vectorial product of A and B into C. The 3 vectors have the same
|
|
* number of elements numElements.
|
|
*/
|
|
__global__ void
|
|
CUDA_32fc_x2_add_32fc( GPU_Complex *A, GPU_Complex *B, GPU_Complex *C, int numElements)
|
|
{
|
|
int i = blockDim.x * blockIdx.x + threadIdx.x;
|
|
|
|
if (i < numElements)
|
|
{
|
|
C[i] = A[i] * B[i];
|
|
}
|
|
}
|
|
|
|
|
|
bool cuda_multicorrelator::init_cuda(const int argc, const char **argv, int signal_length_samples, int *shifts_samples, int n_correlators)
|
|
{
|
|
// use command-line specified CUDA device, otherwise use device with highest Gflops/s
|
|
findCudaDevice(argc, (const char **)argv);
|
|
|
|
cudaDeviceProp prop;
|
|
int whichDevice;
|
|
cudaGetDevice( &whichDevice );
|
|
cudaGetDeviceProperties( &prop, whichDevice );
|
|
//debug code
|
|
if (prop.canMapHostMemory != 1) {
|
|
printf( "Device can not map memory.\n" );
|
|
}
|
|
|
|
printf("L2 Cache size= %u \n",prop.l2CacheSize);
|
|
printf("maxThreadsPerBlock= %u \n",prop.maxThreadsPerBlock);
|
|
printf("maxGridSize= %i \n",prop.maxGridSize[0]);
|
|
printf("sharedMemPerBlock= %lu \n",prop.sharedMemPerBlock);
|
|
printf("deviceOverlap= %i \n",prop.deviceOverlap);
|
|
//end debug code
|
|
|
|
//checkCudaErrors(cudaFuncSetCacheConfig(CUDA_32fc_x2_multiply_x2_dot_prod_32fc_, cudaFuncCachePreferShared));
|
|
|
|
|
|
// ALLOCATE GPU MEMORY FOR INPUT/OUTPUT and INTERNAL vectors
|
|
|
|
size_t size = signal_length_samples * sizeof(GPU_Complex);
|
|
|
|
checkCudaErrors(cudaMalloc((void **)&d_sig_in, size));
|
|
//checkCudaErrors(cudaMalloc((void **)&d_nco_in, size));
|
|
checkCudaErrors(cudaMalloc((void **)&d_sig_doppler_wiped, size));
|
|
|
|
// old version: all local codes are independent vectors
|
|
//checkCudaErrors(cudaMalloc((void **)&d_local_codes_in, size*n_correlators));
|
|
|
|
// new version: only one vector with extra samples to shift the local code for the correlator set
|
|
// Required: The last correlator tap in d_shifts_samples has the largest sample shift
|
|
checkCudaErrors(cudaMalloc((void **)&d_local_codes_in, size+sizeof(GPU_Complex)*shifts_samples[n_correlators-1]));
|
|
checkCudaErrors(cudaMalloc((void **)&d_shifts_samples, size+sizeof(int)*n_correlators));
|
|
|
|
//scalars
|
|
checkCudaErrors(cudaMalloc((void **)&d_corr_out, sizeof(std::complex<float>)*n_correlators));
|
|
|
|
// Launch the Vector Add CUDA Kernel
|
|
threadsPerBlock = 256;
|
|
blocksPerGrid =(int)(signal_length_samples+threadsPerBlock-1)/threadsPerBlock;
|
|
|
|
return true;
|
|
}
|
|
|
|
|
|
bool cuda_multicorrelator::Carrier_wipeoff_multicorrelator_cuda(
|
|
std::complex<float>* corr_out,
|
|
const std::complex<float>* sig_in,
|
|
const std::complex<float>* local_codes_in,
|
|
float rem_carrier_phase_in_rad,
|
|
float phase_step_rad,
|
|
const int *shifts_samples,
|
|
int signal_length_samples,
|
|
int n_correlators)
|
|
{
|
|
|
|
cudaStream_t stream1;
|
|
cudaStream_t stream2;
|
|
cudaStreamCreate ( &stream1) ;
|
|
cudaStreamCreate ( &stream2) ;
|
|
|
|
size_t memSize = signal_length_samples * sizeof(std::complex<float>);
|
|
|
|
// input signal CPU -> GPU copy memory
|
|
checkCudaErrors(cudaMemcpyAsync(d_sig_in, sig_in, memSize,
|
|
cudaMemcpyHostToDevice, stream1));
|
|
|
|
//***** NOTICE: NCO is computed on-the-fly, not need to copy NCO into GPU! ****
|
|
//checkCudaErrors(cudaMemcpyAsync(d_nco_in, nco_in, memSize,
|
|
// cudaMemcpyHostToDevice, stream1));
|
|
|
|
|
|
// old version: all local codes are independent vectors
|
|
//checkCudaErrors(cudaMemcpyAsync(d_local_codes_in, local_codes_in, memSize*n_correlators,
|
|
// cudaMemcpyHostToDevice, stream2));
|
|
|
|
// new version: only one vector with extra samples to shift the local code for the correlator set
|
|
// Required: The last correlator tap in d_shifts_samples has the largest sample shift
|
|
|
|
// local code CPU -> GPU copy memory
|
|
checkCudaErrors(cudaMemcpyAsync(d_local_codes_in, local_codes_in, memSize+sizeof(std::complex<float>)*shifts_samples[n_correlators-1],
|
|
cudaMemcpyHostToDevice, stream2));
|
|
// Correlator shifts vector CPU -> GPU copy memory
|
|
checkCudaErrors(cudaMemcpyAsync(d_shifts_samples, shifts_samples, sizeof(int)*n_correlators,
|
|
cudaMemcpyHostToDevice, stream2));
|
|
|
|
|
|
//Launch carrier wipe-off kernel here, while local codes are being copied to GPU!
|
|
checkCudaErrors(cudaStreamSynchronize(stream1));
|
|
CUDA_32fc_Doppler_wipeoff<<<blocksPerGrid, threadsPerBlock,0, stream1>>>(d_sig_doppler_wiped, d_sig_in,rem_carrier_phase_in_rad,phase_step_rad, signal_length_samples);
|
|
|
|
|
|
//printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
|
|
|
|
//wait for Doppler wipeoff end...
|
|
checkCudaErrors(cudaDeviceSynchronize());
|
|
|
|
//old
|
|
// scalarProdGPUCPXxN<<<blocksPerGrid, threadsPerBlock,0 ,stream2>>>(
|
|
// d_corr_out,
|
|
// d_sig_doppler_wiped,
|
|
// d_local_codes_in,
|
|
// 3,
|
|
// signal_length_samples
|
|
// );
|
|
|
|
//new
|
|
//launch the multitap correlator
|
|
scalarProdGPUCPXxN_shifts<<<blocksPerGrid, threadsPerBlock,0 ,stream2>>>(
|
|
d_corr_out,
|
|
d_sig_doppler_wiped,
|
|
d_local_codes_in,
|
|
d_shifts_samples,
|
|
n_correlators,
|
|
signal_length_samples
|
|
);
|
|
checkCudaErrors(cudaGetLastError());
|
|
//wait for correlators end...
|
|
checkCudaErrors(cudaDeviceSynchronize());
|
|
// Copy the device result vector in device memory to the host result vector
|
|
// in host memory.
|
|
|
|
//scalar products (correlators outputs)
|
|
checkCudaErrors(cudaMemcpyAsync(corr_out, d_corr_out, sizeof(std::complex<float>)*n_correlators,
|
|
cudaMemcpyDeviceToHost, 0));
|
|
|
|
cudaStreamDestroy(stream1) ;
|
|
cudaStreamDestroy(stream2) ;
|
|
return true;
|
|
}
|
|
|
|
bool cuda_multicorrelator::free_cuda()
|
|
{
|
|
// Free device global memory
|
|
cudaFree(d_sig_in);
|
|
//cudaFree(d_nco_in);
|
|
cudaFree(d_local_codes_in);
|
|
cudaFree(d_corr_out);
|
|
|
|
// Reset the device and exit
|
|
// cudaDeviceReset causes the driver to clean up all state. While
|
|
// not mandatory in normal operation, it is good practice. It is also
|
|
// needed to ensure correct operation when the application is being
|
|
// profiled. Calling cudaDeviceReset causes all profile data to be
|
|
// flushed before the application exits
|
|
checkCudaErrors(cudaDeviceReset());
|
|
return true;
|
|
}
|
|
|