1
0
mirror of https://github.com/gnss-sdr/gnss-sdr synced 2025-10-30 23:03:05 +00:00

Some CUDA cleaning and documentation

This commit is contained in:
Carles Fernandez
2015-08-25 19:01:02 +02:00
parent 429e4e8776
commit ef136e5c74
6 changed files with 154 additions and 138 deletions

View File

@@ -51,7 +51,6 @@ class ConfigurationInterface;
class GpsL1CaDllPllTrackingGPU : public TrackingInterface
{
public:
GpsL1CaDllPllTrackingGPU(ConfigurationInterface* configuration,
std::string role,
unsigned int in_streams,
@@ -65,7 +64,7 @@ public:
return role_;
}
//! Returns "GPS_L1_CA_DLL_PLL_Tracking_GPU"
std::string implementation()
{
return "GPS_L1_CA_DLL_PLL_Tracking_GPU";

View File

@@ -123,7 +123,7 @@ Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc(
d_ca_code = static_cast<gr_complex*>(volk_malloc((GPS_L1_CA_CODE_LENGTH_CHIPS) * sizeof(gr_complex), volk_get_alignment()));
multicorrelator_gpu = new cuda_multicorrelator();
int N_CORRELATORS=3;
int N_CORRELATORS = 3;
//local code resampler on CPU (old)
//multicorrelator_gpu->init_cuda(0, NULL, 2 * d_vector_length , 2 * d_vector_length , N_CORRELATORS);
@@ -131,22 +131,22 @@ Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc(
multicorrelator_gpu->init_cuda_integrated_resampler(0, NULL, 2 * d_vector_length , GPS_L1_CA_CODE_LENGTH_CHIPS , N_CORRELATORS);
// Get space for the resampled early / prompt / late local replicas
checkCudaErrors(cudaHostAlloc((void**)&d_local_code_shift_chips, N_CORRELATORS * sizeof(float), cudaHostAllocMapped ));
checkCudaErrors(cudaHostAlloc((void**)&d_local_code_shift_chips, N_CORRELATORS * sizeof(float), cudaHostAllocMapped ));
//allocate host memory
//pinned memory mode - use special function to get OS-pinned memory
checkCudaErrors(cudaHostAlloc((void**)&in_gpu, 2 * d_vector_length * sizeof(gr_complex), cudaHostAllocMapped ));
checkCudaErrors(cudaHostAlloc((void**)&in_gpu, 2 * d_vector_length * sizeof(gr_complex), cudaHostAllocMapped ));
//old local codes vector
//checkCudaErrors(cudaHostAlloc((void**)&d_local_codes_gpu, (V_LEN * sizeof(gr_complex))*N_CORRELATORS, cudaHostAllocWriteCombined ));
//old local codes vector
//checkCudaErrors(cudaHostAlloc((void**)&d_local_codes_gpu, (V_LEN * sizeof(gr_complex))*N_CORRELATORS, cudaHostAllocWriteCombined ));
//new integrated shifts
//checkCudaErrors(cudaHostAlloc((void**)&d_local_codes_gpu, (2 * d_vector_length * sizeof(gr_complex)), cudaHostAllocWriteCombined ));
//new integrated shifts
//checkCudaErrors(cudaHostAlloc((void**)&d_local_codes_gpu, (2 * d_vector_length * sizeof(gr_complex)), cudaHostAllocWriteCombined ));
// correlator outputs (scalar)
checkCudaErrors(cudaHostAlloc((void**)&d_corr_outs_gpu ,sizeof(gr_complex)*N_CORRELATORS, cudaHostAllocWriteCombined ));
//map to EPL pointers
// correlator outputs (scalar)
checkCudaErrors(cudaHostAlloc((void**)&d_corr_outs_gpu ,sizeof(gr_complex)*N_CORRELATORS, cudaHostAllocWriteCombined ));
//map to EPL pointers
d_Early = &d_corr_outs_gpu[0];
d_Prompt = &d_corr_outs_gpu[1];
d_Late = &d_corr_outs_gpu[2];
@@ -284,13 +284,13 @@ Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::~Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc()
{
d_dump_file.close();
cudaFreeHost(in_gpu);
cudaFreeHost(d_carr_sign_gpu);
cudaFreeHost(d_corr_outs_gpu);
cudaFreeHost(d_local_code_shift_chips);
cudaFreeHost(in_gpu);
cudaFreeHost(d_carr_sign_gpu);
cudaFreeHost(d_corr_outs_gpu);
cudaFreeHost(d_local_code_shift_chips);
multicorrelator_gpu->free_cuda();
delete(multicorrelator_gpu);
multicorrelator_gpu->free_cuda();
delete(multicorrelator_gpu);
volk_free(d_ca_code);
@@ -339,7 +339,7 @@ int Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::general_work (int noutput_items, gr_vecto
// UPDATE NCO COMMAND
float phase_step_rad = static_cast<float>(GPS_TWO_PI) * d_carrier_doppler_hz / static_cast<float>(d_fs_in);
//code resampler on GPU (new)
//code resampler on GPU (new)
float code_phase_step_chips = static_cast<float>(d_code_freq_chips) / static_cast<float>(d_fs_in);
float rem_code_phase_chips = d_rem_code_phase_samples * (d_code_freq_chips / d_fs_in);
@@ -353,7 +353,7 @@ int Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::general_work (int noutput_items, gr_vecto
rem_code_phase_chips,
d_current_prn_length_samples,
3);
cudaProfilerStop();
cudaProfilerStop();
// ################## PLL ##########################################################
// PLL discriminator

View File

@@ -130,7 +130,7 @@ private:
gr_complex* in_gpu;
gr_complex* d_carr_sign_gpu;
gr_complex* d_local_codes_gpu;
float* d_local_code_shift_chips;
float* d_local_code_shift_chips;
gr_complex* d_corr_outs_gpu;
cuda_multicorrelator *multicorrelator_gpu;

View File

@@ -45,67 +45,67 @@
#endif
#include <complex>
#include <cuda.h>
// CUDA runtime
#include <cuda_runtime.h>
// GPU new internal data types for complex numbers
/*!
 * \brief Minimal single-precision complex number for the CUDA multicorrelator.
 *
 * Stores a real part \c r and an imaginary part \c i as plain floats. All
 * members carry the CUDA_CALLABLE_MEMBER_DEVICE qualifier. When the file is
 * compiled with __CUDACC__ defined the arithmetic uses the round-to-nearest
 * CUDA intrinsics (__fmul_rn / __fmaf_rn); otherwise it falls back to plain
 * float expressions that compute the same complex product.
 */
struct GPU_Complex
{
    float r;  //!< Real part
    float i;  //!< Imaginary part

    //! Default constructor with an empty body: r and i are left uninitialized.
    CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex() {}

    //! Builds the complex number a + j*b.
    CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex(float a, float b) : r(a), i(b) {}

    //! Returns the squared magnitude r^2 + i^2.
    CUDA_CALLABLE_MEMBER_DEVICE float magnitude2(void) const
    {
        return r * r + i * i;
    }

    //! Complex product (*this) * a.
    CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex operator*(const GPU_Complex& a) const
    {
#ifdef __CUDACC__
        return GPU_Complex(__fmul_rn(r, a.r) - __fmul_rn(i, a.i),
                           __fmul_rn(i, a.r) + __fmul_rn(r, a.i));
#else
        return GPU_Complex(r * a.r - i * a.i, i * a.r + r * a.i);
#endif
    }

    //! Complex sum (*this) + a.
    CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex operator+(const GPU_Complex& a) const
    {
        return GPU_Complex(r + a.r, i + a.i);
    }

    //! In-place accumulation: *this += a. Returns nothing (no chaining).
    CUDA_CALLABLE_MEMBER_DEVICE void operator+=(const GPU_Complex& a)
    {
        r += a.r;
        i += a.i;
    }

    /*!
     * \brief Multiply-accumulate: *this = a * b + *this.
     *
     * Real part:      r += a.r*b.r - a.i*b.i
     * Imaginary part: i += a.i*b.r + a.r*b.i
     */
    CUDA_CALLABLE_MEMBER_DEVICE void multiply_acc(const GPU_Complex& a, const GPU_Complex& b)
    {
#ifdef __CUDACC__
        // real part, two fused multiply-adds
        r = __fmaf_rn(a.r, b.r, r);
        r = __fmaf_rn(-a.i, b.i, r);
        // imaginary part, two fused multiply-adds
        i = __fmaf_rn(a.i, b.r, i);
        i = __fmaf_rn(a.r, b.i, i);
#else
        r = (a.r * b.r - a.i * b.i) + r;
        // The cross terms of the imaginary part are ADDED (as in the intrinsic
        // branch above and in operator*); subtracting them here was a sign bug.
        i = (a.i * b.r + a.r * b.i) + i;
#endif
    }
};
/*!
 * \brief Complex number constructed from two short integers.
 *
 * The components are taken as \c short \c int at construction but stored as
 * floats, so results of products and sums are kept in float and are not
 * limited to the short range.
 */
struct GPU_Complex_Short
{
    float r;  //!< Real part (stored as float)
    float i;  //!< Imaginary part (stored as float)

    //! Builds a + j*b from two short integers.
    CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex_Short(short int a, short int b) : r(a), i(b) {}

    //! Returns the squared magnitude r^2 + i^2.
    CUDA_CALLABLE_MEMBER_DEVICE float magnitude2(void) const
    {
        return r * r + i * i;
    }

    //! Complex product (*this) * a.
    CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex_Short operator*(const GPU_Complex_Short& a) const
    {
        // Write the float fields directly: routing the float result through the
        // (short, short) constructor would convert it back to short, which is
        // an out-of-range conversion as soon as a term exceeds the short range.
        GPU_Complex_Short prod(0, 0);
        prod.r = r * a.r - i * a.i;
        prod.i = i * a.r + r * a.i;
        return prod;
    }

    //! Complex sum (*this) + a.
    CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex_Short operator+(const GPU_Complex_Short& a) const
    {
        // Same reasoning as operator*: keep the float result unconverted.
        GPU_Complex_Short sum(0, 0);
        sum.r = r + a.r;
        sum.i = i + a.i;
        return sum;
    }
};
/*!
* \brief Class that implements carrier wipe-off and correlators using NVIDIA CUDA GPU accelerators.
@@ -113,58 +113,58 @@ struct GPU_Complex_Short {
class cuda_multicorrelator
{
public:
    cuda_multicorrelator();

    /*!
     * \brief Initialization entry point for the variant that receives local
     * codes with an explicit length in samples.
     *
     * \param argc, argv                  Command-line style arguments.
     *                                    NOTE(review): presumably used for device
     *                                    selection; confirm in the implementation.
     * \param signal_length_samples       Length of the input signal buffer, in samples.
     * \param local_codes_length_samples  Length of the local code buffer, in samples.
     * \param n_correlators               Number of correlators.
     * \return bool status; the meaning of true/false is defined by the
     *         implementation, which is not visible in this header.
     */
    bool init_cuda(const int argc, const char **argv, int signal_length_samples, int local_codes_length_samples, int n_correlators);

    /*!
     * \brief Initialization entry point for the variant where the local code
     * is given by its length in chips.
     *
     * \param argc, argv             Command-line style arguments (see init_cuda).
     * \param signal_length_samples  Length of the input signal buffer, in samples.
     * \param code_length_chips      Length of the local code, in chips.
     * \param n_correlators          Number of correlators.
     * \return bool status defined by the implementation.
     */
    bool init_cuda_integrated_resampler(
            const int argc, const char **argv,
            int signal_length_samples,
            int code_length_chips,
            int n_correlators
    );

    /*!
     * \brief Provides the local code and the per-correlator shifts.
     *
     * \param code_length_chips  Length of \p local_codes_in, in chips.
     * \param local_codes_in     Local code samples (complex float).
     * \param shifts_chips       Per-correlator shifts, in chips.
     *                           NOTE(review): presumably n_correlators entries; confirm.
     * \param n_correlators      Number of correlators.
     * \return bool status defined by the implementation.
     */
    bool set_local_code_and_taps(
            int code_length_chips,
            const std::complex<float>* local_codes_in,
            float *shifts_chips,
            int n_correlators
    );

    //! Counterpart of the init_* calls. NOTE(review): presumably releases the GPU resources; confirm.
    bool free_cuda();

    /*!
     * \brief Carrier wipe-off plus multicorrelation using externally supplied
     * local codes and integer sample shifts.
     *
     * \param corr_out                  Output buffer for the correlator results.
     * \param sig_in                    Input signal samples.
     * \param local_codes_in            Local code samples.
     * \param rem_carrier_phase_in_rad  Remnant carrier phase, in radians.
     * \param phase_step_rad            Carrier phase increment, in radians.
     * \param shifts_samples            Per-correlator shifts, in samples.
     * \param signal_length_samples     Number of input samples to process.
     * \param n_correlators             Number of correlators.
     * \return bool status defined by the implementation.
     */
    bool Carrier_wipeoff_multicorrelator_cuda(
            std::complex<float>* corr_out,
            const std::complex<float>* sig_in,
            const std::complex<float>* local_codes_in,
            float rem_carrier_phase_in_rad,
            float phase_step_rad,
            const int *shifts_samples,
            int signal_length_samples,
            int n_correlators);

    /*!
     * \brief Carrier wipe-off plus multicorrelation for the integrated
     * resampler variant: the code is described by a phase step and a remnant
     * phase in chips instead of a pre-resampled buffer.
     *
     * \param corr_out                  Output buffer for the correlator results.
     * \param sig_in                    Input signal samples.
     * \param rem_carrier_phase_in_rad  Remnant carrier phase, in radians.
     * \param phase_step_rad            Carrier phase increment, in radians.
     * \param code_phase_step_chips     Code phase increment, in chips.
     * \param rem_code_phase_chips      Remnant code phase, in chips.
     * \param signal_length_samples     Number of input samples to process.
     * \param n_correlators             Number of correlators.
     * \return bool status defined by the implementation.
     */
    bool Carrier_wipeoff_multicorrelator_resampler_cuda(
            std::complex<float>* corr_out,
            const std::complex<float>* sig_in,
            float rem_carrier_phase_in_rad,
            float phase_step_rad,
            float code_phase_step_chips,
            float rem_code_phase_chips,
            int signal_length_samples,
            int n_correlators);

private:
    // Pointers for the device input/output vectors
    GPU_Complex *d_sig_in;
    GPU_Complex *d_nco_in;
    GPU_Complex *d_sig_doppler_wiped;
    GPU_Complex *d_local_codes_in;
    GPU_Complex *d_corr_out;
    int *d_shifts_samples;
    float *d_shifts_chips;
    float d_code_length_chips;

    // Kernel launch configuration
    int threadsPerBlock;
    int blocksPerGrid;

    // CUDA streams
    cudaStream_t stream1;
    cudaStream_t stream2;

    // Device bookkeeping
    int num_gpu_devices;
    int selected_device;
};