From 4fc61af172469968c4bd7821cd3633ddea077f6c Mon Sep 17 00:00:00 2001 From: Javier Arribas Date: Wed, 22 Jul 2015 18:16:54 +0200 Subject: [PATCH 1/5] Adding cuda ultra-fast correlator library. Not used yet, but optionally compiled. All CMAKEs ready! --- CMakeLists.txt | 14 + src/algorithms/tracking/libs/CMakeLists.txt | 28 +- .../tracking/libs/cuda_multicorrelator.cu | 418 +++++ .../tracking/libs/cuda_multicorrelator.h | 138 ++ .../tracking/libs/cudahelpers/exception.h | 151 ++ .../tracking/libs/cudahelpers/helper_cuda.h | 1255 ++++++++++++++ .../libs/cudahelpers/helper_cuda_drvapi.h | 517 ++++++ .../libs/cudahelpers/helper_cuda_gl.h | 165 ++ .../libs/cudahelpers/helper_functions.h | 42 + .../tracking/libs/cudahelpers/helper_image.h | 1110 +++++++++++++ .../tracking/libs/cudahelpers/helper_math.h | 1453 +++++++++++++++++ .../tracking/libs/cudahelpers/helper_string.h | 516 ++++++ .../tracking/libs/cudahelpers/helper_timer.h | 499 ++++++ 13 files changed, 6304 insertions(+), 2 deletions(-) create mode 100644 src/algorithms/tracking/libs/cuda_multicorrelator.cu create mode 100644 src/algorithms/tracking/libs/cuda_multicorrelator.h create mode 100644 src/algorithms/tracking/libs/cudahelpers/exception.h create mode 100644 src/algorithms/tracking/libs/cudahelpers/helper_cuda.h create mode 100644 src/algorithms/tracking/libs/cudahelpers/helper_cuda_drvapi.h create mode 100644 src/algorithms/tracking/libs/cudahelpers/helper_cuda_gl.h create mode 100644 src/algorithms/tracking/libs/cudahelpers/helper_functions.h create mode 100644 src/algorithms/tracking/libs/cudahelpers/helper_image.h create mode 100644 src/algorithms/tracking/libs/cudahelpers/helper_math.h create mode 100644 src/algorithms/tracking/libs/cudahelpers/helper_string.h create mode 100644 src/algorithms/tracking/libs/cudahelpers/helper_timer.h diff --git a/CMakeLists.txt b/CMakeLists.txt index b6ee58a87..2cc688523 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -957,6 +957,20 @@ else(ENABLE_OSMOSDR) message(STATUS "Enable it with 'cmake -DENABLE_OSMOSDR=ON ../' to add support for OsmoSDR and other front-ends (HackRF, bladeRF, Realtek's RTL2832U-based USB dongles, etc.)" ) endif(ENABLE_OSMOSDR) +if($ENV{CUDA_GPU_ACCEL}) + message(STATUS "CUDA_GPU_ACCEL environment variable found." ) + set(ENABLE_CUDA ON) +endif($ENV{CUDA_GPU_ACCEL}) + +if(ENABLE_CUDA) + message(STATUS "NVIDIA CUDA GPU Acceleration will be enabled." ) + message(STATUS "You can disable it with 'cmake -DENABLE_CUDA=OFF ../'" ) +else(ENABLE_CUDA) + message(STATUS "NVIDIA CUDA GPU Acceleration will is not enabled." ) + message(STATUS "Enable it with 'cmake -DENABLE_CUDA=ON ../' to add support for the Teleorbit Flexiband front-end." ) +endif(ENABLE_CUDA) + + if($ENV{FLEXIBAND_DRIVER}) message(STATUS "FLEXIBAND_DRIVER environment variable found." ) set(ENABLE_FLEXIBAND ON) diff --git a/src/algorithms/tracking/libs/CMakeLists.txt b/src/algorithms/tracking/libs/CMakeLists.txt index 52470bc3b..aa5fb0036 100644 --- a/src/algorithms/tracking/libs/CMakeLists.txt +++ b/src/algorithms/tracking/libs/CMakeLists.txt @@ -16,6 +16,29 @@ # along with GNSS-SDR. If not, see . # + +if(ENABLE_CUDA) + FIND_PACKAGE(CUDA REQUIRED) + + # Append current NVCC flags by something, eg comput capability + # set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} --gpu-architecture sm_30 --default-stream-per-thread) + + list(APPEND CUDA_NVCC_FLAGS "-gencode arch=compute_30,code=sm_30; -std=c++11;-O3; -use_fast_math") + SET(CUDA_PROPAGATE_HOST_FLAGS OFF) + + CUDA_INCLUDE_DIRECTORIES( + ${CMAKE_CURRENT_SOURCE_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}/cudahelpers + ) + + SET(LIB_TYPE STATIC) #set the lib type + CUDA_ADD_LIBRARY(CUDA_CORRELATOR_LIB ${LIB_TYPE} cuda_multicorrelator.h cuda_multicorrelator.cu) + + set(OPT_LIBRARIES ${OPT_LIBRARIES} ${CUDA_CORRELATOR_LIB}) + +endif(ENABLE_CUDA) + + set(TRACKING_LIB_SOURCES correlator.cc lock_detectors.cc @@ -24,7 +47,7 @@ set(TRACKING_LIB_SOURCES tracking_2nd_DLL_filter.cc tracking_2nd_PLL_filter.cc tracking_discriminators.cc - tracking_FLL_PLL_filter.cc + tracking_FLL_PLL_filter.cc ) include_directories( @@ -43,7 +66,8 @@ if (SSE3_AVAILABLE) add_definitions( -DHAVE_SSE3=1 ) endif(SSE3_AVAILABLE) + file(GLOB TRACKING_LIB_HEADERS "*.h") add_library(tracking_lib ${TRACKING_LIB_SOURCES} ${TRACKING_LIB_HEADERS}) source_group(Headers FILES ${TRACKING_LIB_HEADERS}) -target_link_libraries(tracking_lib ${VOLK_LIBRARIES} ${GNURADIO_RUNTIME_LIBRARIES}) \ No newline at end of file +target_link_libraries(tracking_lib ${VOLK_LIBRARIES} ${GNURADIO_RUNTIME_LIBRARIES} ${OPT_LIBRARIES}) \ No newline at end of file diff --git a/src/algorithms/tracking/libs/cuda_multicorrelator.cu b/src/algorithms/tracking/libs/cuda_multicorrelator.cu new file mode 100644 index 000000000..1909e13af --- /dev/null +++ b/src/algorithms/tracking/libs/cuda_multicorrelator.cu @@ -0,0 +1,418 @@ +/*! + * \file cuda_multicorrelator.cu + * \brief High optimized CUDA GPU vector multiTAP correlator class + * \authors + * + * Class that implements a high optimized vector multiTAP correlator class for NVIDIA CUDA GPUs + * + * ------------------------------------------------------------------------- + * + * Copyright (C) 2010-2015 (see AUTHORS file for a list of contributors) + * + * GNSS-SDR is a software defined Global Navigation + * Satellite Systems receiver + * + * This file is part of GNSS-SDR. + * + * GNSS-SDR is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * GNSS-SDR is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNSS-SDR. If not, see . + * + * ------------------------------------------------------------------------- + */ + +/////////////////////////////////////////////////////////////////////////////// +// On G80-class hardware 24-bit multiplication takes 4 clocks per warp +// (the same as for floating point multiplication and addition), +// whereas full 32-bit multiplication takes 16 clocks per warp. +// So if integer multiplication operands are guaranteed to fit into 24 bits +// (always lie withtin [-8M, 8M - 1] range in signed case), +// explicit 24-bit multiplication is preferred for performance. +/////////////////////////////////////////////////////////////////////////////// +#define IMUL(a, b) __mul24(a, b) + +#include "cuda_multicorrelator.h" + +#include + +// For the CUDA runtime routines (prefixed with "cuda_") +#include + +// helper functions and utilities to work with CUDA +#include +#include + +#define ACCUM_N 1024 + +/////////////////////////////////////////////////////////////////////////////// +// Calculate scalar products of VectorN vectors of ElementN elements on GPU +// Parameters restrictions: +// 1) ElementN is strongly preferred to be a multiple of warp size to +// meet alignment constraints of memory coalescing. +// 2) ACCUM_N must be a power of two. +/////////////////////////////////////////////////////////////////////////////// + + +__global__ void scalarProdGPUCPXxN_shifts( + GPU_Complex *d_corr_out, + GPU_Complex *d_sig_in, + GPU_Complex *d_local_codes_in, + int *d_shifts_samples, + int vectorN, + int elementN +) +{ + //Accumulators cache + __shared__ GPU_Complex accumResult[ACCUM_N]; + + //////////////////////////////////////////////////////////////////////////// + // Cycle through every pair of vectors, + // taking into account that vector counts can be different + // from total number of thread blocks + //////////////////////////////////////////////////////////////////////////// + for (int vec = blockIdx.x; vec < vectorN; vec += gridDim.x) + { + int vectorBase = IMUL(elementN, vec); + int vectorEnd = vectorBase + elementN; + + //////////////////////////////////////////////////////////////////////// + // Each accumulator cycles through vectors with + // stride equal to number of total number of accumulators ACCUM_N + // At this stage ACCUM_N is only preferred be a multiple of warp size + // to meet memory coalescing alignment constraints. + //////////////////////////////////////////////////////////////////////// + for (int iAccum = threadIdx.x; iAccum < ACCUM_N; iAccum += blockDim.x) + { + GPU_Complex sum = GPU_Complex(0,0); + + for (int pos = vectorBase + iAccum; pos < vectorEnd; pos += ACCUM_N) + { + //sum = sum + d_sig_in[pos-vectorBase] * d_nco_in[pos-vectorBase] * d_local_codes_in[pos]; + //sum = sum + d_sig_in[pos-vectorBase] * d_local_codes_in[pos]; + sum.multiply_acc(d_sig_in[pos-vectorBase],d_local_codes_in[pos-vectorBase+d_shifts_samples[vec]]); + } + accumResult[iAccum] = sum; + } + + //////////////////////////////////////////////////////////////////////// + // Perform tree-like reduction of accumulators' results. + // ACCUM_N has to be power of two at this stage + //////////////////////////////////////////////////////////////////////// + for (int stride = ACCUM_N / 2; stride > 0; stride >>= 1) + { + __syncthreads(); + + for (int iAccum = threadIdx.x; iAccum < stride; iAccum += blockDim.x) + { + accumResult[iAccum] += accumResult[stride + iAccum]; + } + } + + if (threadIdx.x == 0) + { + d_corr_out[vec] = accumResult[0]; + } + } +} + + +__global__ void scalarProdGPUCPXxN( + GPU_Complex *d_corr_out, + GPU_Complex *d_sig_in, + GPU_Complex *d_local_codes_in, + int vectorN, + int elementN +) +{ + //Accumulators cache + __shared__ GPU_Complex accumResult[ACCUM_N]; + + //////////////////////////////////////////////////////////////////////////// + // Cycle through every pair of vectors, + // taking into account that vector counts can be different + // from total number of thread blocks + //////////////////////////////////////////////////////////////////////////// + for (int vec = blockIdx.x; vec < vectorN; vec += gridDim.x) + { + int vectorBase = IMUL(elementN, vec); + int vectorEnd = vectorBase + elementN; + + //////////////////////////////////////////////////////////////////////// + // Each accumulator cycles through vectors with + // stride equal to number of total number of accumulators ACCUM_N + // At this stage ACCUM_N is only preferred be a multiple of warp size + // to meet memory coalescing alignment constraints. + //////////////////////////////////////////////////////////////////////// + for (int iAccum = threadIdx.x; iAccum < ACCUM_N; iAccum += blockDim.x) + { + GPU_Complex sum = GPU_Complex(0,0); + + for (int pos = vectorBase + iAccum; pos < vectorEnd; pos += ACCUM_N) + { + //sum = sum + d_sig_in[pos-vectorBase] * d_nco_in[pos-vectorBase] * d_local_codes_in[pos]; + //sum = sum + d_sig_in[pos-vectorBase] * d_local_codes_in[pos]; + sum.multiply_acc(d_sig_in[pos-vectorBase],d_local_codes_in[pos]); + } + accumResult[iAccum] = sum; + } + + //////////////////////////////////////////////////////////////////////// + // Perform tree-like reduction of accumulators' results. + // ACCUM_N has to be power of two at this stage + //////////////////////////////////////////////////////////////////////// + for (int stride = ACCUM_N / 2; stride > 0; stride >>= 1) + { + __syncthreads(); + + for (int iAccum = threadIdx.x; iAccum < stride; iAccum += blockDim.x) + { + accumResult[iAccum] += accumResult[stride + iAccum]; + } + } + + if (threadIdx.x == 0) + { + d_corr_out[vec] = accumResult[0]; + } + } +} + + +//*********** CUDA processing ************** +// Treads: a minimal parallel execution code on GPU +// Blocks: a set of N threads +/** + * CUDA Kernel Device code + * + * Computes the vectorial product of A and B into C. The 3 vectors have the same + * number of elements numElements. + */ +__global__ void CUDA_32fc_x2_multiply_32fc( GPU_Complex *A, GPU_Complex *B, GPU_Complex *C, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + C[i] = A[i] * B[i]; + } +} + + +/** + * CUDA Kernel Device code + * + * Computes the carrier Doppler wipe-off by integrating the NCO in the CUDA kernel + */ +__global__ void +CUDA_32fc_Doppler_wipeoff( GPU_Complex *sig_out, GPU_Complex *sig_in, float rem_carrier_phase_in_rad, float phase_step_rad, int numElements) +{ + //*** NCO CPU code (GNURadio FXP NCO) + //float sin_f, cos_f; + //float phase_step_rad = static_cast(2 * GALILEO_PI) * d_carrier_doppler_hz / static_cast(d_fs_in); + //int phase_step_rad_i = gr::fxpt::float_to_fixed(phase_step_rad); + //int phase_rad_i = gr::fxpt::float_to_fixed(d_rem_carr_phase_rad); + // + //for(int i = 0; i < d_current_prn_length_samples; i++) + // { + // gr::fxpt::sincos(phase_rad_i, &sin_f, &cos_f); + // d_carr_sign[i] = std::complex(cos_f, -sin_f); + // phase_rad_i += phase_step_rad_i; + // } + + // CUDA version of floating point NCO and vector dot product integrated + + int i = blockDim.x * blockIdx.x + threadIdx.x; + float sin; + float cos; + if (i < numElements) + { + __sincosf(rem_carrier_phase_in_rad + i*phase_step_rad, &sin, &cos); + sig_out[i] = sig_in[i] * GPU_Complex(cos,-sin); + } +} + + +/** + * CUDA Kernel Device code + * + * Computes the vectorial product of A and B into C. The 3 vectors have the same + * number of elements numElements. + */ +__global__ void +CUDA_32fc_x2_add_32fc( GPU_Complex *A, GPU_Complex *B, GPU_Complex *C, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + C[i] = A[i] * B[i]; + } +} + + +bool cuda_multicorrelator::init_cuda(const int argc, const char **argv, int signal_length_samples, int *shifts_samples, int n_correlators) +{ + // use command-line specified CUDA device, otherwise use device with highest Gflops/s + findCudaDevice(argc, (const char **)argv); + + cudaDeviceProp prop; + int whichDevice; + cudaGetDevice( &whichDevice ); + cudaGetDeviceProperties( &prop, whichDevice ); + //debug code + if (prop.canMapHostMemory != 1) { + printf( "Device can not map memory.\n" ); + } + + printf("L2 Cache size= %u \n",prop.l2CacheSize); + printf("maxThreadsPerBlock= %u \n",prop.maxThreadsPerBlock); + printf("maxGridSize= %i \n",prop.maxGridSize[0]); + printf("sharedMemPerBlock= %lu \n",prop.sharedMemPerBlock); + printf("deviceOverlap= %i \n",prop.deviceOverlap); + //end debug code + + //checkCudaErrors(cudaFuncSetCacheConfig(CUDA_32fc_x2_multiply_x2_dot_prod_32fc_, cudaFuncCachePreferShared)); + + + // ALLOCATE GPU MEMORY FOR INPUT/OUTPUT and INTERNAL vectors + + size_t size = signal_length_samples * sizeof(GPU_Complex); + + checkCudaErrors(cudaMalloc((void **)&d_sig_in, size)); + //checkCudaErrors(cudaMalloc((void **)&d_nco_in, size)); + checkCudaErrors(cudaMalloc((void **)&d_sig_doppler_wiped, size)); + + // old version: all local codes are independent vectors + //checkCudaErrors(cudaMalloc((void **)&d_local_codes_in, size*n_correlators)); + + // new version: only one vector with extra samples to shift the local code for the correlator set + // Required: The last correlator tap in d_shifts_samples has the largest sample shift + checkCudaErrors(cudaMalloc((void **)&d_local_codes_in, size+sizeof(GPU_Complex)*shifts_samples[n_correlators-1])); + checkCudaErrors(cudaMalloc((void **)&d_shifts_samples, size+sizeof(int)*n_correlators)); + + //scalars + checkCudaErrors(cudaMalloc((void **)&d_corr_out, sizeof(std::complex)*n_correlators)); + + // Launch the Vector Add CUDA Kernel + threadsPerBlock = 256; + blocksPerGrid =(int)(signal_length_samples+threadsPerBlock-1)/threadsPerBlock; + + return true; +} + + +bool cuda_multicorrelator::Carrier_wipeoff_multicorrelator_cuda( + std::complex* corr_out, + const std::complex* sig_in, + const std::complex* local_codes_in, + float rem_carrier_phase_in_rad, + float phase_step_rad, + const int *shifts_samples, + int signal_length_samples, + int n_correlators) + { + + cudaStream_t stream1; + cudaStream_t stream2; + cudaStreamCreate ( &stream1) ; + cudaStreamCreate ( &stream2) ; + + size_t memSize = signal_length_samples * sizeof(std::complex); + + // input signal CPU -> GPU copy memory + checkCudaErrors(cudaMemcpyAsync(d_sig_in, sig_in, memSize, + cudaMemcpyHostToDevice, stream1)); + + //***** NOTICE: NCO is computed on-the-fly, not need to copy NCO into GPU! **** + //checkCudaErrors(cudaMemcpyAsync(d_nco_in, nco_in, memSize, + // cudaMemcpyHostToDevice, stream1)); + + + // old version: all local codes are independent vectors + //checkCudaErrors(cudaMemcpyAsync(d_local_codes_in, local_codes_in, memSize*n_correlators, + // cudaMemcpyHostToDevice, stream2)); + + // new version: only one vector with extra samples to shift the local code for the correlator set + // Required: The last correlator tap in d_shifts_samples has the largest sample shift + + // local code CPU -> GPU copy memory + checkCudaErrors(cudaMemcpyAsync(d_local_codes_in, local_codes_in, memSize+sizeof(std::complex)*shifts_samples[n_correlators-1], + cudaMemcpyHostToDevice, stream2)); + // Correlator shifts vector CPU -> GPU copy memory + checkCudaErrors(cudaMemcpyAsync(d_shifts_samples, shifts_samples, sizeof(int)*n_correlators, + cudaMemcpyHostToDevice, stream2)); + + + //Launch carrier wipe-off kernel here, while local codes are being copied to GPU! + checkCudaErrors(cudaStreamSynchronize(stream1)); + CUDA_32fc_Doppler_wipeoff<<>>(d_sig_doppler_wiped, d_sig_in,rem_carrier_phase_in_rad,phase_step_rad, signal_length_samples); + + + //printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock); + + //wait for Doppler wipeoff end... + checkCudaErrors(cudaDeviceSynchronize()); + + //old +// scalarProdGPUCPXxN<<>>( +// d_corr_out, +// d_sig_doppler_wiped, +// d_local_codes_in, +// 3, +// signal_length_samples +// ); + + //new + //launch the multitap correlator + scalarProdGPUCPXxN_shifts<<>>( + d_corr_out, + d_sig_doppler_wiped, + d_local_codes_in, + d_shifts_samples, + n_correlators, + signal_length_samples + ); + checkCudaErrors(cudaGetLastError()); + //wait for correlators end... + checkCudaErrors(cudaDeviceSynchronize()); + // Copy the device result vector in device memory to the host result vector + // in host memory. + + //scalar products (correlators outputs) + checkCudaErrors(cudaMemcpyAsync(corr_out, d_corr_out, sizeof(std::complex)*n_correlators, + cudaMemcpyDeviceToHost, 0)); + + cudaStreamDestroy(stream1) ; + cudaStreamDestroy(stream2) ; + return true; +} + +bool cuda_multicorrelator::free_cuda() +{ + // Free device global memory + cudaFree(d_sig_in); + //cudaFree(d_nco_in); + cudaFree(d_local_codes_in); + cudaFree(d_corr_out); + + // Reset the device and exit + // cudaDeviceReset causes the driver to clean up all state. While + // not mandatory in normal operation, it is good practice. It is also + // needed to ensure correct operation when the application is being + // profiled. Calling cudaDeviceReset causes all profile data to be + // flushed before the application exits + checkCudaErrors(cudaDeviceReset()); + return true; +} + diff --git a/src/algorithms/tracking/libs/cuda_multicorrelator.h b/src/algorithms/tracking/libs/cuda_multicorrelator.h new file mode 100644 index 000000000..8819edf9a --- /dev/null +++ b/src/algorithms/tracking/libs/cuda_multicorrelator.h @@ -0,0 +1,138 @@ +/*! + * \file cuda_multicorrelator.h + * \brief High optimized CUDA GPU vector multiTAP correlator class + * \authors
    + *
  • Javier Arribas, 2015. jarribas(at)cttc.es + *
+ * + * Class that implements a high optimized vector multiTAP correlator class for NVIDIA CUDA GPUs + * + * ------------------------------------------------------------------------- + * + * Copyright (C) 2010-2015 (see AUTHORS file for a list of contributors) + * + * GNSS-SDR is a software defined Global Navigation + * Satellite Systems receiver + * + * This file is part of GNSS-SDR. + * + * GNSS-SDR is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * GNSS-SDR is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNSS-SDR. If not, see . + * + * ------------------------------------------------------------------------- + */ + +#ifndef CUDA_MULTICORRELATOR_H_ +#define CUDA_MULTICORRELATOR_H_ + + +#ifdef __CUDACC__ +#define CUDA_CALLABLE_MEMBER_GLOBAL __global__ +#define CUDA_CALLABLE_MEMBER_DEVICE __device__ +#else +#define CUDA_CALLABLE_MEMBER_GLOBAL +#define CUDA_CALLABLE_MEMBER_DEVICE +#endif + +#include + +// GPU new internal data types for complex numbers + +struct GPU_Complex { + float r; + float i; + CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex() {}; + CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex( float a, float b ) : r(a), i(b) {} + CUDA_CALLABLE_MEMBER_DEVICE float magnitude2( void ) { + return r * r + i * i; + } + CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex operator*(const GPU_Complex& a) { + #ifdef __CUDACC__ + return GPU_Complex(__fmul_rn(r,a.r) - __fmul_rn(i,a.i), __fmul_rn(i,a.r) + __fmul_rn(r,a.i)); + #else + return GPU_Complex(r*a.r - i*a.i, i*a.r + r*a.i); + #endif + } + CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex operator+(const GPU_Complex& a) { + return GPU_Complex(r+a.r, i+a.i); + } + CUDA_CALLABLE_MEMBER_DEVICE void operator+=(const GPU_Complex& a) { + r+=a.r; + i+=a.i; + } + CUDA_CALLABLE_MEMBER_DEVICE void multiply_acc(const GPU_Complex& a, const GPU_Complex& b) + { + //c=a*b+c + //real part + //c.r=(a.r*b.r - a.i*b.i)+c.r + #ifdef __CUDACC__ + r=__fmaf_rn(a.r,b.r,r); + r=__fmaf_rn(-a.i,b.i,r); + //imag part + i=__fmaf_rn(a.i,b.r,i); + i=__fmaf_rn(a.r,b.i,i); + #else + r=(a.r*b.r - a.i*b.i)+r; + i=(a.i*b.r - a.r*b.i)+i; + #endif + + } +}; + +struct GPU_Complex_Short { + float r; + float i; + CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex_Short( short int a, short int b ) : r(a), i(b) {} + CUDA_CALLABLE_MEMBER_DEVICE float magnitude2( void ) { + return r * r + i * i; + } + CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex_Short operator*(const GPU_Complex_Short& a) { + return GPU_Complex_Short(r*a.r - i*a.i, i*a.r + r*a.i); + } + CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex_Short operator+(const GPU_Complex_Short& a) { + return GPU_Complex_Short(r+a.r, i+a.i); + } +}; +/*! + * \brief Class that implements carrier wipe-off and correlators using NVIDIA CUDA GPU accelerators. + */ +class cuda_multicorrelator +{ +public: + bool init_cuda(const int argc, const char **argv, int signal_length_samples, int *shifts_samples, int n_correlators); + + bool free_cuda(); + bool Carrier_wipeoff_multicorrelator_cuda( + std::complex* corr_out, + const std::complex* sig_in, + const std::complex* local_codes_in, + float rem_carrier_phase_in_rad, + float phase_step_rad, + const int *shifts_samples, + int signal_length_samples, + int n_correlators); +private: + // Allocate the device input vectors + GPU_Complex *d_sig_in; + GPU_Complex *d_nco_in; + GPU_Complex *d_sig_doppler_wiped; + GPU_Complex *d_local_codes_in; + GPU_Complex *d_corr_out; + int *d_shifts_samples; + int threadsPerBlock; + int blocksPerGrid; + +}; + + +#endif /* CUDA_MULTICORRELATOR_H_ */ diff --git a/src/algorithms/tracking/libs/cudahelpers/exception.h b/src/algorithms/tracking/libs/cudahelpers/exception.h new file mode 100644 index 000000000..adda4bce6 --- /dev/null +++ b/src/algorithms/tracking/libs/cudahelpers/exception.h @@ -0,0 +1,151 @@ +/* +* Copyright 1993-2013 NVIDIA Corporation. All rights reserved. +* +* Please refer to the NVIDIA end user license agreement (EULA) associated +* with this source code for terms and conditions that govern your use of +* this software. Any use, reproduction, disclosure, or distribution of +* this software and related documentation outside the terms of the EULA +* is strictly prohibited. +* +*/ + +/* CUda UTility Library */ +#ifndef _EXCEPTION_H_ +#define _EXCEPTION_H_ + +// includes, system +#include +#include +#include +#include + +//! Exception wrapper. +//! @param Std_Exception Exception out of namespace std for easy typing. +template +class Exception : public Std_Exception +{ + public: + + //! @brief Static construction interface + //! @return Alwayss throws ( Located_Exception) + //! @param file file in which the Exception occurs + //! @param line line in which the Exception occurs + //! @param detailed details on the code fragment causing the Exception + static void throw_it(const char *file, + const int line, + const char *detailed = "-"); + + //! Static construction interface + //! @return Alwayss throws ( Located_Exception) + //! @param file file in which the Exception occurs + //! @param line line in which the Exception occurs + //! @param detailed details on the code fragment causing the Exception + static void throw_it(const char *file, + const int line, + const std::string &detailed); + + //! Destructor + virtual ~Exception() throw(); + + private: + + //! Constructor, default (private) + Exception(); + + //! Constructor, standard + //! @param str string returned by what() + Exception(const std::string &str); + +}; + +//////////////////////////////////////////////////////////////////////////////// +//! Exception handler function for arbitrary exceptions +//! @param ex exception to handle +//////////////////////////////////////////////////////////////////////////////// +template +inline void +handleException(const Exception_Typ &ex) +{ + std::cerr << ex.what() << std::endl; + + exit(EXIT_FAILURE); +} + +//! Convenience macros + +//! Exception caused by dynamic program behavior, e.g. file does not exist +#define RUNTIME_EXCEPTION( msg) \ + Exception::throw_it( __FILE__, __LINE__, msg) + +//! Logic exception in program, e.g. an assert failed +#define LOGIC_EXCEPTION( msg) \ + Exception::throw_it( __FILE__, __LINE__, msg) + +//! Out of range exception +#define RANGE_EXCEPTION( msg) \ + Exception::throw_it( __FILE__, __LINE__, msg) + +//////////////////////////////////////////////////////////////////////////////// +//! Implementation + +// includes, system +#include + +//////////////////////////////////////////////////////////////////////////////// +//! Static construction interface. +//! @param Exception causing code fragment (file and line) and detailed infos. +//////////////////////////////////////////////////////////////////////////////// +/*static*/ template +void +Exception:: +throw_it(const char *file, const int line, const char *detailed) +{ + std::stringstream s; + + // Quiet heavy-weight but exceptions are not for + // performance / release versions + s << "Exception in file '" << file << "' in line " << line << "\n" + << "Detailed description: " << detailed << "\n"; + + throw Exception(s.str()); +} + +//////////////////////////////////////////////////////////////////////////////// +//! Static construction interface. +//! @param Exception causing code fragment (file and line) and detailed infos. +//////////////////////////////////////////////////////////////////////////////// +/*static*/ template +void +Exception:: +throw_it(const char *file, const int line, const std::string &msg) +{ + throw_it(file, line, msg.c_str()); +} + +//////////////////////////////////////////////////////////////////////////////// +//! Constructor, default (private). +//////////////////////////////////////////////////////////////////////////////// +template +Exception::Exception() : + Std_Exception("Unknown Exception.\n") +{ } + +//////////////////////////////////////////////////////////////////////////////// +//! Constructor, standard (private). +//! String returned by what(). +//////////////////////////////////////////////////////////////////////////////// +template +Exception::Exception(const std::string &s) : + Std_Exception(s) +{ } + +//////////////////////////////////////////////////////////////////////////////// +//! Destructor +//////////////////////////////////////////////////////////////////////////////// +template +Exception::~Exception() throw() { } + +// functions, exported + +#endif // #ifndef _EXCEPTION_H_ + diff --git a/src/algorithms/tracking/libs/cudahelpers/helper_cuda.h b/src/algorithms/tracking/libs/cudahelpers/helper_cuda.h new file mode 100644 index 000000000..1d3e9202f --- /dev/null +++ b/src/algorithms/tracking/libs/cudahelpers/helper_cuda.h @@ -0,0 +1,1255 @@ +/** + * Copyright 1993-2013 NVIDIA Corporation. All rights reserved. + * + * Please refer to the NVIDIA end user license agreement (EULA) associated + * with this source code for terms and conditions that govern your use of + * this software. Any use, reproduction, disclosure, or distribution of + * this software and related documentation outside the terms of the EULA + * is strictly prohibited. + * + */ + +//////////////////////////////////////////////////////////////////////////////// +// These are CUDA Helper functions for initialization and error checking + +#ifndef HELPER_CUDA_H +#define HELPER_CUDA_H + +#pragma once + +#include +#include +#include + +#include + +#ifndef EXIT_WAIVED +#define EXIT_WAIVED 2 +#endif + +// Note, it is required that your SDK sample to include the proper header files, please +// refer the CUDA examples for examples of the needed CUDA headers, which may change depending +// on which CUDA functions are used. + +// CUDA Runtime error messages +#ifdef __DRIVER_TYPES_H__ +static const char *_cudaGetErrorEnum(cudaError_t error) +{ + switch (error) + { + case cudaSuccess: + return "cudaSuccess"; + + case cudaErrorMissingConfiguration: + return "cudaErrorMissingConfiguration"; + + case cudaErrorMemoryAllocation: + return "cudaErrorMemoryAllocation"; + + case cudaErrorInitializationError: + return "cudaErrorInitializationError"; + + case cudaErrorLaunchFailure: + return "cudaErrorLaunchFailure"; + + case cudaErrorPriorLaunchFailure: + return "cudaErrorPriorLaunchFailure"; + + case cudaErrorLaunchTimeout: + return "cudaErrorLaunchTimeout"; + + case cudaErrorLaunchOutOfResources: + return "cudaErrorLaunchOutOfResources"; + + case cudaErrorInvalidDeviceFunction: + return "cudaErrorInvalidDeviceFunction"; + + case cudaErrorInvalidConfiguration: + return "cudaErrorInvalidConfiguration"; + + case cudaErrorInvalidDevice: + return "cudaErrorInvalidDevice"; + + case cudaErrorInvalidValue: + return "cudaErrorInvalidValue"; + + case cudaErrorInvalidPitchValue: + return "cudaErrorInvalidPitchValue"; + + case cudaErrorInvalidSymbol: + return "cudaErrorInvalidSymbol"; + + case cudaErrorMapBufferObjectFailed: + return "cudaErrorMapBufferObjectFailed"; + + case cudaErrorUnmapBufferObjectFailed: + return "cudaErrorUnmapBufferObjectFailed"; + + case cudaErrorInvalidHostPointer: + return "cudaErrorInvalidHostPointer"; + + case cudaErrorInvalidDevicePointer: + return "cudaErrorInvalidDevicePointer"; + + case cudaErrorInvalidTexture: + return "cudaErrorInvalidTexture"; + + case cudaErrorInvalidTextureBinding: + return "cudaErrorInvalidTextureBinding"; + + case cudaErrorInvalidChannelDescriptor: + return "cudaErrorInvalidChannelDescriptor"; + + case cudaErrorInvalidMemcpyDirection: + return "cudaErrorInvalidMemcpyDirection"; + + case cudaErrorAddressOfConstant: + return "cudaErrorAddressOfConstant"; + + case cudaErrorTextureFetchFailed: + return "cudaErrorTextureFetchFailed"; + + case cudaErrorTextureNotBound: + return "cudaErrorTextureNotBound"; + + case cudaErrorSynchronizationError: + return "cudaErrorSynchronizationError"; + + case cudaErrorInvalidFilterSetting: + return "cudaErrorInvalidFilterSetting"; + + case cudaErrorInvalidNormSetting: + return "cudaErrorInvalidNormSetting"; + + case cudaErrorMixedDeviceExecution: + return "cudaErrorMixedDeviceExecution"; + + case cudaErrorCudartUnloading: + return "cudaErrorCudartUnloading"; + + case cudaErrorUnknown: + return "cudaErrorUnknown"; + + case cudaErrorNotYetImplemented: + return "cudaErrorNotYetImplemented"; + + case cudaErrorMemoryValueTooLarge: + return "cudaErrorMemoryValueTooLarge"; + + case cudaErrorInvalidResourceHandle: + return "cudaErrorInvalidResourceHandle"; + + case cudaErrorNotReady: + return "cudaErrorNotReady"; + + case cudaErrorInsufficientDriver: + return "cudaErrorInsufficientDriver"; + + case cudaErrorSetOnActiveProcess: + return "cudaErrorSetOnActiveProcess"; + + case cudaErrorInvalidSurface: + return "cudaErrorInvalidSurface"; + + case cudaErrorNoDevice: + return "cudaErrorNoDevice"; + + case cudaErrorECCUncorrectable: + return "cudaErrorECCUncorrectable"; + + case cudaErrorSharedObjectSymbolNotFound: + return "cudaErrorSharedObjectSymbolNotFound"; + + case cudaErrorSharedObjectInitFailed: + return "cudaErrorSharedObjectInitFailed"; + + case cudaErrorUnsupportedLimit: + return "cudaErrorUnsupportedLimit"; + + case cudaErrorDuplicateVariableName: + return "cudaErrorDuplicateVariableName"; + + case cudaErrorDuplicateTextureName: + return "cudaErrorDuplicateTextureName"; + + case cudaErrorDuplicateSurfaceName: + return "cudaErrorDuplicateSurfaceName"; + + case cudaErrorDevicesUnavailable: + return "cudaErrorDevicesUnavailable"; + + case cudaErrorInvalidKernelImage: + return "cudaErrorInvalidKernelImage"; + + case cudaErrorNoKernelImageForDevice: + return "cudaErrorNoKernelImageForDevice"; + + case cudaErrorIncompatibleDriverContext: + return "cudaErrorIncompatibleDriverContext"; + + case cudaErrorPeerAccessAlreadyEnabled: + return "cudaErrorPeerAccessAlreadyEnabled"; + + case cudaErrorPeerAccessNotEnabled: + return "cudaErrorPeerAccessNotEnabled"; + + case cudaErrorDeviceAlreadyInUse: + return "cudaErrorDeviceAlreadyInUse"; + + case cudaErrorProfilerDisabled: + return "cudaErrorProfilerDisabled"; + + case cudaErrorProfilerNotInitialized: + return "cudaErrorProfilerNotInitialized"; + + case cudaErrorProfilerAlreadyStarted: + return "cudaErrorProfilerAlreadyStarted"; + + case cudaErrorProfilerAlreadyStopped: + return "cudaErrorProfilerAlreadyStopped"; + + /* Since CUDA 4.0*/ + case cudaErrorAssert: + return "cudaErrorAssert"; + + case cudaErrorTooManyPeers: + return "cudaErrorTooManyPeers"; + + case cudaErrorHostMemoryAlreadyRegistered: + return "cudaErrorHostMemoryAlreadyRegistered"; + + case cudaErrorHostMemoryNotRegistered: + return "cudaErrorHostMemoryNotRegistered"; + + /* Since CUDA 5.0 */ + case cudaErrorOperatingSystem: + return "cudaErrorOperatingSystem"; + + case cudaErrorPeerAccessUnsupported: + return "cudaErrorPeerAccessUnsupported"; + + case cudaErrorLaunchMaxDepthExceeded: + return "cudaErrorLaunchMaxDepthExceeded"; + + case cudaErrorLaunchFileScopedTex: + return "cudaErrorLaunchFileScopedTex"; + + case cudaErrorLaunchFileScopedSurf: + return "cudaErrorLaunchFileScopedSurf"; + + case cudaErrorSyncDepthExceeded: + return "cudaErrorSyncDepthExceeded"; + + case cudaErrorLaunchPendingCountExceeded: + return "cudaErrorLaunchPendingCountExceeded"; + + case cudaErrorNotPermitted: + return "cudaErrorNotPermitted"; + + case cudaErrorNotSupported: + return "cudaErrorNotSupported"; + + /* Since CUDA 6.0 */ + case cudaErrorHardwareStackError: + return "cudaErrorHardwareStackError"; + + case cudaErrorIllegalInstruction: + return "cudaErrorIllegalInstruction"; + + case cudaErrorMisalignedAddress: + return "cudaErrorMisalignedAddress"; + + case cudaErrorInvalidAddressSpace: + return "cudaErrorInvalidAddressSpace"; + + case cudaErrorInvalidPc: + return "cudaErrorInvalidPc"; + + case cudaErrorIllegalAddress: + return "cudaErrorIllegalAddress"; + + /* Since CUDA 6.5*/ + case cudaErrorInvalidPtx: + return "cudaErrorInvalidPtx"; + + case cudaErrorInvalidGraphicsContext: + return "cudaErrorInvalidGraphicsContext"; + + case cudaErrorStartupFailure: + return "cudaErrorStartupFailure"; + + case cudaErrorApiFailureBase: + return "cudaErrorApiFailureBase"; + } + + return ""; +} +#endif + +#ifdef __cuda_cuda_h__ +// CUDA Driver API errors +static const char *_cudaGetErrorEnum(CUresult error) +{ + switch (error) + { + case CUDA_SUCCESS: + return "CUDA_SUCCESS"; + + case CUDA_ERROR_INVALID_VALUE: + return "CUDA_ERROR_INVALID_VALUE"; + + case CUDA_ERROR_OUT_OF_MEMORY: + return "CUDA_ERROR_OUT_OF_MEMORY"; + + case CUDA_ERROR_NOT_INITIALIZED: + return "CUDA_ERROR_NOT_INITIALIZED"; + + case CUDA_ERROR_DEINITIALIZED: + return "CUDA_ERROR_DEINITIALIZED"; + + case CUDA_ERROR_PROFILER_DISABLED: + return "CUDA_ERROR_PROFILER_DISABLED"; + + case CUDA_ERROR_PROFILER_NOT_INITIALIZED: + return "CUDA_ERROR_PROFILER_NOT_INITIALIZED"; + + case CUDA_ERROR_PROFILER_ALREADY_STARTED: + return "CUDA_ERROR_PROFILER_ALREADY_STARTED"; + + case CUDA_ERROR_PROFILER_ALREADY_STOPPED: + return "CUDA_ERROR_PROFILER_ALREADY_STOPPED"; + + case CUDA_ERROR_NO_DEVICE: + return "CUDA_ERROR_NO_DEVICE"; + + case CUDA_ERROR_INVALID_DEVICE: + return "CUDA_ERROR_INVALID_DEVICE"; + + case CUDA_ERROR_INVALID_IMAGE: + return "CUDA_ERROR_INVALID_IMAGE"; + + case CUDA_ERROR_INVALID_CONTEXT: + return "CUDA_ERROR_INVALID_CONTEXT"; + + case CUDA_ERROR_CONTEXT_ALREADY_CURRENT: + return "CUDA_ERROR_CONTEXT_ALREADY_CURRENT"; + + case CUDA_ERROR_MAP_FAILED: + return "CUDA_ERROR_MAP_FAILED"; + + case CUDA_ERROR_UNMAP_FAILED: + return "CUDA_ERROR_UNMAP_FAILED"; + + case CUDA_ERROR_ARRAY_IS_MAPPED: + return "CUDA_ERROR_ARRAY_IS_MAPPED"; + + case CUDA_ERROR_ALREADY_MAPPED: + return "CUDA_ERROR_ALREADY_MAPPED"; + + case CUDA_ERROR_NO_BINARY_FOR_GPU: + return "CUDA_ERROR_NO_BINARY_FOR_GPU"; + + case CUDA_ERROR_ALREADY_ACQUIRED: + return "CUDA_ERROR_ALREADY_ACQUIRED"; + + case CUDA_ERROR_NOT_MAPPED: + return "CUDA_ERROR_NOT_MAPPED"; + + case CUDA_ERROR_NOT_MAPPED_AS_ARRAY: + return "CUDA_ERROR_NOT_MAPPED_AS_ARRAY"; + + case CUDA_ERROR_NOT_MAPPED_AS_POINTER: + return "CUDA_ERROR_NOT_MAPPED_AS_POINTER"; + + case CUDA_ERROR_ECC_UNCORRECTABLE: + return "CUDA_ERROR_ECC_UNCORRECTABLE"; + + case CUDA_ERROR_UNSUPPORTED_LIMIT: + return "CUDA_ERROR_UNSUPPORTED_LIMIT"; + + case CUDA_ERROR_CONTEXT_ALREADY_IN_USE: + return "CUDA_ERROR_CONTEXT_ALREADY_IN_USE"; + + case CUDA_ERROR_PEER_ACCESS_UNSUPPORTED: + return "CUDA_ERROR_PEER_ACCESS_UNSUPPORTED"; + + case CUDA_ERROR_INVALID_PTX: + return "CUDA_ERROR_INVALID_PTX"; + + case CUDA_ERROR_INVALID_GRAPHICS_CONTEXT: + return "CUDA_ERROR_INVALID_GRAPHICS_CONTEXT"; + + case CUDA_ERROR_INVALID_SOURCE: + return "CUDA_ERROR_INVALID_SOURCE"; + + case CUDA_ERROR_FILE_NOT_FOUND: + return "CUDA_ERROR_FILE_NOT_FOUND"; + + case CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND: + return "CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND"; + + case CUDA_ERROR_SHARED_OBJECT_INIT_FAILED: + return "CUDA_ERROR_SHARED_OBJECT_INIT_FAILED"; + + case CUDA_ERROR_OPERATING_SYSTEM: + return "CUDA_ERROR_OPERATING_SYSTEM"; + + case CUDA_ERROR_INVALID_HANDLE: + return "CUDA_ERROR_INVALID_HANDLE"; + + case CUDA_ERROR_NOT_FOUND: + return "CUDA_ERROR_NOT_FOUND"; + + case CUDA_ERROR_NOT_READY: + return "CUDA_ERROR_NOT_READY"; + + case CUDA_ERROR_ILLEGAL_ADDRESS: + return "CUDA_ERROR_ILLEGAL_ADDRESS"; + + case CUDA_ERROR_LAUNCH_FAILED: + return "CUDA_ERROR_LAUNCH_FAILED"; + + case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: + return "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES"; + + case CUDA_ERROR_LAUNCH_TIMEOUT: + return "CUDA_ERROR_LAUNCH_TIMEOUT"; + + case CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING: + return "CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING"; + + case CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED: + return "CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED"; + + case CUDA_ERROR_PEER_ACCESS_NOT_ENABLED: + return "CUDA_ERROR_PEER_ACCESS_NOT_ENABLED"; + + case CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE: + return "CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE"; + + case CUDA_ERROR_CONTEXT_IS_DESTROYED: + return "CUDA_ERROR_CONTEXT_IS_DESTROYED"; + + case CUDA_ERROR_ASSERT: + return "CUDA_ERROR_ASSERT"; + + case CUDA_ERROR_TOO_MANY_PEERS: + return "CUDA_ERROR_TOO_MANY_PEERS"; + + case CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED: + return "CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED"; + + case CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED: + return "CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED"; + + case CUDA_ERROR_HARDWARE_STACK_ERROR: + return "CUDA_ERROR_HARDWARE_STACK_ERROR"; + + case CUDA_ERROR_ILLEGAL_INSTRUCTION: + return "CUDA_ERROR_ILLEGAL_INSTRUCTION"; + + case CUDA_ERROR_MISALIGNED_ADDRESS: + return "CUDA_ERROR_MISALIGNED_ADDRESS"; + + case CUDA_ERROR_INVALID_ADDRESS_SPACE: + return "CUDA_ERROR_INVALID_ADDRESS_SPACE"; + + case CUDA_ERROR_INVALID_PC: + return "CUDA_ERROR_INVALID_PC"; + + case CUDA_ERROR_NOT_PERMITTED: + return "CUDA_ERROR_NOT_PERMITTED"; + + case CUDA_ERROR_NOT_SUPPORTED: + return "CUDA_ERROR_NOT_SUPPORTED"; + + case CUDA_ERROR_UNKNOWN: + return "CUDA_ERROR_UNKNOWN"; + } + + return ""; +} +#endif + +#ifdef CUBLAS_API_H_ +// cuBLAS API errors +static const char *_cudaGetErrorEnum(cublasStatus_t error) +{ + switch (error) + { + case CUBLAS_STATUS_SUCCESS: + return "CUBLAS_STATUS_SUCCESS"; + + case CUBLAS_STATUS_NOT_INITIALIZED: + return "CUBLAS_STATUS_NOT_INITIALIZED"; + + case CUBLAS_STATUS_ALLOC_FAILED: + return "CUBLAS_STATUS_ALLOC_FAILED"; + + case CUBLAS_STATUS_INVALID_VALUE: + return "CUBLAS_STATUS_INVALID_VALUE"; + + case CUBLAS_STATUS_ARCH_MISMATCH: + return "CUBLAS_STATUS_ARCH_MISMATCH"; + + case CUBLAS_STATUS_MAPPING_ERROR: + return "CUBLAS_STATUS_MAPPING_ERROR"; + + case CUBLAS_STATUS_EXECUTION_FAILED: + return "CUBLAS_STATUS_EXECUTION_FAILED"; + + case CUBLAS_STATUS_INTERNAL_ERROR: + return "CUBLAS_STATUS_INTERNAL_ERROR"; + } + + return ""; +} +#endif + +#ifdef _CUFFT_H_ +// cuFFT API errors +static const char *_cudaGetErrorEnum(cufftResult error) +{ + switch (error) + { + case CUFFT_SUCCESS: + return "CUFFT_SUCCESS"; + + case CUFFT_INVALID_PLAN: + return "CUFFT_INVALID_PLAN"; + + case CUFFT_ALLOC_FAILED: + return "CUFFT_ALLOC_FAILED"; + + case CUFFT_INVALID_TYPE: + return "CUFFT_INVALID_TYPE"; + + case CUFFT_INVALID_VALUE: + return "CUFFT_INVALID_VALUE"; + + case CUFFT_INTERNAL_ERROR: + return "CUFFT_INTERNAL_ERROR"; + + case CUFFT_EXEC_FAILED: + return "CUFFT_EXEC_FAILED"; + + case CUFFT_SETUP_FAILED: + return "CUFFT_SETUP_FAILED"; + + case CUFFT_INVALID_SIZE: + return "CUFFT_INVALID_SIZE"; + + case CUFFT_UNALIGNED_DATA: + return "CUFFT_UNALIGNED_DATA"; + + case CUFFT_INCOMPLETE_PARAMETER_LIST: + return "CUFFT_INCOMPLETE_PARAMETER_LIST"; + + case CUFFT_INVALID_DEVICE: + return "CUFFT_INVALID_DEVICE"; + + case CUFFT_PARSE_ERROR: + return "CUFFT_PARSE_ERROR"; + + case CUFFT_NO_WORKSPACE: + return "CUFFT_NO_WORKSPACE"; + + case CUFFT_NOT_IMPLEMENTED: + return "CUFFT_NOT_IMPLEMENTED"; + + case CUFFT_LICENSE_ERROR: + return "CUFFT_LICENSE_ERROR"; + } + + return ""; +} +#endif + + +#ifdef CUSPARSEAPI +// cuSPARSE API errors +static const char *_cudaGetErrorEnum(cusparseStatus_t error) +{ + switch (error) + { + case CUSPARSE_STATUS_SUCCESS: + return "CUSPARSE_STATUS_SUCCESS"; + + case CUSPARSE_STATUS_NOT_INITIALIZED: + return "CUSPARSE_STATUS_NOT_INITIALIZED"; + + case CUSPARSE_STATUS_ALLOC_FAILED: + return "CUSPARSE_STATUS_ALLOC_FAILED"; + + case CUSPARSE_STATUS_INVALID_VALUE: + return "CUSPARSE_STATUS_INVALID_VALUE"; + + case CUSPARSE_STATUS_ARCH_MISMATCH: + return "CUSPARSE_STATUS_ARCH_MISMATCH"; + + case CUSPARSE_STATUS_MAPPING_ERROR: + return "CUSPARSE_STATUS_MAPPING_ERROR"; + + case CUSPARSE_STATUS_EXECUTION_FAILED: + return "CUSPARSE_STATUS_EXECUTION_FAILED"; + + case CUSPARSE_STATUS_INTERNAL_ERROR: + return "CUSPARSE_STATUS_INTERNAL_ERROR"; + + case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED: + return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; + } + + return ""; +} +#endif + +#ifdef CUSOLVER_COMMON_H_ +//cuSOLVER API errors +static const char *_cudaGetErrorEnum(cusolverStatus_t error) +{ + switch(error) + { + case CUSOLVER_STATUS_SUCCESS: + return "CUSOLVER_STATUS_SUCCESS"; + case CUSOLVER_STATUS_NOT_INITIALIZED: + return "CUSOLVER_STATUS_NOT_INITIALIZED"; + case CUSOLVER_STATUS_ALLOC_FAILED: + return "CUSOLVER_STATUS_ALLOC_FAILED"; + case CUSOLVER_STATUS_INVALID_VALUE: + return "CUSOLVER_STATUS_INVALID_VALUE"; + case CUSOLVER_STATUS_ARCH_MISMATCH: + return "CUSOLVER_STATUS_ARCH_MISMATCH"; + case CUSOLVER_STATUS_MAPPING_ERROR: + return "CUSOLVER_STATUS_MAPPING_ERROR"; + case CUSOLVER_STATUS_EXECUTION_FAILED: + return "CUSOLVER_STATUS_EXECUTION_FAILED"; + case CUSOLVER_STATUS_INTERNAL_ERROR: + return "CUSOLVER_STATUS_INTERNAL_ERROR"; + case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED: + return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; + case CUSOLVER_STATUS_NOT_SUPPORTED : + return "CUSOLVER_STATUS_NOT_SUPPORTED "; + case CUSOLVER_STATUS_ZERO_PIVOT: + return "CUSOLVER_STATUS_ZERO_PIVOT"; + case CUSOLVER_STATUS_INVALID_LICENSE: + return "CUSOLVER_STATUS_INVALID_LICENSE"; + } + + return ""; + +} +#endif + +#ifdef CURAND_H_ +// cuRAND API errors +static const char *_cudaGetErrorEnum(curandStatus_t error) +{ + switch (error) + { + case CURAND_STATUS_SUCCESS: + return "CURAND_STATUS_SUCCESS"; + + case CURAND_STATUS_VERSION_MISMATCH: + return "CURAND_STATUS_VERSION_MISMATCH"; + + case CURAND_STATUS_NOT_INITIALIZED: + return "CURAND_STATUS_NOT_INITIALIZED"; + + case CURAND_STATUS_ALLOCATION_FAILED: + return "CURAND_STATUS_ALLOCATION_FAILED"; + + case CURAND_STATUS_TYPE_ERROR: + return "CURAND_STATUS_TYPE_ERROR"; + + case CURAND_STATUS_OUT_OF_RANGE: + return "CURAND_STATUS_OUT_OF_RANGE"; + + case CURAND_STATUS_LENGTH_NOT_MULTIPLE: + return "CURAND_STATUS_LENGTH_NOT_MULTIPLE"; + + case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED: + return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED"; + + case CURAND_STATUS_LAUNCH_FAILURE: + return "CURAND_STATUS_LAUNCH_FAILURE"; + + case CURAND_STATUS_PREEXISTING_FAILURE: + return "CURAND_STATUS_PREEXISTING_FAILURE"; + + case CURAND_STATUS_INITIALIZATION_FAILED: + return "CURAND_STATUS_INITIALIZATION_FAILED"; + + case CURAND_STATUS_ARCH_MISMATCH: + return "CURAND_STATUS_ARCH_MISMATCH"; + + case CURAND_STATUS_INTERNAL_ERROR: + return "CURAND_STATUS_INTERNAL_ERROR"; + } + + return ""; +} +#endif + +#ifdef NV_NPPIDEFS_H +// NPP API errors +static const char *_cudaGetErrorEnum(NppStatus error) +{ + switch (error) + { + case NPP_NOT_SUPPORTED_MODE_ERROR: + return "NPP_NOT_SUPPORTED_MODE_ERROR"; + + case NPP_ROUND_MODE_NOT_SUPPORTED_ERROR: + return "NPP_ROUND_MODE_NOT_SUPPORTED_ERROR"; + + case NPP_RESIZE_NO_OPERATION_ERROR: + return "NPP_RESIZE_NO_OPERATION_ERROR"; + + case NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY: + return "NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY"; + +#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000 + + case NPP_BAD_ARG_ERROR: + return "NPP_BAD_ARGUMENT_ERROR"; + + case NPP_COEFF_ERROR: + return "NPP_COEFFICIENT_ERROR"; + + case NPP_RECT_ERROR: + return "NPP_RECTANGLE_ERROR"; + + case NPP_QUAD_ERROR: + return "NPP_QUADRANGLE_ERROR"; + + case NPP_MEM_ALLOC_ERR: + return "NPP_MEMORY_ALLOCATION_ERROR"; + + case NPP_HISTO_NUMBER_OF_LEVELS_ERROR: + return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR"; + + case NPP_INVALID_INPUT: + return "NPP_INVALID_INPUT"; + + case NPP_POINTER_ERROR: + return "NPP_POINTER_ERROR"; + + case NPP_WARNING: + return "NPP_WARNING"; + + case NPP_ODD_ROI_WARNING: + return "NPP_ODD_ROI_WARNING"; +#else + + // These are for CUDA 5.5 or higher + case NPP_BAD_ARGUMENT_ERROR: + return "NPP_BAD_ARGUMENT_ERROR"; + + case NPP_COEFFICIENT_ERROR: + return "NPP_COEFFICIENT_ERROR"; + + case NPP_RECTANGLE_ERROR: + return "NPP_RECTANGLE_ERROR"; + + case NPP_QUADRANGLE_ERROR: + return "NPP_QUADRANGLE_ERROR"; + + case NPP_MEMORY_ALLOCATION_ERR: + return "NPP_MEMORY_ALLOCATION_ERROR"; + + case NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR: + return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR"; + + case NPP_INVALID_HOST_POINTER_ERROR: + return "NPP_INVALID_HOST_POINTER_ERROR"; + + case NPP_INVALID_DEVICE_POINTER_ERROR: + return "NPP_INVALID_DEVICE_POINTER_ERROR"; +#endif + + case NPP_LUT_NUMBER_OF_LEVELS_ERROR: + return "NPP_LUT_NUMBER_OF_LEVELS_ERROR"; + + case NPP_TEXTURE_BIND_ERROR: + return "NPP_TEXTURE_BIND_ERROR"; + + case NPP_WRONG_INTERSECTION_ROI_ERROR: + return "NPP_WRONG_INTERSECTION_ROI_ERROR"; + + case NPP_NOT_EVEN_STEP_ERROR: + return "NPP_NOT_EVEN_STEP_ERROR"; + + case NPP_INTERPOLATION_ERROR: + return "NPP_INTERPOLATION_ERROR"; + + case NPP_RESIZE_FACTOR_ERROR: + return "NPP_RESIZE_FACTOR_ERROR"; + + case NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR: + return "NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR"; + + +#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000 + + case NPP_MEMFREE_ERR: + return "NPP_MEMFREE_ERR"; + + case NPP_MEMSET_ERR: + return "NPP_MEMSET_ERR"; + + case NPP_MEMCPY_ERR: + return "NPP_MEMCPY_ERROR"; + + case NPP_MIRROR_FLIP_ERR: + return "NPP_MIRROR_FLIP_ERR"; +#else + + case NPP_MEMFREE_ERROR: + return "NPP_MEMFREE_ERROR"; + + case NPP_MEMSET_ERROR: + return "NPP_MEMSET_ERROR"; + + case NPP_MEMCPY_ERROR: + return "NPP_MEMCPY_ERROR"; + + case NPP_MIRROR_FLIP_ERROR: + return "NPP_MIRROR_FLIP_ERROR"; +#endif + + case NPP_ALIGNMENT_ERROR: + return "NPP_ALIGNMENT_ERROR"; + + case NPP_STEP_ERROR: + return "NPP_STEP_ERROR"; + + case NPP_SIZE_ERROR: + return "NPP_SIZE_ERROR"; + + case NPP_NULL_POINTER_ERROR: + return "NPP_NULL_POINTER_ERROR"; + + case NPP_CUDA_KERNEL_EXECUTION_ERROR: + return "NPP_CUDA_KERNEL_EXECUTION_ERROR"; + + case NPP_NOT_IMPLEMENTED_ERROR: + return "NPP_NOT_IMPLEMENTED_ERROR"; + + case NPP_ERROR: + return "NPP_ERROR"; + + case NPP_SUCCESS: + return "NPP_SUCCESS"; + + case NPP_WRONG_INTERSECTION_QUAD_WARNING: + return "NPP_WRONG_INTERSECTION_QUAD_WARNING"; + + case NPP_MISALIGNED_DST_ROI_WARNING: + return "NPP_MISALIGNED_DST_ROI_WARNING"; + + case NPP_AFFINE_QUAD_INCORRECT_WARNING: + return "NPP_AFFINE_QUAD_INCORRECT_WARNING"; + + case NPP_DOUBLE_SIZE_WARNING: + return "NPP_DOUBLE_SIZE_WARNING"; + + case NPP_WRONG_INTERSECTION_ROI_WARNING: + return "NPP_WRONG_INTERSECTION_ROI_WARNING"; + +#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) >= 0x6000 + /* These are 6.0 or higher */ + case NPP_LUT_PALETTE_BITSIZE_ERROR: + return "NPP_LUT_PALETTE_BITSIZE_ERROR"; + + case NPP_ZC_MODE_NOT_SUPPORTED_ERROR: + return "NPP_ZC_MODE_NOT_SUPPORTED_ERROR"; + + case NPP_QUALITY_INDEX_ERROR: + return "NPP_QUALITY_INDEX_ERROR"; + + case NPP_CHANNEL_ORDER_ERROR: + return "NPP_CHANNEL_ORDER_ERROR"; + + case NPP_ZERO_MASK_VALUE_ERROR: + return "NPP_ZERO_MASK_VALUE_ERROR"; + + case NPP_NUMBER_OF_CHANNELS_ERROR: + return "NPP_NUMBER_OF_CHANNELS_ERROR"; + + case NPP_COI_ERROR: + return "NPP_COI_ERROR"; + + case NPP_DIVISOR_ERROR: + return "NPP_DIVISOR_ERROR"; + + case NPP_CHANNEL_ERROR: + return "NPP_CHANNEL_ERROR"; + + case NPP_STRIDE_ERROR: + return "NPP_STRIDE_ERROR"; + + case NPP_ANCHOR_ERROR: + return "NPP_ANCHOR_ERROR"; + + case NPP_MASK_SIZE_ERROR: + return "NPP_MASK_SIZE_ERROR"; + + case NPP_MOMENT_00_ZERO_ERROR: + return "NPP_MOMENT_00_ZERO_ERROR"; + + case NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR: + return "NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR"; + + case NPP_THRESHOLD_ERROR: + return "NPP_THRESHOLD_ERROR"; + + case NPP_CONTEXT_MATCH_ERROR: + return "NPP_CONTEXT_MATCH_ERROR"; + + case NPP_FFT_FLAG_ERROR: + return "NPP_FFT_FLAG_ERROR"; + + case NPP_FFT_ORDER_ERROR: + return "NPP_FFT_ORDER_ERROR"; + + case NPP_SCALE_RANGE_ERROR: + return "NPP_SCALE_RANGE_ERROR"; + + case NPP_DATA_TYPE_ERROR: + return "NPP_DATA_TYPE_ERROR"; + + case NPP_OUT_OFF_RANGE_ERROR: + return "NPP_OUT_OFF_RANGE_ERROR"; + + case NPP_DIVIDE_BY_ZERO_ERROR: + return "NPP_DIVIDE_BY_ZERO_ERROR"; + + case NPP_RANGE_ERROR: + return "NPP_RANGE_ERROR"; + + case NPP_NO_MEMORY_ERROR: + return "NPP_NO_MEMORY_ERROR"; + + case NPP_ERROR_RESERVED: + return "NPP_ERROR_RESERVED"; + + case NPP_NO_OPERATION_WARNING: + return "NPP_NO_OPERATION_WARNING"; + + case NPP_DIVIDE_BY_ZERO_WARNING: + return "NPP_DIVIDE_BY_ZERO_WARNING"; +#endif + + } + + return ""; +} +#endif + +#ifdef __DRIVER_TYPES_H__ +#ifndef DEVICE_RESET +#define DEVICE_RESET cudaDeviceReset(); +#endif +#else +#ifndef DEVICE_RESET +#define DEVICE_RESET +#endif +#endif + +template< typename T > +void check(T result, char const *const func, const char *const file, int const line) +{ + if (result) + { + fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", + file, line, static_cast(result), _cudaGetErrorEnum(result), func); + DEVICE_RESET + // Make sure we call CUDA Device Reset before exiting + exit(EXIT_FAILURE); + } +} + +#ifdef __DRIVER_TYPES_H__ +// This will output the proper CUDA error strings in the event that a CUDA host call returns an error +#define checkCudaErrors(val) check ( (val), #val, __FILE__, __LINE__ ) + +// This will output the proper error string when calling cudaGetLastError +#define getLastCudaError(msg) __getLastCudaError (msg, __FILE__, __LINE__) + +inline void __getLastCudaError(const char *errorMessage, const char *file, const int line) +{ + cudaError_t err = cudaGetLastError(); + + if (cudaSuccess != err) + { + fprintf(stderr, "%s(%i) : getLastCudaError() CUDA error : %s : (%d) %s.\n", + file, line, errorMessage, (int)err, cudaGetErrorString(err)); + DEVICE_RESET + exit(EXIT_FAILURE); + } +} +#endif + +#ifndef MAX +#define MAX(a,b) (a > b ? a : b) +#endif + +// Float To Int conversion +inline int ftoi(float value) +{ + return (value >= 0 ? (int)(value + 0.5) : (int)(value - 0.5)); +} + +// Beginning of GPU Architecture definitions +inline int _ConvertSMVer2Cores(int major, int minor) +{ + // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM + typedef struct + { + int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM minor version + int Cores; + } sSMtoCores; + + sSMtoCores nGpuArchCoresPerSM[] = + { + { 0x20, 32 }, // Fermi Generation (SM 2.0) GF100 class + { 0x21, 48 }, // Fermi Generation (SM 2.1) GF10x class + { 0x30, 192}, // Kepler Generation (SM 3.0) GK10x class + { 0x32, 192}, // Kepler Generation (SM 3.2) GK10x class + { 0x35, 192}, // Kepler Generation (SM 3.5) GK11x class + { 0x37, 192}, // Kepler Generation (SM 3.7) GK21x class + { 0x50, 128}, // Maxwell Generation (SM 5.0) GM10x class + { 0x52, 128}, // Maxwell Generation (SM 5.2) GM20x class + { -1, -1 } + }; + + int index = 0; + + while (nGpuArchCoresPerSM[index].SM != -1) + { + if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor)) + { + return nGpuArchCoresPerSM[index].Cores; + } + + index++; + } + + // If we don't find the values, we default use the previous one to run properly + printf("MapSMtoCores for SM %d.%d is undefined. Default to use %d Cores/SM\n", major, minor, nGpuArchCoresPerSM[index-1].Cores); + return nGpuArchCoresPerSM[index-1].Cores; +} +// end of GPU Architecture definitions + +#ifdef __CUDA_RUNTIME_H__ +// General GPU Device CUDA Initialization +inline int gpuDeviceInit(int devID) +{ + int device_count; + checkCudaErrors(cudaGetDeviceCount(&device_count)); + + if (device_count == 0) + { + fprintf(stderr, "gpuDeviceInit() CUDA error: no devices supporting CUDA.\n"); + exit(EXIT_FAILURE); + } + + if (devID < 0) + { + devID = 0; + } + + if (devID > device_count-1) + { + fprintf(stderr, "\n"); + fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n", device_count); + fprintf(stderr, ">> gpuDeviceInit (-device=%d) is not a valid GPU device. <<\n", devID); + fprintf(stderr, "\n"); + return -devID; + } + + cudaDeviceProp deviceProp; + checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID)); + + if (deviceProp.computeMode == cudaComputeModeProhibited) + { + fprintf(stderr, "Error: device is running in , no threads can use ::cudaSetDevice().\n"); + return -1; + } + + if (deviceProp.major < 1) + { + fprintf(stderr, "gpuDeviceInit(): GPU device does not support CUDA.\n"); + exit(EXIT_FAILURE); + } + + checkCudaErrors(cudaSetDevice(devID)); + printf("gpuDeviceInit() CUDA Device [%d]: \"%s\n", devID, deviceProp.name); + + return devID; +} + +// This function returns the best GPU (with maximum GFLOPS) +inline int gpuGetMaxGflopsDeviceId() +{ + int current_device = 0, sm_per_multiproc = 0; + int max_perf_device = 0; + int device_count = 0, best_SM_arch = 0; + int devices_prohibited = 0; + + unsigned long long max_compute_perf = 0; + cudaDeviceProp deviceProp; + cudaGetDeviceCount(&device_count); + + checkCudaErrors(cudaGetDeviceCount(&device_count)); + + if (device_count == 0) + { + fprintf(stderr, "gpuGetMaxGflopsDeviceId() CUDA error: no devices supporting CUDA.\n"); + exit(EXIT_FAILURE); + } + + // Find the best major SM Architecture GPU device + while (current_device < device_count) + { + cudaGetDeviceProperties(&deviceProp, current_device); + + // If this GPU is not running on Compute Mode prohibited, then we can add it to the list + if (deviceProp.computeMode != cudaComputeModeProhibited) + { + if (deviceProp.major > 0 && deviceProp.major < 9999) + { + best_SM_arch = MAX(best_SM_arch, deviceProp.major); + } + } + else + { + devices_prohibited++; + } + + current_device++; + } + + if (devices_prohibited == device_count) + { + fprintf(stderr, "gpuGetMaxGflopsDeviceId() CUDA error: all devices have compute mode prohibited.\n"); + exit(EXIT_FAILURE); + } + + // Find the best CUDA capable GPU device + current_device = 0; + + while (current_device < device_count) + { + cudaGetDeviceProperties(&deviceProp, current_device); + + // If this GPU is not running on Compute Mode prohibited, then we can add it to the list + if (deviceProp.computeMode != cudaComputeModeProhibited) + { + if (deviceProp.major == 9999 && deviceProp.minor == 9999) + { + sm_per_multiproc = 1; + } + else + { + sm_per_multiproc = _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor); + } + + unsigned long long compute_perf = (unsigned long long) deviceProp.multiProcessorCount * sm_per_multiproc * deviceProp.clockRate; + + if (compute_perf > max_compute_perf) + { + // If we find GPU with SM major > 2, search only these + if (best_SM_arch > 2) + { + // If our device==dest_SM_arch, choose this, or else pass + if (deviceProp.major == best_SM_arch) + { + max_compute_perf = compute_perf; + max_perf_device = current_device; + } + } + else + { + max_compute_perf = compute_perf; + max_perf_device = current_device; + } + } + } + + ++current_device; + } + + return max_perf_device; +} + + +// Initialization code to find the best CUDA Device +inline int findCudaDevice(int argc, const char **argv) +{ + cudaDeviceProp deviceProp; + int devID = 0; + + // If the command-line has a device number specified, use it + if (checkCmdLineFlag(argc, argv, "device")) + { + devID = getCmdLineArgumentInt(argc, argv, "device="); + + if (devID < 0) + { + printf("Invalid command line parameter\n "); + exit(EXIT_FAILURE); + } + else + { + devID = gpuDeviceInit(devID); + + if (devID < 0) + { + printf("exiting...\n"); + exit(EXIT_FAILURE); + } + } + } + else + { + // Otherwise pick the device with highest Gflops/s + devID = gpuGetMaxGflopsDeviceId(); + checkCudaErrors(cudaSetDevice(devID)); + checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID)); + printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProp.name, deviceProp.major, deviceProp.minor); + } + + return devID; +} + +// General check for CUDA GPU SM Capabilities +inline bool checkCudaCapabilities(int major_version, int minor_version) +{ + cudaDeviceProp deviceProp; + deviceProp.major = 0; + deviceProp.minor = 0; + int dev; + + checkCudaErrors(cudaGetDevice(&dev)); + checkCudaErrors(cudaGetDeviceProperties(&deviceProp, dev)); + + if ((deviceProp.major > major_version) || + (deviceProp.major == major_version && deviceProp.minor >= minor_version)) + { + printf(" Device %d: <%16s >, Compute SM %d.%d detected\n", dev, deviceProp.name, deviceProp.major, deviceProp.minor); + return true; + } + else + { + printf(" No GPU device was found that can support CUDA compute capability %d.%d.\n", major_version, minor_version); + return false; + } +} +#endif + +// end of CUDA Helper Functions + + +#endif diff --git a/src/algorithms/tracking/libs/cudahelpers/helper_cuda_drvapi.h b/src/algorithms/tracking/libs/cudahelpers/helper_cuda_drvapi.h new file mode 100644 index 000000000..8112ec9b1 --- /dev/null +++ b/src/algorithms/tracking/libs/cudahelpers/helper_cuda_drvapi.h @@ -0,0 +1,517 @@ +/** + * Copyright 1993-2013 NVIDIA Corporation. All rights reserved. + * + * Please refer to the NVIDIA end user license agreement (EULA) associated + * with this source code for terms and conditions that govern your use of + * this software. Any use, reproduction, disclosure, or distribution of + * this software and related documentation outside the terms of the EULA + * is strictly prohibited. + * + */ + +// Helper functions for CUDA Driver API error handling (make sure that CUDA_H is included in your projects) +#ifndef HELPER_CUDA_DRVAPI_H +#define HELPER_CUDA_DRVAPI_H + +#include +#include +#include + +#include +#include + +#ifndef MAX +#define MAX(a,b) (a > b ? a : b) +#endif + +#ifndef HELPER_CUDA_H +inline int ftoi(float value) +{ + return (value >= 0 ? (int)(value + 0.5) : (int)(value - 0.5)); +} +#endif + +#ifndef EXIT_WAIVED +#define EXIT_WAIVED 2 +#endif + +//////////////////////////////////////////////////////////////////////////////// +// These are CUDA Helper functions + +// add a level of protection to the CUDA SDK samples, let's force samples to explicitly include CUDA.H +#ifdef __cuda_cuda_h__ +// This will output the proper CUDA error strings in the event that a CUDA host call returns an error +#ifndef checkCudaErrors +#define checkCudaErrors(err) __checkCudaErrors (err, __FILE__, __LINE__) + +// These are the inline versions for all of the SDK helper functions +inline void __checkCudaErrors(CUresult err, const char *file, const int line) +{ + if (CUDA_SUCCESS != err) + { + fprintf(stderr, "checkCudaErrors() Driver API error = %04d \"%s\" from file <%s>, line %i.\n", + err, getCudaDrvErrorString(err), file, line); + exit(EXIT_FAILURE); + } +} +#endif + +#ifdef getLastCudaDrvErrorMsg +#undef getLastCudaDrvErrorMsg +#endif + +#define getLastCudaDrvErrorMsg(msg) __getLastCudaDrvErrorMsg (msg, __FILE__, __LINE__) + +inline void __getLastCudaDrvErrorMsg(const char *msg, const char *file, const int line) +{ + CUresult err = cuCtxSynchronize(); + + if (CUDA_SUCCESS != err) + { + fprintf(stderr, "getLastCudaDrvErrorMsg -> %s", msg); + fprintf(stderr, "getLastCudaDrvErrorMsg -> cuCtxSynchronize API error = %04d \"%s\" in file <%s>, line %i.\n", + err, getCudaDrvErrorString(err), file, line); + exit(EXIT_FAILURE); + } +} + +// This function wraps the CUDA Driver API into a template function +template +inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute, int device) +{ + CUresult error_result = cuDeviceGetAttribute(attribute, device_attribute, device); + + if (error_result != CUDA_SUCCESS) + { + printf("cuDeviceGetAttribute returned %d\n-> %s\n", (int)error_result, getCudaDrvErrorString(error_result)); + exit(EXIT_SUCCESS); + } +} +#endif + +// Beginning of GPU Architecture definitions +inline int _ConvertSMVer2CoresDRV(int major, int minor) +{ + // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM + typedef struct + { + int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM minor version + int Cores; + } sSMtoCores; + + sSMtoCores nGpuArchCoresPerSM[] = + { + { 0x20, 32 }, // Fermi Generation (SM 2.0) GF100 class + { 0x21, 48 }, // Fermi Generation (SM 2.1) GF10x class + { 0x30, 192}, // Kepler Generation (SM 3.0) GK10x class + { 0x32, 192}, // Kepler Generation (SM 3.2) GK10x class + { 0x35, 192}, // Kepler Generation (SM 3.5) GK11x class + { 0x37, 192}, // Kepler Generation (SM 3.7) GK21x class + { 0x50, 128}, // Maxwell Generation (SM 5.0) GM10x class + { 0x52, 128}, // Maxwell Generation (SM 5.2) GM20x class + { -1, -1 } + }; + + int index = 0; + + while (nGpuArchCoresPerSM[index].SM != -1) + { + if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor)) + { + return nGpuArchCoresPerSM[index].Cores; + } + + index++; + } + + // If we don't find the values, we default use the previous one to run properly + printf("MapSMtoCores for SM %d.%d is undefined. Default to use %d Cores/SM\n", major, minor, nGpuArchCoresPerSM[index-1].Cores); + return nGpuArchCoresPerSM[index-1].Cores; +} +// end of GPU Architecture definitions + +#ifdef __cuda_cuda_h__ +// General GPU Device CUDA Initialization +inline int gpuDeviceInitDRV(int ARGC, const char **ARGV) +{ + int cuDevice = 0; + int deviceCount = 0; + CUresult err = cuInit(0); + + if (CUDA_SUCCESS == err) + { + checkCudaErrors(cuDeviceGetCount(&deviceCount)); + } + + if (deviceCount == 0) + { + fprintf(stderr, "cudaDeviceInit error: no devices supporting CUDA\n"); + exit(EXIT_FAILURE); + } + + int dev = 0; + dev = getCmdLineArgumentInt(ARGC, (const char **) ARGV, "device="); + + if (dev < 0) + { + dev = 0; + } + + if (dev > deviceCount-1) + { + fprintf(stderr, "\n"); + fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n", deviceCount); + fprintf(stderr, ">> cudaDeviceInit (-device=%d) is not a valid GPU device. <<\n", dev); + fprintf(stderr, "\n"); + return -dev; + } + + checkCudaErrors(cuDeviceGet(&cuDevice, dev)); + char name[100]; + cuDeviceGetName(name, 100, cuDevice); + + int computeMode; + getCudaAttribute(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev); + + if (computeMode == CU_COMPUTEMODE_PROHIBITED) + { + fprintf(stderr, "Error: device is running in , no threads can use this CUDA Device.\n"); + return -1; + } + + if (checkCmdLineFlag(ARGC, (const char **) ARGV, "quiet") == false) + { + printf("gpuDeviceInitDRV() Using CUDA Device [%d]: %s\n", dev, name); + } + + return dev; +} + +// This function returns the best GPU based on performance +inline int gpuGetMaxGflopsDeviceIdDRV() +{ + CUdevice current_device = 0; + CUdevice max_perf_device = 0; + int device_count = 0; + int sm_per_multiproc = 0; + unsigned long long max_compute_perf = 0; + int best_SM_arch = 0; + int major = 0; + int minor = 0; + int multiProcessorCount; + int clockRate; + int devices_prohibited = 0; + + cuInit(0); + checkCudaErrors(cuDeviceGetCount(&device_count)); + + if (device_count == 0) + { + fprintf(stderr, "gpuGetMaxGflopsDeviceIdDRV error: no devices supporting CUDA\n"); + exit(EXIT_FAILURE); + } + + // Find the best major SM Architecture GPU device + while (current_device < device_count) + { + checkCudaErrors(cuDeviceComputeCapability(&major, &minor, current_device)); + + if (major > 0 && major < 9999) + { + best_SM_arch = MAX(best_SM_arch, major); + } + + current_device++; + } + + // Find the best CUDA capable GPU device + current_device = 0; + + while (current_device < device_count) + { + checkCudaErrors(cuDeviceGetAttribute(&multiProcessorCount, + CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, + current_device)); + checkCudaErrors(cuDeviceGetAttribute(&clockRate, + CU_DEVICE_ATTRIBUTE_CLOCK_RATE, + current_device)); + checkCudaErrors(cuDeviceComputeCapability(&major, &minor, current_device)); + + int computeMode; + getCudaAttribute(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, current_device); + + if (computeMode != CU_COMPUTEMODE_PROHIBITED) + { + if (major == 9999 && minor == 9999) + { + sm_per_multiproc = 1; + } + else + { + sm_per_multiproc = _ConvertSMVer2CoresDRV(major, minor); + } + + unsigned long long compute_perf = (unsigned long long) (multiProcessorCount * sm_per_multiproc * clockRate); + + if (compute_perf > max_compute_perf) + { + // If we find GPU with SM major > 2, search only these + if (best_SM_arch > 2) + { + // If our device==dest_SM_arch, choose this, or else pass + if (major == best_SM_arch) + { + max_compute_perf = compute_perf; + max_perf_device = current_device; + } + } + else + { + max_compute_perf = compute_perf; + max_perf_device = current_device; + } + } + } + else + { + devices_prohibited++; + } + + ++current_device; + } + + if (devices_prohibited == device_count) + { + fprintf(stderr, "gpuGetMaxGflopsDeviceIdDRV error: all devices have compute mode prohibited.\n"); + exit(EXIT_FAILURE); + } + + return max_perf_device; +} + +// This function returns the best Graphics GPU based on performance +inline int gpuGetMaxGflopsGLDeviceIdDRV() +{ + CUdevice current_device = 0, max_perf_device = 0; + int device_count = 0, sm_per_multiproc = 0; + int max_compute_perf = 0, best_SM_arch = 0; + int major = 0, minor = 0, multiProcessorCount, clockRate; + int bTCC = 0; + int devices_prohibited = 0; + char deviceName[256]; + + cuInit(0); + checkCudaErrors(cuDeviceGetCount(&device_count)); + + if (device_count == 0) + { + fprintf(stderr, "gpuGetMaxGflopsGLDeviceIdDRV error: no devices supporting CUDA\n"); + exit(EXIT_FAILURE); + } + + // Find the best major SM Architecture GPU device that are graphics devices + while (current_device < device_count) + { + checkCudaErrors(cuDeviceGetName(deviceName, 256, current_device)); + checkCudaErrors(cuDeviceComputeCapability(&major, &minor, current_device)); + +#if CUDA_VERSION >= 3020 + checkCudaErrors(cuDeviceGetAttribute(&bTCC, CU_DEVICE_ATTRIBUTE_TCC_DRIVER, current_device)); +#else + + // Assume a Tesla GPU is running in TCC if we are running CUDA 3.1 + if (deviceName[0] == 'T') + { + bTCC = 1; + } + +#endif + + int computeMode; + getCudaAttribute(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, current_device); + + if (computeMode != CU_COMPUTEMODE_PROHIBITED) + { + if (!bTCC) + { + if (major > 0 && major < 9999) + { + best_SM_arch = MAX(best_SM_arch, major); + } + } + } + else + { + devices_prohibited++; + } + + current_device++; + } + + if (devices_prohibited == device_count) + { + fprintf(stderr, "gpuGetMaxGflopsGLDeviceIdDRV error: all devices have compute mode prohibited.\n"); + exit(EXIT_FAILURE); + } + + // Find the best CUDA capable GPU device + current_device = 0; + + while (current_device < device_count) + { + checkCudaErrors(cuDeviceGetAttribute(&multiProcessorCount, + CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, + current_device)); + checkCudaErrors(cuDeviceGetAttribute(&clockRate, + CU_DEVICE_ATTRIBUTE_CLOCK_RATE, + current_device)); + checkCudaErrors(cuDeviceComputeCapability(&major, &minor, current_device)); + +#if CUDA_VERSION >= 3020 + checkCudaErrors(cuDeviceGetAttribute(&bTCC, CU_DEVICE_ATTRIBUTE_TCC_DRIVER, current_device)); +#else + + // Assume a Tesla GPU is running in TCC if we are running CUDA 3.1 + if (deviceName[0] == 'T') + { + bTCC = 1; + } + +#endif + + int computeMode; + getCudaAttribute(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, current_device); + + if (computeMode != CU_COMPUTEMODE_PROHIBITED) + { + if (major == 9999 && minor == 9999) + { + sm_per_multiproc = 1; + } + else + { + sm_per_multiproc = _ConvertSMVer2CoresDRV(major, minor); + } + + // If this is a Tesla based GPU and SM 2.0, and TCC is disabled, this is a contendor + if (!bTCC) // Is this GPU running the TCC driver? If so we pass on this + { + int compute_perf = multiProcessorCount * sm_per_multiproc * clockRate; + + if (compute_perf > max_compute_perf) + { + // If we find GPU with SM major > 2, search only these + if (best_SM_arch > 2) + { + // If our device = dest_SM_arch, then we pick this one + if (major == best_SM_arch) + { + max_compute_perf = compute_perf; + max_perf_device = current_device; + } + } + else + { + max_compute_perf = compute_perf; + max_perf_device = current_device; + } + } + } + } + + ++current_device; + } + + return max_perf_device; +} + +// General initialization call to pick the best CUDA Device +inline CUdevice findCudaDeviceDRV(int argc, const char **argv) +{ + CUdevice cuDevice; + int devID = 0; + + // If the command-line has a device number specified, use it + if (checkCmdLineFlag(argc, (const char **)argv, "device")) + { + devID = gpuDeviceInitDRV(argc, argv); + + if (devID < 0) + { + printf("exiting...\n"); + exit(EXIT_SUCCESS); + } + } + else + { + // Otherwise pick the device with highest Gflops/s + char name[100]; + devID = gpuGetMaxGflopsDeviceIdDRV(); + checkCudaErrors(cuDeviceGet(&cuDevice, devID)); + cuDeviceGetName(name, 100, cuDevice); + printf("> Using CUDA Device [%d]: %s\n", devID, name); + } + + cuDeviceGet(&cuDevice, devID); + + return cuDevice; +} + +// This function will pick the best CUDA device available with OpenGL interop +inline CUdevice findCudaGLDeviceDRV(int argc, const char **argv) +{ + CUdevice cuDevice; + int devID = 0; + + // If the command-line has a device number specified, use it + if (checkCmdLineFlag(argc, (const char **)argv, "device")) + { + devID = gpuDeviceInitDRV(argc, (const char **)argv); + + if (devID < 0) + { + printf("no CUDA capable devices found, exiting...\n"); + exit(EXIT_SUCCESS); + } + } + else + { + char name[100]; + // Otherwise pick the device with highest Gflops/s + devID = gpuGetMaxGflopsGLDeviceIdDRV(); + checkCudaErrors(cuDeviceGet(&cuDevice, devID)); + cuDeviceGetName(name, 100, cuDevice); + printf("> Using CUDA/GL Device [%d]: %s\n", devID, name); + } + + return devID; +} + +// General check for CUDA GPU SM Capabilities +inline bool checkCudaCapabilitiesDRV(int major_version, int minor_version, int devID) +{ + CUdevice cuDevice; + char name[256]; + int major = 0, minor = 0; + + checkCudaErrors(cuDeviceGet(&cuDevice, devID)); + checkCudaErrors(cuDeviceGetName(name, 100, cuDevice)); + checkCudaErrors(cuDeviceComputeCapability(&major, &minor, devID)); + + if ((major > major_version) || + (major == major_version && minor >= minor_version)) + { + printf("> Device %d: <%16s >, Compute SM %d.%d detected\n", devID, name, major, minor); + return true; + } + else + { + printf("No GPU device was found that can support CUDA compute capability %d.%d.\n", major_version, minor_version); + return false; + } +} +#endif + +// end of CUDA Helper Functions + +#endif diff --git a/src/algorithms/tracking/libs/cudahelpers/helper_cuda_gl.h b/src/algorithms/tracking/libs/cudahelpers/helper_cuda_gl.h new file mode 100644 index 000000000..3d2d943a9 --- /dev/null +++ b/src/algorithms/tracking/libs/cudahelpers/helper_cuda_gl.h @@ -0,0 +1,165 @@ +/** + * Copyright 1993-2013 NVIDIA Corporation. All rights reserved. + * + * Please refer to the NVIDIA end user license agreement (EULA) associated + * with this source code for terms and conditions that govern your use of + * this software. Any use, reproduction, disclosure, or distribution of + * this software and related documentation outside the terms of the EULA + * is strictly prohibited. + * + */ + +#ifndef HELPER_CUDA_GL_H +#define HELPER_CUDA_GL_H + +#include +#include +#include + +// includes, graphics +#if defined (__APPLE__) || defined(MACOSX) +#include +#include +#else +#include +#include +#endif + +#ifndef EXIT_WAIVED +#define EXIT_WAIVED 2 +#endif + +#ifdef __DRIVER_TYPES_H__ +#ifndef DEVICE_RESET +#define DEVICE_RESET cudaDeviceReset() +#endif +#else +#ifndef DEVICE_RESET +#define DEVICE_RESET +#endif +#endif + +#ifdef __CUDA_GL_INTEROP_H__ +//////////////////////////////////////////////////////////////////////////////// +// These are CUDA OpenGL Helper functions + +inline int gpuGLDeviceInit(int ARGC, const char **ARGV) +{ + int deviceCount; + checkCudaErrors(cudaGetDeviceCount(&deviceCount)); + + if (deviceCount == 0) + { + fprintf(stderr, "CUDA error: no devices supporting CUDA.\n"); + exit(EXIT_FAILURE); + } + + int dev = 0; + dev = getCmdLineArgumentInt(ARGC, ARGV, "device="); + + if (dev < 0) + { + dev = 0; + } + + if (dev > deviceCount-1) + { + fprintf(stderr, "\n"); + fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n", deviceCount); + fprintf(stderr, ">> gpuGLDeviceInit (-device=%d) is not a valid GPU device. <<\n", dev); + fprintf(stderr, "\n"); + return -dev; + } + + cudaDeviceProp deviceProp; + checkCudaErrors(cudaGetDeviceProperties(&deviceProp, dev)); + + if (deviceProp.computeMode == cudaComputeModeProhibited) + { + fprintf(stderr, "Error: device is running in , no threads can use ::cudaSetDevice().\n"); + return -1; + } + + if (deviceProp.major < 1) + { + fprintf(stderr, "Error: device does not support CUDA.\n"); + exit(EXIT_FAILURE); + } + + if (checkCmdLineFlag(ARGC, ARGV, "quiet") == false) + { + fprintf(stderr, "Using device %d: %s\n", dev, deviceProp.name); + } + + checkCudaErrors(cudaGLSetGLDevice(dev)); + return dev; +} + +// This function will pick the best CUDA device available with OpenGL interop +inline int findCudaGLDevice(int argc, const char **argv) +{ + int devID = 0; + + // If the command-line has a device number specified, use it + if (checkCmdLineFlag(argc, (const char **)argv, "device")) + { + devID = gpuGLDeviceInit(argc, (const char **)argv); + + if (devID < 0) + { + printf("no CUDA capable devices found, exiting...\n"); + DEVICE_RESET + exit(EXIT_SUCCESS); + } + } + else + { + // Otherwise pick the device with highest Gflops/s + devID = gpuGetMaxGflopsDeviceId(); + cudaGLSetGLDevice(devID); + } + + return devID; +} + +//////////////////////////////////////////////////////////////////////////// +//! Check for OpenGL error +//! @return bool if no GL error has been encountered, otherwise 0 +//! @param file __FILE__ macro +//! @param line __LINE__ macro +//! @note The GL error is listed on stderr +//! @note This function should be used via the CHECK_ERROR_GL() macro +//////////////////////////////////////////////////////////////////////////// +inline bool +sdkCheckErrorGL(const char *file, const int line) +{ + bool ret_val = true; + + // check for error + GLenum gl_error = glGetError(); + + if (gl_error != GL_NO_ERROR) + { +#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) + char tmpStr[512]; + // NOTE: "%s(%i) : " allows Visual Studio to directly jump to the file at the right line + // when the user double clicks on the error line in the Output pane. Like any compile error. + sprintf_s(tmpStr, 255, "\n%s(%i) : GL Error : %s\n\n", file, line, gluErrorString(gl_error)); + fprintf(stderr, "%s", tmpStr); +#endif + fprintf(stderr, "GL Error in file '%s' in line %d :\n", file, line); + fprintf(stderr, "%s\n", gluErrorString(gl_error)); + ret_val = false; + } + + return ret_val; +} + +#define SDK_CHECK_ERROR_GL() \ + if( false == sdkCheckErrorGL( __FILE__, __LINE__)) { \ + DEVICE_RESET \ + exit(EXIT_FAILURE); \ + } +#endif + +#endif diff --git a/src/algorithms/tracking/libs/cudahelpers/helper_functions.h b/src/algorithms/tracking/libs/cudahelpers/helper_functions.h new file mode 100644 index 000000000..11538ba72 --- /dev/null +++ b/src/algorithms/tracking/libs/cudahelpers/helper_functions.h @@ -0,0 +1,42 @@ +/** + * Copyright 1993-2013 NVIDIA Corporation. All rights reserved. + * + * Please refer to the NVIDIA end user license agreement (EULA) associated + * with this source code for terms and conditions that govern your use of + * this software. Any use, reproduction, disclosure, or distribution of + * this software and related documentation outside the terms of the EULA + * is strictly prohibited. + * + */ + +// These are helper functions for the SDK samples (string parsing, timers, image helpers, etc) +#ifndef HELPER_FUNCTIONS_H +#define HELPER_FUNCTIONS_H + +#ifdef WIN32 +#pragma warning(disable:4996) +#endif + +// includes, project +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +// includes, timer, string parsing, image helpers +#include // helper functions for timers +#include // helper functions for string parsing +#include // helper functions for image compare, dump, data comparisons + +#ifndef EXIT_WAIVED +#define EXIT_WAIVED 2 +#endif + +#endif // HELPER_FUNCTIONS_H diff --git a/src/algorithms/tracking/libs/cudahelpers/helper_image.h b/src/algorithms/tracking/libs/cudahelpers/helper_image.h new file mode 100644 index 000000000..4e8b25cdf --- /dev/null +++ b/src/algorithms/tracking/libs/cudahelpers/helper_image.h @@ -0,0 +1,1110 @@ +/** + * Copyright 1993-2013 NVIDIA Corporation. All rights reserved. + * + * Please refer to the NVIDIA end user license agreement (EULA) associated + * with this source code for terms and conditions that govern your use of + * this software. Any use, reproduction, disclosure, or distribution of + * this software and related documentation outside the terms of the EULA + * is strictly prohibited. + * + */ + +// These are helper functions for the SDK samples (image,bitmap) +#ifndef HELPER_IMAGE_H +#define HELPER_IMAGE_H + +#include +#include +#include +#include +#include + +#include +#include +#include + +#ifndef MIN +#define MIN(a,b) ((a < b) ? a : b) +#endif +#ifndef MAX +#define MAX(a,b) ((a > b) ? a : b) +#endif + +#ifndef EXIT_WAIVED +#define EXIT_WAIVED 2 +#endif + +#include + +// namespace unnamed (internal) +namespace +{ + //! size of PGM file header + const unsigned int PGMHeaderSize = 0x40; + + // types + + //! Data converter from unsigned char / unsigned byte to type T + template + struct ConverterFromUByte; + + //! Data converter from unsigned char / unsigned byte + template<> + struct ConverterFromUByte + { + //! Conversion operator + //! @return converted value + //! @param val value to convert + float operator()(const unsigned char &val) + { + return static_cast(val); + } + }; + + //! Data converter from unsigned char / unsigned byte to float + template<> + struct ConverterFromUByte + { + //! Conversion operator + //! @return converted value + //! @param val value to convert + float operator()(const unsigned char &val) + { + return static_cast(val) / 255.0f; + } + }; + + //! Data converter from unsigned char / unsigned byte to type T + template + struct ConverterToUByte; + + //! Data converter from unsigned char / unsigned byte to unsigned int + template<> + struct ConverterToUByte + { + //! Conversion operator (essentially a passthru + //! @return converted value + //! @param val value to convert + unsigned char operator()(const unsigned char &val) + { + return val; + } + }; + + //! Data converter from unsigned char / unsigned byte to unsigned int + template<> + struct ConverterToUByte + { + //! Conversion operator + //! @return converted value + //! @param val value to convert + unsigned char operator()(const float &val) + { + return static_cast(val * 255.0f); + } + }; +} + +#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) +#ifndef FOPEN +#define FOPEN(fHandle,filename,mode) fopen_s(&fHandle, filename, mode) +#endif +#ifndef FOPEN_FAIL +#define FOPEN_FAIL(result) (result != 0) +#endif +#ifndef SSCANF +#define SSCANF sscanf_s +#endif +#else +#ifndef FOPEN +#define FOPEN(fHandle,filename,mode) (fHandle = fopen(filename, mode)) +#endif +#ifndef FOPEN_FAIL +#define FOPEN_FAIL(result) (result == NULL) +#endif +#ifndef SSCANF +#define SSCANF sscanf +#endif +#endif + +inline bool +__loadPPM(const char *file, unsigned char **data, + unsigned int *w, unsigned int *h, unsigned int *channels) +{ + FILE *fp = NULL; + + if (FOPEN_FAIL(FOPEN(fp, file, "rb"))) + { + std::cerr << "__LoadPPM() : Failed to open file: " << file << std::endl; + return false; + } + + // check header + char header[PGMHeaderSize]; + + if (fgets(header, PGMHeaderSize, fp) == NULL) + { + std::cerr << "__LoadPPM() : reading PGM header returned NULL" << std::endl; + return false; + } + + if (strncmp(header, "P5", 2) == 0) + { + *channels = 1; + } + else if (strncmp(header, "P6", 2) == 0) + { + *channels = 3; + } + else + { + std::cerr << "__LoadPPM() : File is not a PPM or PGM image" << std::endl; + *channels = 0; + return false; + } + + // parse header, read maxval, width and height + unsigned int width = 0; + unsigned int height = 0; + unsigned int maxval = 0; + unsigned int i = 0; + + while (i < 3) + { + if (fgets(header, PGMHeaderSize, fp) == NULL) + { + std::cerr << "__LoadPPM() : reading PGM header returned NULL" << std::endl; + return false; + } + + if (header[0] == '#') + { + continue; + } + + if (i == 0) + { + i += SSCANF(header, "%u %u %u", &width, &height, &maxval); + } + else if (i == 1) + { + i += SSCANF(header, "%u %u", &height, &maxval); + } + else if (i == 2) + { + i += SSCANF(header, "%u", &maxval); + } + } + + // check if given handle for the data is initialized + if (NULL != *data) + { + if (*w != width || *h != height) + { + std::cerr << "__LoadPPM() : Invalid image dimensions." << std::endl; + } + } + else + { + *data = (unsigned char *) malloc(sizeof(unsigned char) * width * height **channels); + *w = width; + *h = height; + } + + // read and close file + if (fread(*data, sizeof(unsigned char), width * height **channels, fp) == 0) + { + std::cerr << "__LoadPPM() read data returned error." << std::endl; + } + + fclose(fp); + + return true; +} + +template +inline bool +sdkLoadPGM(const char *file, T **data, unsigned int *w, unsigned int *h) +{ + unsigned char *idata = NULL; + unsigned int channels; + + if (true != __loadPPM(file, &idata, w, h, &channels)) + { + return false; + } + + unsigned int size = *w **h * channels; + + // initialize mem if necessary + // the correct size is checked / set in loadPGMc() + if (NULL == *data) + { + *data = (T *) malloc(sizeof(T) * size); + } + + // copy and cast data + std::transform(idata, idata + size, *data, ConverterFromUByte()); + + free(idata); + + return true; +} + +template +inline bool +sdkLoadPPM4(const char *file, T **data, + unsigned int *w,unsigned int *h) +{ + unsigned char *idata = 0; + unsigned int channels; + + if (__loadPPM(file, &idata, w, h, &channels)) + { + // pad 4th component + int size = *w **h; + // keep the original pointer + unsigned char *idata_orig = idata; + *data = (T *) malloc(sizeof(T) * size * 4); + unsigned char *ptr = *data; + + for (int i=0; i 0); + assert(h > 0); + + std::fstream fh(file, std::fstream::out | std::fstream::binary); + + if (fh.bad()) + { + std::cerr << "__savePPM() : Opening file failed." << std::endl; + return false; + } + + if (channels == 1) + { + fh << "P5\n"; + } + else if (channels == 3) + { + fh << "P6\n"; + } + else + { + std::cerr << "__savePPM() : Invalid number of channels." << std::endl; + return false; + } + + fh << w << "\n" << h << "\n" << 0xff << std::endl; + + for (unsigned int i = 0; (i < (w*h*channels)) && fh.good(); ++i) + { + fh << data[i]; + } + + fh.flush(); + + if (fh.bad()) + { + std::cerr << "__savePPM() : Writing data failed." << std::endl; + return false; + } + + fh.close(); + + return true; +} + +template +inline bool +sdkSavePGM(const char *file, T *data, unsigned int w, unsigned int h) +{ + unsigned int size = w * h; + unsigned char *idata = + (unsigned char *) malloc(sizeof(unsigned char) * size); + + std::transform(data, data + size, idata, ConverterToUByte()); + + // write file + bool result = __savePPM(file, idata, w, h, 1); + + // cleanup + free(idata); + + return result; +} + +inline bool +sdkSavePPM4ub(const char *file, unsigned char *data, + unsigned int w, unsigned int h) +{ + // strip 4th component + int size = w * h; + unsigned char *ndata = (unsigned char *) malloc(sizeof(unsigned char) * size*3); + unsigned char *ptr = ndata; + + for (int i=0; i +inline bool +sdkReadFile(const char *filename, T **data, unsigned int *len, bool verbose) +{ + // check input arguments + assert(NULL != filename); + assert(NULL != len); + + // intermediate storage for the data read + std::vector data_read; + + // open file for reading + FILE *fh = NULL; + + // check if filestream is valid + if (FOPEN_FAIL(FOPEN(fh, filename, "r"))) + { + printf("Unable to open input file: %s\n", filename); + return false; + } + + // read all data elements + T token; + + while (!feof(fh)) + { + fscanf(fh, "%f", &token); + data_read.push_back(token); + } + + // the last element is read twice + data_read.pop_back(); + fclose(fh); + + // check if the given handle is already initialized + if (NULL != *data) + { + if (*len != data_read.size()) + { + std::cerr << "sdkReadFile() : Initialized memory given but " + << "size mismatch with signal read " + << "(data read / data init = " << (unsigned int)data_read.size() + << " / " << *len << ")" << std::endl; + + return false; + } + } + else + { + // allocate storage for the data read + *data = (T *) malloc(sizeof(T) * data_read.size()); + // store signal size + *len = static_cast(data_read.size()); + } + + // copy data + memcpy(*data, &data_read.front(), sizeof(T) * data_read.size()); + + return true; +} + +////////////////////////////////////////////////////////////////////////////// +//! Read file \filename and return the data +//! @return bool if reading the file succeeded, otherwise false +//! @param filename name of the source file +//! @param data uninitialized pointer, returned initialized and pointing to +//! the data read +//! @param len number of data elements in data, -1 on error +////////////////////////////////////////////////////////////////////////////// +template +inline bool +sdkReadFileBlocks(const char *filename, T **data, unsigned int *len, unsigned int block_num, unsigned int block_size, bool verbose) +{ + // check input arguments + assert(NULL != filename); + assert(NULL != len); + + // open file for reading + FILE *fh = fopen(filename, "rb"); + + if (fh == NULL && verbose) + { + std::cerr << "sdkReadFile() : Opening file failed." << std::endl; + return false; + } + + // check if the given handle is already initialized + // allocate storage for the data read + data[block_num] = (T *) malloc(block_size); + + // read all data elements + fseek(fh, block_num * block_size, SEEK_SET); + *len = fread(data[block_num], sizeof(T), block_size/sizeof(T), fh); + + fclose(fh); + + return true; +} + +////////////////////////////////////////////////////////////////////////////// +//! Write a data file \filename +//! @return true if writing the file succeeded, otherwise false +//! @param filename name of the source file +//! @param data data to write +//! @param len number of data elements in data, -1 on error +//! @param epsilon epsilon for comparison +////////////////////////////////////////////////////////////////////////////// +template +inline bool +sdkWriteFile(const char *filename, const T *data, unsigned int len, + const S epsilon, bool verbose, bool append = false) +{ + assert(NULL != filename); + assert(NULL != data); + + // open file for writing + // if (append) { + std::fstream fh(filename, std::fstream::out | std::fstream::ate); + + if (verbose) + { + std::cerr << "sdkWriteFile() : Open file " << filename << " for write/append." << std::endl; + } + + /* } else { + std::fstream fh(filename, std::fstream::out); + if (verbose) { + std::cerr << "sdkWriteFile() : Open file " << filename << " for write." << std::endl; + } + } + */ + + // check if filestream is valid + if (! fh.good()) + { + if (verbose) + { + std::cerr << "sdkWriteFile() : Opening file failed." << std::endl; + } + + return false; + } + + // first write epsilon + fh << "# " << epsilon << "\n"; + + // write data + for (unsigned int i = 0; (i < len) && (fh.good()); ++i) + { + fh << data[i] << ' '; + } + + // Check if writing succeeded + if (! fh.good()) + { + if (verbose) + { + std::cerr << "sdkWriteFile() : Writing file failed." << std::endl; + } + + return false; + } + + // file ends with nl + fh << std::endl; + + return true; +} + +////////////////////////////////////////////////////////////////////////////// +//! Compare two arrays of arbitrary type +//! @return true if \a reference and \a data are identical, otherwise false +//! @param reference timer_interface to the reference data / gold image +//! @param data handle to the computed data +//! @param len number of elements in reference and data +//! @param epsilon epsilon to use for the comparison +////////////////////////////////////////////////////////////////////////////// +template +inline bool +compareData(const T *reference, const T *data, const unsigned int len, + const S epsilon, const float threshold) +{ + assert(epsilon >= 0); + + bool result = true; + unsigned int error_count = 0; + + for (unsigned int i = 0; i < len; ++i) + { + float diff = (float)reference[i] - (float)data[i]; + bool comp = (diff <= epsilon) && (diff >= -epsilon); + result &= comp; + + error_count += !comp; + +#if 0 + + if (! comp) + { + std::cerr << "ERROR, i = " << i << ",\t " + << reference[i] << " / " + << data[i] + << " (reference / data)\n"; + } + +#endif + } + + if (threshold == 0.0f) + { + return (result) ? true : false; + } + else + { + if (error_count) + { + printf("%4.2f(%%) of bytes mismatched (count=%d)\n", (float)error_count*100/(float)len, error_count); + } + + return (len*threshold > error_count) ? true : false; + } +} + +#ifndef __MIN_EPSILON_ERROR +#define __MIN_EPSILON_ERROR 1e-3f +#endif + +////////////////////////////////////////////////////////////////////////////// +//! Compare two arrays of arbitrary type +//! @return true if \a reference and \a data are identical, otherwise false +//! @param reference handle to the reference data / gold image +//! @param data handle to the computed data +//! @param len number of elements in reference and data +//! @param epsilon epsilon to use for the comparison +//! @param epsilon threshold % of (# of bytes) for pass/fail +////////////////////////////////////////////////////////////////////////////// +template +inline bool +compareDataAsFloatThreshold(const T *reference, const T *data, const unsigned int len, + const S epsilon, const float threshold) +{ + assert(epsilon >= 0); + + // If we set epsilon to be 0, let's set a minimum threshold + float max_error = MAX((float)epsilon, __MIN_EPSILON_ERROR); + int error_count = 0; + bool result = true; + + for (unsigned int i = 0; i < len; ++i) + { + float diff = fabs((float)reference[i] - (float)data[i]); + bool comp = (diff < max_error); + result &= comp; + + if (! comp) + { + error_count++; +#if 0 + + if (error_count < 50) + { + printf("\n ERROR(epsilon=%4.3f), i=%d, (ref)0x%02x / (data)0x%02x / (diff)%d\n", + max_error, i, + *(unsigned int *)&reference[i], + *(unsigned int *)&data[i], + (unsigned int)diff); + } + +#endif + } + } + + if (threshold == 0.0f) + { + if (error_count) + { + printf("total # of errors = %d\n", error_count); + } + + return (error_count == 0) ? true : false; + } + else + { + if (error_count) + { + printf("%4.2f(%%) of bytes mismatched (count=%d)\n", (float)error_count*100/(float)len, error_count); + } + + return ((len*threshold > error_count) ? true : false); + } +} + +inline +void sdkDumpBin(void *data, unsigned int bytes, const char *filename) +{ + printf("sdkDumpBin: <%s>\n", filename); + FILE *fp; + FOPEN(fp, filename, "wb"); + fwrite(data, bytes, 1, fp); + fflush(fp); + fclose(fp); +} + +inline +bool sdkCompareBin2BinUint(const char *src_file, const char *ref_file, unsigned int nelements, const float epsilon, const float threshold, char *exec_path) +{ + unsigned int *src_buffer, *ref_buffer; + FILE *src_fp = NULL, *ref_fp = NULL; + + unsigned long error_count = 0; + size_t fsize = 0; + + if (FOPEN_FAIL(FOPEN(src_fp, src_file, "rb"))) + { + printf("compareBin2Bin unable to open src_file: %s\n", src_file); + error_count++; + } + + char *ref_file_path = sdkFindFilePath(ref_file, exec_path); + + if (ref_file_path == NULL) + { + printf("compareBin2Bin unable to find <%s> in <%s>\n", ref_file, exec_path); + printf(">>> Check info.xml and [project//data] folder <%s> <<<\n", ref_file); + printf("Aborting comparison!\n"); + printf(" FAILED\n"); + error_count++; + + if (src_fp) + { + fclose(src_fp); + } + + if (ref_fp) + { + fclose(ref_fp); + } + } + else + { + if (FOPEN_FAIL(FOPEN(ref_fp, ref_file_path, "rb"))) + { + printf("compareBin2Bin unable to open ref_file: %s\n", ref_file_path); + error_count++; + } + + if (src_fp && ref_fp) + { + src_buffer = (unsigned int *)malloc(nelements*sizeof(unsigned int)); + ref_buffer = (unsigned int *)malloc(nelements*sizeof(unsigned int)); + + fsize = fread(src_buffer, nelements, sizeof(unsigned int), src_fp); + fsize = fread(ref_buffer, nelements, sizeof(unsigned int), ref_fp); + + printf("> compareBin2Bin nelements=%d, epsilon=%4.2f, threshold=%4.2f\n", nelements, epsilon, threshold); + printf(" src_file <%s>, size=%d bytes\n", src_file, (int)fsize); + printf(" ref_file <%s>, size=%d bytes\n", ref_file_path, (int)fsize); + + if (!compareData(ref_buffer, src_buffer, nelements, epsilon, threshold)) + { + error_count++; + } + + fclose(src_fp); + fclose(ref_fp); + + free(src_buffer); + free(ref_buffer); + } + else + { + if (src_fp) + { + fclose(src_fp); + } + + if (ref_fp) + { + fclose(ref_fp); + } + } + } + + if (error_count == 0) + { + printf(" OK\n"); + } + else + { + printf(" FAILURE: %d errors...\n", (unsigned int)error_count); + } + + return (error_count == 0); // returns true if all pixels pass +} + +inline +bool sdkCompareBin2BinFloat(const char *src_file, const char *ref_file, unsigned int nelements, const float epsilon, const float threshold, char *exec_path) +{ + float *src_buffer, *ref_buffer; + FILE *src_fp = NULL, *ref_fp = NULL; + size_t fsize = 0; + + unsigned long error_count = 0; + + if (FOPEN_FAIL(FOPEN(src_fp, src_file, "rb"))) + { + printf("compareBin2Bin unable to open src_file: %s\n", src_file); + error_count = 1; + } + + char *ref_file_path = sdkFindFilePath(ref_file, exec_path); + + if (ref_file_path == NULL) + { + printf("compareBin2Bin unable to find <%s> in <%s>\n", ref_file, exec_path); + printf(">>> Check info.xml and [project//data] folder <%s> <<<\n", exec_path); + printf("Aborting comparison!\n"); + printf(" FAILED\n"); + error_count++; + + if (src_fp) + { + fclose(src_fp); + } + + if (ref_fp) + { + fclose(ref_fp); + } + } + else + { + if (FOPEN_FAIL(FOPEN(ref_fp, ref_file_path, "rb"))) + { + printf("compareBin2Bin unable to open ref_file: %s\n", ref_file_path); + error_count = 1; + } + + if (src_fp && ref_fp) + { + src_buffer = (float *)malloc(nelements*sizeof(float)); + ref_buffer = (float *)malloc(nelements*sizeof(float)); + + fsize = fread(src_buffer, nelements, sizeof(float), src_fp); + fsize = fread(ref_buffer, nelements, sizeof(float), ref_fp); + + printf("> compareBin2Bin nelements=%d, epsilon=%4.2f, threshold=%4.2f\n", nelements, epsilon, threshold); + printf(" src_file <%s>, size=%d bytes\n", src_file, (int)fsize); + printf(" ref_file <%s>, size=%d bytes\n", ref_file_path, (int)fsize); + + if (!compareDataAsFloatThreshold(ref_buffer, src_buffer, nelements, epsilon, threshold)) + { + error_count++; + } + + fclose(src_fp); + fclose(ref_fp); + + free(src_buffer); + free(ref_buffer); + } + else + { + if (src_fp) + { + fclose(src_fp); + } + + if (ref_fp) + { + fclose(ref_fp); + } + } + } + + if (error_count == 0) + { + printf(" OK\n"); + } + else + { + printf(" FAILURE: %d errors...\n", (unsigned int)error_count); + } + + return (error_count == 0); // returns true if all pixels pass +} + +inline bool +sdkCompareL2fe(const float *reference, const float *data, + const unsigned int len, const float epsilon) +{ + assert(epsilon >= 0); + + float error = 0; + float ref = 0; + + for (unsigned int i = 0; i < len; ++i) + { + + float diff = reference[i] - data[i]; + error += diff * diff; + ref += reference[i] * reference[i]; + } + + float normRef = sqrtf(ref); + + if (fabs(ref) < 1e-7) + { +#ifdef _DEBUG + std::cerr << "ERROR, reference l2-norm is 0\n"; +#endif + return false; + } + + float normError = sqrtf(error); + error = normError / normRef; + bool result = error < epsilon; +#ifdef _DEBUG + + if (! result) + { + std::cerr << "ERROR, l2-norm error " + << error << " is greater than epsilon " << epsilon << "\n"; + } + +#endif + + return result; +} + +inline bool +sdkLoadPPMub(const char *file, unsigned char **data, + unsigned int *w,unsigned int *h) +{ + unsigned int channels; + return __loadPPM(file, data, w, h, &channels); +} + +inline bool +sdkLoadPPM4ub(const char *file, unsigned char **data, + unsigned int *w, unsigned int *h) +{ + unsigned char *idata = 0; + unsigned int channels; + + if (__loadPPM(file, &idata, w, h, &channels)) + { + // pad 4th component + int size = *w **h; + // keep the original pointer + unsigned char *idata_orig = idata; + *data = (unsigned char *) malloc(sizeof(unsigned char) * size * 4); + unsigned char *ptr = *data; + + for (int i=0; i Compare (a)rendered: <" << src_file << ">\n"; + std::cerr << "> (b)reference: <" << ref_file << ">\n"; + } + + + if (sdkLoadPPM4ub(ref_file, &ref_data, &ref_width, &ref_height) != true) + { + if (verboseErrors) + { + std::cerr << "PPMvsPPM: unable to load ref image file: "<< ref_file << "\n"; + } + + return false; + } + + if (sdkLoadPPM4ub(src_file, &src_data, &src_width, &src_height) != true) + { + std::cerr << "PPMvsPPM: unable to load src image file: " << src_file << "\n"; + return false; + } + + if (src_height != ref_height || src_width != ref_width) + { + if (verboseErrors) std::cerr << "PPMvsPPM: source and ref size mismatch (" << src_width << + "," << src_height << ")vs(" << ref_width << "," << ref_height << ")\n"; + } + + if (verboseErrors) std::cerr << "PPMvsPPM: comparing images size (" << src_width << + "," << src_height << ") epsilon(" << epsilon << "), threshold(" << threshold*100 << "%)\n"; + + if (compareData(ref_data, src_data, src_width*src_height*4, epsilon, threshold) == false) + { + error_count=1; + } + + if (error_count == 0) + { + if (verboseErrors) + { + std::cerr << " OK\n\n"; + } + } + else + { + if (verboseErrors) + { + std::cerr << " FAILURE! "< Compare (a)rendered: <" << src_file << ">\n"; + std::cerr << "> (b)reference: <" << ref_file << ">\n"; + } + + + if (sdkLoadPPMub(ref_file, &ref_data, &ref_width, &ref_height) != true) + { + if (verboseErrors) + { + std::cerr << "PGMvsPGM: unable to load ref image file: "<< ref_file << "\n"; + } + + return false; + } + + if (sdkLoadPPMub(src_file, &src_data, &src_width, &src_height) != true) + { + std::cerr << "PGMvsPGM: unable to load src image file: " << src_file << "\n"; + return false; + } + + if (src_height != ref_height || src_width != ref_width) + { + if (verboseErrors) std::cerr << "PGMvsPGM: source and ref size mismatch (" << src_width << + "," << src_height << ")vs(" << ref_width << "," << ref_height << ")\n"; + } + + if (verboseErrors) std::cerr << "PGMvsPGM: comparing images size (" << src_width << + "," << src_height << ") epsilon(" << epsilon << "), threshold(" << threshold*100 << "%)\n"; + + if (compareData(ref_data, src_data, src_width*src_height, epsilon, threshold) == false) + { + error_count=1; + } + + if (error_count == 0) + { + if (verboseErrors) + { + std::cerr << " OK\n\n"; + } + } + else + { + if (verboseErrors) + { + std::cerr << " FAILURE! "< + +//////////////////////////////////////////////////////////////////////////////// +// host implementations of CUDA functions +//////////////////////////////////////////////////////////////////////////////// + +inline float fminf(float a, float b) +{ + return a < b ? a : b; +} + +inline float fmaxf(float a, float b) +{ + return a > b ? a : b; +} + +inline int max(int a, int b) +{ + return a > b ? a : b; +} + +inline int min(int a, int b) +{ + return a < b ? a : b; +} + +inline float rsqrtf(float x) +{ + return 1.0f / sqrtf(x); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// constructors +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float2 make_float2(float s) +{ + return make_float2(s, s); +} +inline __host__ __device__ float2 make_float2(float3 a) +{ + return make_float2(a.x, a.y); +} +inline __host__ __device__ float2 make_float2(int2 a) +{ + return make_float2(float(a.x), float(a.y)); +} +inline __host__ __device__ float2 make_float2(uint2 a) +{ + return make_float2(float(a.x), float(a.y)); +} + +inline __host__ __device__ int2 make_int2(int s) +{ + return make_int2(s, s); +} +inline __host__ __device__ int2 make_int2(int3 a) +{ + return make_int2(a.x, a.y); +} +inline __host__ __device__ int2 make_int2(uint2 a) +{ + return make_int2(int(a.x), int(a.y)); +} +inline __host__ __device__ int2 make_int2(float2 a) +{ + return make_int2(int(a.x), int(a.y)); +} + +inline __host__ __device__ uint2 make_uint2(uint s) +{ + return make_uint2(s, s); +} +inline __host__ __device__ uint2 make_uint2(uint3 a) +{ + return make_uint2(a.x, a.y); +} +inline __host__ __device__ uint2 make_uint2(int2 a) +{ + return make_uint2(uint(a.x), uint(a.y)); +} + +inline __host__ __device__ float3 make_float3(float s) +{ + return make_float3(s, s, s); +} +inline __host__ __device__ float3 make_float3(float2 a) +{ + return make_float3(a.x, a.y, 0.0f); +} +inline __host__ __device__ float3 make_float3(float2 a, float s) +{ + return make_float3(a.x, a.y, s); +} +inline __host__ __device__ float3 make_float3(float4 a) +{ + return make_float3(a.x, a.y, a.z); +} +inline __host__ __device__ float3 make_float3(int3 a) +{ + return make_float3(float(a.x), float(a.y), float(a.z)); +} +inline __host__ __device__ float3 make_float3(uint3 a) +{ + return make_float3(float(a.x), float(a.y), float(a.z)); +} + +inline __host__ __device__ int3 make_int3(int s) +{ + return make_int3(s, s, s); +} +inline __host__ __device__ int3 make_int3(int2 a) +{ + return make_int3(a.x, a.y, 0); +} +inline __host__ __device__ int3 make_int3(int2 a, int s) +{ + return make_int3(a.x, a.y, s); +} +inline __host__ __device__ int3 make_int3(uint3 a) +{ + return make_int3(int(a.x), int(a.y), int(a.z)); +} +inline __host__ __device__ int3 make_int3(float3 a) +{ + return make_int3(int(a.x), int(a.y), int(a.z)); +} + +inline __host__ __device__ uint3 make_uint3(uint s) +{ + return make_uint3(s, s, s); +} +inline __host__ __device__ uint3 make_uint3(uint2 a) +{ + return make_uint3(a.x, a.y, 0); +} +inline __host__ __device__ uint3 make_uint3(uint2 a, uint s) +{ + return make_uint3(a.x, a.y, s); +} +inline __host__ __device__ uint3 make_uint3(uint4 a) +{ + return make_uint3(a.x, a.y, a.z); +} +inline __host__ __device__ uint3 make_uint3(int3 a) +{ + return make_uint3(uint(a.x), uint(a.y), uint(a.z)); +} + +inline __host__ __device__ float4 make_float4(float s) +{ + return make_float4(s, s, s, s); +} +inline __host__ __device__ float4 make_float4(float3 a) +{ + return make_float4(a.x, a.y, a.z, 0.0f); +} +inline __host__ __device__ float4 make_float4(float3 a, float w) +{ + return make_float4(a.x, a.y, a.z, w); +} +inline __host__ __device__ float4 make_float4(int4 a) +{ + return make_float4(float(a.x), float(a.y), float(a.z), float(a.w)); +} +inline __host__ __device__ float4 make_float4(uint4 a) +{ + return make_float4(float(a.x), float(a.y), float(a.z), float(a.w)); +} + +inline __host__ __device__ int4 make_int4(int s) +{ + return make_int4(s, s, s, s); +} +inline __host__ __device__ int4 make_int4(int3 a) +{ + return make_int4(a.x, a.y, a.z, 0); +} +inline __host__ __device__ int4 make_int4(int3 a, int w) +{ + return make_int4(a.x, a.y, a.z, w); +} +inline __host__ __device__ int4 make_int4(uint4 a) +{ + return make_int4(int(a.x), int(a.y), int(a.z), int(a.w)); +} +inline __host__ __device__ int4 make_int4(float4 a) +{ + return make_int4(int(a.x), int(a.y), int(a.z), int(a.w)); +} + + +inline __host__ __device__ uint4 make_uint4(uint s) +{ + return make_uint4(s, s, s, s); +} +inline __host__ __device__ uint4 make_uint4(uint3 a) +{ + return make_uint4(a.x, a.y, a.z, 0); +} +inline __host__ __device__ uint4 make_uint4(uint3 a, uint w) +{ + return make_uint4(a.x, a.y, a.z, w); +} +inline __host__ __device__ uint4 make_uint4(int4 a) +{ + return make_uint4(uint(a.x), uint(a.y), uint(a.z), uint(a.w)); +} + +//////////////////////////////////////////////////////////////////////////////// +// negate +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float2 operator-(float2 &a) +{ + return make_float2(-a.x, -a.y); +} +inline __host__ __device__ int2 operator-(int2 &a) +{ + return make_int2(-a.x, -a.y); +} +inline __host__ __device__ float3 operator-(float3 &a) +{ + return make_float3(-a.x, -a.y, -a.z); +} +inline __host__ __device__ int3 operator-(int3 &a) +{ + return make_int3(-a.x, -a.y, -a.z); +} +inline __host__ __device__ float4 operator-(float4 &a) +{ + return make_float4(-a.x, -a.y, -a.z, -a.w); +} +inline __host__ __device__ int4 operator-(int4 &a) +{ + return make_int4(-a.x, -a.y, -a.z, -a.w); +} + +//////////////////////////////////////////////////////////////////////////////// +// addition +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float2 operator+(float2 a, float2 b) +{ + return make_float2(a.x + b.x, a.y + b.y); +} +inline __host__ __device__ void operator+=(float2 &a, float2 b) +{ + a.x += b.x; + a.y += b.y; +} +inline __host__ __device__ float2 operator+(float2 a, float b) +{ + return make_float2(a.x + b, a.y + b); +} +inline __host__ __device__ float2 operator+(float b, float2 a) +{ + return make_float2(a.x + b, a.y + b); +} +inline __host__ __device__ void operator+=(float2 &a, float b) +{ + a.x += b; + a.y += b; +} + +inline __host__ __device__ int2 operator+(int2 a, int2 b) +{ + return make_int2(a.x + b.x, a.y + b.y); +} +inline __host__ __device__ void operator+=(int2 &a, int2 b) +{ + a.x += b.x; + a.y += b.y; +} +inline __host__ __device__ int2 operator+(int2 a, int b) +{ + return make_int2(a.x + b, a.y + b); +} +inline __host__ __device__ int2 operator+(int b, int2 a) +{ + return make_int2(a.x + b, a.y + b); +} +inline __host__ __device__ void operator+=(int2 &a, int b) +{ + a.x += b; + a.y += b; +} + +inline __host__ __device__ uint2 operator+(uint2 a, uint2 b) +{ + return make_uint2(a.x + b.x, a.y + b.y); +} +inline __host__ __device__ void operator+=(uint2 &a, uint2 b) +{ + a.x += b.x; + a.y += b.y; +} +inline __host__ __device__ uint2 operator+(uint2 a, uint b) +{ + return make_uint2(a.x + b, a.y + b); +} +inline __host__ __device__ uint2 operator+(uint b, uint2 a) +{ + return make_uint2(a.x + b, a.y + b); +} +inline __host__ __device__ void operator+=(uint2 &a, uint b) +{ + a.x += b; + a.y += b; +} + + +inline __host__ __device__ float3 operator+(float3 a, float3 b) +{ + return make_float3(a.x + b.x, a.y + b.y, a.z + b.z); +} +inline __host__ __device__ void operator+=(float3 &a, float3 b) +{ + a.x += b.x; + a.y += b.y; + a.z += b.z; +} +inline __host__ __device__ float3 operator+(float3 a, float b) +{ + return make_float3(a.x + b, a.y + b, a.z + b); +} +inline __host__ __device__ void operator+=(float3 &a, float b) +{ + a.x += b; + a.y += b; + a.z += b; +} + +inline __host__ __device__ int3 operator+(int3 a, int3 b) +{ + return make_int3(a.x + b.x, a.y + b.y, a.z + b.z); +} +inline __host__ __device__ void operator+=(int3 &a, int3 b) +{ + a.x += b.x; + a.y += b.y; + a.z += b.z; +} +inline __host__ __device__ int3 operator+(int3 a, int b) +{ + return make_int3(a.x + b, a.y + b, a.z + b); +} +inline __host__ __device__ void operator+=(int3 &a, int b) +{ + a.x += b; + a.y += b; + a.z += b; +} + +inline __host__ __device__ uint3 operator+(uint3 a, uint3 b) +{ + return make_uint3(a.x + b.x, a.y + b.y, a.z + b.z); +} +inline __host__ __device__ void operator+=(uint3 &a, uint3 b) +{ + a.x += b.x; + a.y += b.y; + a.z += b.z; +} +inline __host__ __device__ uint3 operator+(uint3 a, uint b) +{ + return make_uint3(a.x + b, a.y + b, a.z + b); +} +inline __host__ __device__ void operator+=(uint3 &a, uint b) +{ + a.x += b; + a.y += b; + a.z += b; +} + +inline __host__ __device__ int3 operator+(int b, int3 a) +{ + return make_int3(a.x + b, a.y + b, a.z + b); +} +inline __host__ __device__ uint3 operator+(uint b, uint3 a) +{ + return make_uint3(a.x + b, a.y + b, a.z + b); +} +inline __host__ __device__ float3 operator+(float b, float3 a) +{ + return make_float3(a.x + b, a.y + b, a.z + b); +} + +inline __host__ __device__ float4 operator+(float4 a, float4 b) +{ + return make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); +} +inline __host__ __device__ void operator+=(float4 &a, float4 b) +{ + a.x += b.x; + a.y += b.y; + a.z += b.z; + a.w += b.w; +} +inline __host__ __device__ float4 operator+(float4 a, float b) +{ + return make_float4(a.x + b, a.y + b, a.z + b, a.w + b); +} +inline __host__ __device__ float4 operator+(float b, float4 a) +{ + return make_float4(a.x + b, a.y + b, a.z + b, a.w + b); +} +inline __host__ __device__ void operator+=(float4 &a, float b) +{ + a.x += b; + a.y += b; + a.z += b; + a.w += b; +} + +inline __host__ __device__ int4 operator+(int4 a, int4 b) +{ + return make_int4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); +} +inline __host__ __device__ void operator+=(int4 &a, int4 b) +{ + a.x += b.x; + a.y += b.y; + a.z += b.z; + a.w += b.w; +} +inline __host__ __device__ int4 operator+(int4 a, int b) +{ + return make_int4(a.x + b, a.y + b, a.z + b, a.w + b); +} +inline __host__ __device__ int4 operator+(int b, int4 a) +{ + return make_int4(a.x + b, a.y + b, a.z + b, a.w + b); +} +inline __host__ __device__ void operator+=(int4 &a, int b) +{ + a.x += b; + a.y += b; + a.z += b; + a.w += b; +} + +inline __host__ __device__ uint4 operator+(uint4 a, uint4 b) +{ + return make_uint4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); +} +inline __host__ __device__ void operator+=(uint4 &a, uint4 b) +{ + a.x += b.x; + a.y += b.y; + a.z += b.z; + a.w += b.w; +} +inline __host__ __device__ uint4 operator+(uint4 a, uint b) +{ + return make_uint4(a.x + b, a.y + b, a.z + b, a.w + b); +} +inline __host__ __device__ uint4 operator+(uint b, uint4 a) +{ + return make_uint4(a.x + b, a.y + b, a.z + b, a.w + b); +} +inline __host__ __device__ void operator+=(uint4 &a, uint b) +{ + a.x += b; + a.y += b; + a.z += b; + a.w += b; +} + +//////////////////////////////////////////////////////////////////////////////// +// subtract +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float2 operator-(float2 a, float2 b) +{ + return make_float2(a.x - b.x, a.y - b.y); +} +inline __host__ __device__ void operator-=(float2 &a, float2 b) +{ + a.x -= b.x; + a.y -= b.y; +} +inline __host__ __device__ float2 operator-(float2 a, float b) +{ + return make_float2(a.x - b, a.y - b); +} +inline __host__ __device__ float2 operator-(float b, float2 a) +{ + return make_float2(b - a.x, b - a.y); +} +inline __host__ __device__ void operator-=(float2 &a, float b) +{ + a.x -= b; + a.y -= b; +} + +inline __host__ __device__ int2 operator-(int2 a, int2 b) +{ + return make_int2(a.x - b.x, a.y - b.y); +} +inline __host__ __device__ void operator-=(int2 &a, int2 b) +{ + a.x -= b.x; + a.y -= b.y; +} +inline __host__ __device__ int2 operator-(int2 a, int b) +{ + return make_int2(a.x - b, a.y - b); +} +inline __host__ __device__ int2 operator-(int b, int2 a) +{ + return make_int2(b - a.x, b - a.y); +} +inline __host__ __device__ void operator-=(int2 &a, int b) +{ + a.x -= b; + a.y -= b; +} + +inline __host__ __device__ uint2 operator-(uint2 a, uint2 b) +{ + return make_uint2(a.x - b.x, a.y - b.y); +} +inline __host__ __device__ void operator-=(uint2 &a, uint2 b) +{ + a.x -= b.x; + a.y -= b.y; +} +inline __host__ __device__ uint2 operator-(uint2 a, uint b) +{ + return make_uint2(a.x - b, a.y - b); +} +inline __host__ __device__ uint2 operator-(uint b, uint2 a) +{ + return make_uint2(b - a.x, b - a.y); +} +inline __host__ __device__ void operator-=(uint2 &a, uint b) +{ + a.x -= b; + a.y -= b; +} + +inline __host__ __device__ float3 operator-(float3 a, float3 b) +{ + return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); +} +inline __host__ __device__ void operator-=(float3 &a, float3 b) +{ + a.x -= b.x; + a.y -= b.y; + a.z -= b.z; +} +inline __host__ __device__ float3 operator-(float3 a, float b) +{ + return make_float3(a.x - b, a.y - b, a.z - b); +} +inline __host__ __device__ float3 operator-(float b, float3 a) +{ + return make_float3(b - a.x, b - a.y, b - a.z); +} +inline __host__ __device__ void operator-=(float3 &a, float b) +{ + a.x -= b; + a.y -= b; + a.z -= b; +} + +inline __host__ __device__ int3 operator-(int3 a, int3 b) +{ + return make_int3(a.x - b.x, a.y - b.y, a.z - b.z); +} +inline __host__ __device__ void operator-=(int3 &a, int3 b) +{ + a.x -= b.x; + a.y -= b.y; + a.z -= b.z; +} +inline __host__ __device__ int3 operator-(int3 a, int b) +{ + return make_int3(a.x - b, a.y - b, a.z - b); +} +inline __host__ __device__ int3 operator-(int b, int3 a) +{ + return make_int3(b - a.x, b - a.y, b - a.z); +} +inline __host__ __device__ void operator-=(int3 &a, int b) +{ + a.x -= b; + a.y -= b; + a.z -= b; +} + +inline __host__ __device__ uint3 operator-(uint3 a, uint3 b) +{ + return make_uint3(a.x - b.x, a.y - b.y, a.z - b.z); +} +inline __host__ __device__ void operator-=(uint3 &a, uint3 b) +{ + a.x -= b.x; + a.y -= b.y; + a.z -= b.z; +} +inline __host__ __device__ uint3 operator-(uint3 a, uint b) +{ + return make_uint3(a.x - b, a.y - b, a.z - b); +} +inline __host__ __device__ uint3 operator-(uint b, uint3 a) +{ + return make_uint3(b - a.x, b - a.y, b - a.z); +} +inline __host__ __device__ void operator-=(uint3 &a, uint b) +{ + a.x -= b; + a.y -= b; + a.z -= b; +} + +inline __host__ __device__ float4 operator-(float4 a, float4 b) +{ + return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); +} +inline __host__ __device__ void operator-=(float4 &a, float4 b) +{ + a.x -= b.x; + a.y -= b.y; + a.z -= b.z; + a.w -= b.w; +} +inline __host__ __device__ float4 operator-(float4 a, float b) +{ + return make_float4(a.x - b, a.y - b, a.z - b, a.w - b); +} +inline __host__ __device__ void operator-=(float4 &a, float b) +{ + a.x -= b; + a.y -= b; + a.z -= b; + a.w -= b; +} + +inline __host__ __device__ int4 operator-(int4 a, int4 b) +{ + return make_int4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); +} +inline __host__ __device__ void operator-=(int4 &a, int4 b) +{ + a.x -= b.x; + a.y -= b.y; + a.z -= b.z; + a.w -= b.w; +} +inline __host__ __device__ int4 operator-(int4 a, int b) +{ + return make_int4(a.x - b, a.y - b, a.z - b, a.w - b); +} +inline __host__ __device__ int4 operator-(int b, int4 a) +{ + return make_int4(b - a.x, b - a.y, b - a.z, b - a.w); +} +inline __host__ __device__ void operator-=(int4 &a, int b) +{ + a.x -= b; + a.y -= b; + a.z -= b; + a.w -= b; +} + +inline __host__ __device__ uint4 operator-(uint4 a, uint4 b) +{ + return make_uint4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); +} +inline __host__ __device__ void operator-=(uint4 &a, uint4 b) +{ + a.x -= b.x; + a.y -= b.y; + a.z -= b.z; + a.w -= b.w; +} +inline __host__ __device__ uint4 operator-(uint4 a, uint b) +{ + return make_uint4(a.x - b, a.y - b, a.z - b, a.w - b); +} +inline __host__ __device__ uint4 operator-(uint b, uint4 a) +{ + return make_uint4(b - a.x, b - a.y, b - a.z, b - a.w); +} +inline __host__ __device__ void operator-=(uint4 &a, uint b) +{ + a.x -= b; + a.y -= b; + a.z -= b; + a.w -= b; +} + +//////////////////////////////////////////////////////////////////////////////// +// multiply +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float2 operator*(float2 a, float2 b) +{ + return make_float2(a.x * b.x, a.y * b.y); +} +inline __host__ __device__ void operator*=(float2 &a, float2 b) +{ + a.x *= b.x; + a.y *= b.y; +} +inline __host__ __device__ float2 operator*(float2 a, float b) +{ + return make_float2(a.x * b, a.y * b); +} +inline __host__ __device__ float2 operator*(float b, float2 a) +{ + return make_float2(b * a.x, b * a.y); +} +inline __host__ __device__ void operator*=(float2 &a, float b) +{ + a.x *= b; + a.y *= b; +} + +inline __host__ __device__ int2 operator*(int2 a, int2 b) +{ + return make_int2(a.x * b.x, a.y * b.y); +} +inline __host__ __device__ void operator*=(int2 &a, int2 b) +{ + a.x *= b.x; + a.y *= b.y; +} +inline __host__ __device__ int2 operator*(int2 a, int b) +{ + return make_int2(a.x * b, a.y * b); +} +inline __host__ __device__ int2 operator*(int b, int2 a) +{ + return make_int2(b * a.x, b * a.y); +} +inline __host__ __device__ void operator*=(int2 &a, int b) +{ + a.x *= b; + a.y *= b; +} + +inline __host__ __device__ uint2 operator*(uint2 a, uint2 b) +{ + return make_uint2(a.x * b.x, a.y * b.y); +} +inline __host__ __device__ void operator*=(uint2 &a, uint2 b) +{ + a.x *= b.x; + a.y *= b.y; +} +inline __host__ __device__ uint2 operator*(uint2 a, uint b) +{ + return make_uint2(a.x * b, a.y * b); +} +inline __host__ __device__ uint2 operator*(uint b, uint2 a) +{ + return make_uint2(b * a.x, b * a.y); +} +inline __host__ __device__ void operator*=(uint2 &a, uint b) +{ + a.x *= b; + a.y *= b; +} + +inline __host__ __device__ float3 operator*(float3 a, float3 b) +{ + return make_float3(a.x * b.x, a.y * b.y, a.z * b.z); +} +inline __host__ __device__ void operator*=(float3 &a, float3 b) +{ + a.x *= b.x; + a.y *= b.y; + a.z *= b.z; +} +inline __host__ __device__ float3 operator*(float3 a, float b) +{ + return make_float3(a.x * b, a.y * b, a.z * b); +} +inline __host__ __device__ float3 operator*(float b, float3 a) +{ + return make_float3(b * a.x, b * a.y, b * a.z); +} +inline __host__ __device__ void operator*=(float3 &a, float b) +{ + a.x *= b; + a.y *= b; + a.z *= b; +} + +inline __host__ __device__ int3 operator*(int3 a, int3 b) +{ + return make_int3(a.x * b.x, a.y * b.y, a.z * b.z); +} +inline __host__ __device__ void operator*=(int3 &a, int3 b) +{ + a.x *= b.x; + a.y *= b.y; + a.z *= b.z; +} +inline __host__ __device__ int3 operator*(int3 a, int b) +{ + return make_int3(a.x * b, a.y * b, a.z * b); +} +inline __host__ __device__ int3 operator*(int b, int3 a) +{ + return make_int3(b * a.x, b * a.y, b * a.z); +} +inline __host__ __device__ void operator*=(int3 &a, int b) +{ + a.x *= b; + a.y *= b; + a.z *= b; +} + +inline __host__ __device__ uint3 operator*(uint3 a, uint3 b) +{ + return make_uint3(a.x * b.x, a.y * b.y, a.z * b.z); +} +inline __host__ __device__ void operator*=(uint3 &a, uint3 b) +{ + a.x *= b.x; + a.y *= b.y; + a.z *= b.z; +} +inline __host__ __device__ uint3 operator*(uint3 a, uint b) +{ + return make_uint3(a.x * b, a.y * b, a.z * b); +} +inline __host__ __device__ uint3 operator*(uint b, uint3 a) +{ + return make_uint3(b * a.x, b * a.y, b * a.z); +} +inline __host__ __device__ void operator*=(uint3 &a, uint b) +{ + a.x *= b; + a.y *= b; + a.z *= b; +} + +inline __host__ __device__ float4 operator*(float4 a, float4 b) +{ + return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); +} +inline __host__ __device__ void operator*=(float4 &a, float4 b) +{ + a.x *= b.x; + a.y *= b.y; + a.z *= b.z; + a.w *= b.w; +} +inline __host__ __device__ float4 operator*(float4 a, float b) +{ + return make_float4(a.x * b, a.y * b, a.z * b, a.w * b); +} +inline __host__ __device__ float4 operator*(float b, float4 a) +{ + return make_float4(b * a.x, b * a.y, b * a.z, b * a.w); +} +inline __host__ __device__ void operator*=(float4 &a, float b) +{ + a.x *= b; + a.y *= b; + a.z *= b; + a.w *= b; +} + +inline __host__ __device__ int4 operator*(int4 a, int4 b) +{ + return make_int4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); +} +inline __host__ __device__ void operator*=(int4 &a, int4 b) +{ + a.x *= b.x; + a.y *= b.y; + a.z *= b.z; + a.w *= b.w; +} +inline __host__ __device__ int4 operator*(int4 a, int b) +{ + return make_int4(a.x * b, a.y * b, a.z * b, a.w * b); +} +inline __host__ __device__ int4 operator*(int b, int4 a) +{ + return make_int4(b * a.x, b * a.y, b * a.z, b * a.w); +} +inline __host__ __device__ void operator*=(int4 &a, int b) +{ + a.x *= b; + a.y *= b; + a.z *= b; + a.w *= b; +} + +inline __host__ __device__ uint4 operator*(uint4 a, uint4 b) +{ + return make_uint4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); +} +inline __host__ __device__ void operator*=(uint4 &a, uint4 b) +{ + a.x *= b.x; + a.y *= b.y; + a.z *= b.z; + a.w *= b.w; +} +inline __host__ __device__ uint4 operator*(uint4 a, uint b) +{ + return make_uint4(a.x * b, a.y * b, a.z * b, a.w * b); +} +inline __host__ __device__ uint4 operator*(uint b, uint4 a) +{ + return make_uint4(b * a.x, b * a.y, b * a.z, b * a.w); +} +inline __host__ __device__ void operator*=(uint4 &a, uint b) +{ + a.x *= b; + a.y *= b; + a.z *= b; + a.w *= b; +} + +//////////////////////////////////////////////////////////////////////////////// +// divide +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float2 operator/(float2 a, float2 b) +{ + return make_float2(a.x / b.x, a.y / b.y); +} +inline __host__ __device__ void operator/=(float2 &a, float2 b) +{ + a.x /= b.x; + a.y /= b.y; +} +inline __host__ __device__ float2 operator/(float2 a, float b) +{ + return make_float2(a.x / b, a.y / b); +} +inline __host__ __device__ void operator/=(float2 &a, float b) +{ + a.x /= b; + a.y /= b; +} +inline __host__ __device__ float2 operator/(float b, float2 a) +{ + return make_float2(b / a.x, b / a.y); +} + +inline __host__ __device__ float3 operator/(float3 a, float3 b) +{ + return make_float3(a.x / b.x, a.y / b.y, a.z / b.z); +} +inline __host__ __device__ void operator/=(float3 &a, float3 b) +{ + a.x /= b.x; + a.y /= b.y; + a.z /= b.z; +} +inline __host__ __device__ float3 operator/(float3 a, float b) +{ + return make_float3(a.x / b, a.y / b, a.z / b); +} +inline __host__ __device__ void operator/=(float3 &a, float b) +{ + a.x /= b; + a.y /= b; + a.z /= b; +} +inline __host__ __device__ float3 operator/(float b, float3 a) +{ + return make_float3(b / a.x, b / a.y, b / a.z); +} + +inline __host__ __device__ float4 operator/(float4 a, float4 b) +{ + return make_float4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w); +} +inline __host__ __device__ void operator/=(float4 &a, float4 b) +{ + a.x /= b.x; + a.y /= b.y; + a.z /= b.z; + a.w /= b.w; +} +inline __host__ __device__ float4 operator/(float4 a, float b) +{ + return make_float4(a.x / b, a.y / b, a.z / b, a.w / b); +} +inline __host__ __device__ void operator/=(float4 &a, float b) +{ + a.x /= b; + a.y /= b; + a.z /= b; + a.w /= b; +} +inline __host__ __device__ float4 operator/(float b, float4 a) +{ + return make_float4(b / a.x, b / a.y, b / a.z, b / a.w); +} + +//////////////////////////////////////////////////////////////////////////////// +// min +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float2 fminf(float2 a, float2 b) +{ + return make_float2(fminf(a.x,b.x), fminf(a.y,b.y)); +} +inline __host__ __device__ float3 fminf(float3 a, float3 b) +{ + return make_float3(fminf(a.x,b.x), fminf(a.y,b.y), fminf(a.z,b.z)); +} +inline __host__ __device__ float4 fminf(float4 a, float4 b) +{ + return make_float4(fminf(a.x,b.x), fminf(a.y,b.y), fminf(a.z,b.z), fminf(a.w,b.w)); +} + +inline __host__ __device__ int2 min(int2 a, int2 b) +{ + return make_int2(min(a.x,b.x), min(a.y,b.y)); +} +inline __host__ __device__ int3 min(int3 a, int3 b) +{ + return make_int3(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z)); +} +inline __host__ __device__ int4 min(int4 a, int4 b) +{ + return make_int4(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z), min(a.w,b.w)); +} + +inline __host__ __device__ uint2 min(uint2 a, uint2 b) +{ + return make_uint2(min(a.x,b.x), min(a.y,b.y)); +} +inline __host__ __device__ uint3 min(uint3 a, uint3 b) +{ + return make_uint3(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z)); +} +inline __host__ __device__ uint4 min(uint4 a, uint4 b) +{ + return make_uint4(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z), min(a.w,b.w)); +} + +//////////////////////////////////////////////////////////////////////////////// +// max +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float2 fmaxf(float2 a, float2 b) +{ + return make_float2(fmaxf(a.x,b.x), fmaxf(a.y,b.y)); +} +inline __host__ __device__ float3 fmaxf(float3 a, float3 b) +{ + return make_float3(fmaxf(a.x,b.x), fmaxf(a.y,b.y), fmaxf(a.z,b.z)); +} +inline __host__ __device__ float4 fmaxf(float4 a, float4 b) +{ + return make_float4(fmaxf(a.x,b.x), fmaxf(a.y,b.y), fmaxf(a.z,b.z), fmaxf(a.w,b.w)); +} + +inline __host__ __device__ int2 max(int2 a, int2 b) +{ + return make_int2(max(a.x,b.x), max(a.y,b.y)); +} +inline __host__ __device__ int3 max(int3 a, int3 b) +{ + return make_int3(max(a.x,b.x), max(a.y,b.y), max(a.z,b.z)); +} +inline __host__ __device__ int4 max(int4 a, int4 b) +{ + return make_int4(max(a.x,b.x), max(a.y,b.y), max(a.z,b.z), max(a.w,b.w)); +} + +inline __host__ __device__ uint2 max(uint2 a, uint2 b) +{ + return make_uint2(max(a.x,b.x), max(a.y,b.y)); +} +inline __host__ __device__ uint3 max(uint3 a, uint3 b) +{ + return make_uint3(max(a.x,b.x), max(a.y,b.y), max(a.z,b.z)); +} +inline __host__ __device__ uint4 max(uint4 a, uint4 b) +{ + return make_uint4(max(a.x,b.x), max(a.y,b.y), max(a.z,b.z), max(a.w,b.w)); +} + +//////////////////////////////////////////////////////////////////////////////// +// lerp +// - linear interpolation between a and b, based on value t in [0, 1] range +//////////////////////////////////////////////////////////////////////////////// + +inline __device__ __host__ float lerp(float a, float b, float t) +{ + return a + t*(b-a); +} +inline __device__ __host__ float2 lerp(float2 a, float2 b, float t) +{ + return a + t*(b-a); +} +inline __device__ __host__ float3 lerp(float3 a, float3 b, float t) +{ + return a + t*(b-a); +} +inline __device__ __host__ float4 lerp(float4 a, float4 b, float t) +{ + return a + t*(b-a); +} + +//////////////////////////////////////////////////////////////////////////////// +// clamp +// - clamp the value v to be in the range [a, b] +//////////////////////////////////////////////////////////////////////////////// + +inline __device__ __host__ float clamp(float f, float a, float b) +{ + return fmaxf(a, fminf(f, b)); +} +inline __device__ __host__ int clamp(int f, int a, int b) +{ + return max(a, min(f, b)); +} +inline __device__ __host__ uint clamp(uint f, uint a, uint b) +{ + return max(a, min(f, b)); +} + +inline __device__ __host__ float2 clamp(float2 v, float a, float b) +{ + return make_float2(clamp(v.x, a, b), clamp(v.y, a, b)); +} +inline __device__ __host__ float2 clamp(float2 v, float2 a, float2 b) +{ + return make_float2(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y)); +} +inline __device__ __host__ float3 clamp(float3 v, float a, float b) +{ + return make_float3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b)); +} +inline __device__ __host__ float3 clamp(float3 v, float3 a, float3 b) +{ + return make_float3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z)); +} +inline __device__ __host__ float4 clamp(float4 v, float a, float b) +{ + return make_float4(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b), clamp(v.w, a, b)); +} +inline __device__ __host__ float4 clamp(float4 v, float4 a, float4 b) +{ + return make_float4(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z), clamp(v.w, a.w, b.w)); +} + +inline __device__ __host__ int2 clamp(int2 v, int a, int b) +{ + return make_int2(clamp(v.x, a, b), clamp(v.y, a, b)); +} +inline __device__ __host__ int2 clamp(int2 v, int2 a, int2 b) +{ + return make_int2(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y)); +} +inline __device__ __host__ int3 clamp(int3 v, int a, int b) +{ + return make_int3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b)); +} +inline __device__ __host__ int3 clamp(int3 v, int3 a, int3 b) +{ + return make_int3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z)); +} +inline __device__ __host__ int4 clamp(int4 v, int a, int b) +{ + return make_int4(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b), clamp(v.w, a, b)); +} +inline __device__ __host__ int4 clamp(int4 v, int4 a, int4 b) +{ + return make_int4(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z), clamp(v.w, a.w, b.w)); +} + +inline __device__ __host__ uint2 clamp(uint2 v, uint a, uint b) +{ + return make_uint2(clamp(v.x, a, b), clamp(v.y, a, b)); +} +inline __device__ __host__ uint2 clamp(uint2 v, uint2 a, uint2 b) +{ + return make_uint2(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y)); +} +inline __device__ __host__ uint3 clamp(uint3 v, uint a, uint b) +{ + return make_uint3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b)); +} +inline __device__ __host__ uint3 clamp(uint3 v, uint3 a, uint3 b) +{ + return make_uint3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z)); +} +inline __device__ __host__ uint4 clamp(uint4 v, uint a, uint b) +{ + return make_uint4(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b), clamp(v.w, a, b)); +} +inline __device__ __host__ uint4 clamp(uint4 v, uint4 a, uint4 b) +{ + return make_uint4(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z), clamp(v.w, a.w, b.w)); +} + +//////////////////////////////////////////////////////////////////////////////// +// dot product +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float dot(float2 a, float2 b) +{ + return a.x * b.x + a.y * b.y; +} +inline __host__ __device__ float dot(float3 a, float3 b) +{ + return a.x * b.x + a.y * b.y + a.z * b.z; +} +inline __host__ __device__ float dot(float4 a, float4 b) +{ + return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w; +} + +inline __host__ __device__ int dot(int2 a, int2 b) +{ + return a.x * b.x + a.y * b.y; +} +inline __host__ __device__ int dot(int3 a, int3 b) +{ + return a.x * b.x + a.y * b.y + a.z * b.z; +} +inline __host__ __device__ int dot(int4 a, int4 b) +{ + return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w; +} + +inline __host__ __device__ uint dot(uint2 a, uint2 b) +{ + return a.x * b.x + a.y * b.y; +} +inline __host__ __device__ uint dot(uint3 a, uint3 b) +{ + return a.x * b.x + a.y * b.y + a.z * b.z; +} +inline __host__ __device__ uint dot(uint4 a, uint4 b) +{ + return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w; +} + +//////////////////////////////////////////////////////////////////////////////// +// length +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float length(float2 v) +{ + return sqrtf(dot(v, v)); +} +inline __host__ __device__ float length(float3 v) +{ + return sqrtf(dot(v, v)); +} +inline __host__ __device__ float length(float4 v) +{ + return sqrtf(dot(v, v)); +} + +//////////////////////////////////////////////////////////////////////////////// +// normalize +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float2 normalize(float2 v) +{ + float invLen = rsqrtf(dot(v, v)); + return v * invLen; +} +inline __host__ __device__ float3 normalize(float3 v) +{ + float invLen = rsqrtf(dot(v, v)); + return v * invLen; +} +inline __host__ __device__ float4 normalize(float4 v) +{ + float invLen = rsqrtf(dot(v, v)); + return v * invLen; +} + +//////////////////////////////////////////////////////////////////////////////// +// floor +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float2 floorf(float2 v) +{ + return make_float2(floorf(v.x), floorf(v.y)); +} +inline __host__ __device__ float3 floorf(float3 v) +{ + return make_float3(floorf(v.x), floorf(v.y), floorf(v.z)); +} +inline __host__ __device__ float4 floorf(float4 v) +{ + return make_float4(floorf(v.x), floorf(v.y), floorf(v.z), floorf(v.w)); +} + +//////////////////////////////////////////////////////////////////////////////// +// frac - returns the fractional portion of a scalar or each vector component +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float fracf(float v) +{ + return v - floorf(v); +} +inline __host__ __device__ float2 fracf(float2 v) +{ + return make_float2(fracf(v.x), fracf(v.y)); +} +inline __host__ __device__ float3 fracf(float3 v) +{ + return make_float3(fracf(v.x), fracf(v.y), fracf(v.z)); +} +inline __host__ __device__ float4 fracf(float4 v) +{ + return make_float4(fracf(v.x), fracf(v.y), fracf(v.z), fracf(v.w)); +} + +//////////////////////////////////////////////////////////////////////////////// +// fmod +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float2 fmodf(float2 a, float2 b) +{ + return make_float2(fmodf(a.x, b.x), fmodf(a.y, b.y)); +} +inline __host__ __device__ float3 fmodf(float3 a, float3 b) +{ + return make_float3(fmodf(a.x, b.x), fmodf(a.y, b.y), fmodf(a.z, b.z)); +} +inline __host__ __device__ float4 fmodf(float4 a, float4 b) +{ + return make_float4(fmodf(a.x, b.x), fmodf(a.y, b.y), fmodf(a.z, b.z), fmodf(a.w, b.w)); +} + +//////////////////////////////////////////////////////////////////////////////// +// absolute value +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float2 fabs(float2 v) +{ + return make_float2(fabs(v.x), fabs(v.y)); +} +inline __host__ __device__ float3 fabs(float3 v) +{ + return make_float3(fabs(v.x), fabs(v.y), fabs(v.z)); +} +inline __host__ __device__ float4 fabs(float4 v) +{ + return make_float4(fabs(v.x), fabs(v.y), fabs(v.z), fabs(v.w)); +} + +inline __host__ __device__ int2 abs(int2 v) +{ + return make_int2(abs(v.x), abs(v.y)); +} +inline __host__ __device__ int3 abs(int3 v) +{ + return make_int3(abs(v.x), abs(v.y), abs(v.z)); +} +inline __host__ __device__ int4 abs(int4 v) +{ + return make_int4(abs(v.x), abs(v.y), abs(v.z), abs(v.w)); +} + +//////////////////////////////////////////////////////////////////////////////// +// reflect +// - returns reflection of incident ray I around surface normal N +// - N should be normalized, reflected vector's length is equal to length of I +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float3 reflect(float3 i, float3 n) +{ + return i - 2.0f * n * dot(n,i); +} + +//////////////////////////////////////////////////////////////////////////////// +// cross product +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float3 cross(float3 a, float3 b) +{ + return make_float3(a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x); +} + +//////////////////////////////////////////////////////////////////////////////// +// smoothstep +// - returns 0 if x < a +// - returns 1 if x > b +// - otherwise returns smooth interpolation between 0 and 1 based on x +//////////////////////////////////////////////////////////////////////////////// + +inline __device__ __host__ float smoothstep(float a, float b, float x) +{ + float y = clamp((x - a) / (b - a), 0.0f, 1.0f); + return (y*y*(3.0f - (2.0f*y))); +} +inline __device__ __host__ float2 smoothstep(float2 a, float2 b, float2 x) +{ + float2 y = clamp((x - a) / (b - a), 0.0f, 1.0f); + return (y*y*(make_float2(3.0f) - (make_float2(2.0f)*y))); +} +inline __device__ __host__ float3 smoothstep(float3 a, float3 b, float3 x) +{ + float3 y = clamp((x - a) / (b - a), 0.0f, 1.0f); + return (y*y*(make_float3(3.0f) - (make_float3(2.0f)*y))); +} +inline __device__ __host__ float4 smoothstep(float4 a, float4 b, float4 x) +{ + float4 y = clamp((x - a) / (b - a), 0.0f, 1.0f); + return (y*y*(make_float4(3.0f) - (make_float4(2.0f)*y))); +} + +#endif diff --git a/src/algorithms/tracking/libs/cudahelpers/helper_string.h b/src/algorithms/tracking/libs/cudahelpers/helper_string.h new file mode 100644 index 000000000..c734314c6 --- /dev/null +++ b/src/algorithms/tracking/libs/cudahelpers/helper_string.h @@ -0,0 +1,516 @@ +/** + * Copyright 1993-2013 NVIDIA Corporation. All rights reserved. + * + * Please refer to the NVIDIA end user license agreement (EULA) associated + * with this source code for terms and conditions that govern your use of + * this software. Any use, reproduction, disclosure, or distribution of + * this software and related documentation outside the terms of the EULA + * is strictly prohibited. + * + */ + +// These are helper functions for the SDK samples (string parsing, timers, etc) +#ifndef STRING_HELPER_H +#define STRING_HELPER_H + +#include +#include +#include +#include + +#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) +#ifndef _CRT_SECURE_NO_DEPRECATE +#define _CRT_SECURE_NO_DEPRECATE +#endif +#ifndef STRCASECMP +#define STRCASECMP _stricmp +#endif +#ifndef STRNCASECMP +#define STRNCASECMP _strnicmp +#endif +#ifndef STRCPY +#define STRCPY(sFilePath, nLength, sPath) strcpy_s(sFilePath, nLength, sPath) +#endif + +#ifndef FOPEN +#define FOPEN(fHandle,filename,mode) fopen_s(&fHandle, filename, mode) +#endif +#ifndef FOPEN_FAIL +#define FOPEN_FAIL(result) (result != 0) +#endif +#ifndef SSCANF +#define SSCANF sscanf_s +#endif +#ifndef SPRINTF +#define SPRINTF sprintf_s +#endif +#else // Linux Includes +#include +#include + +#ifndef STRCASECMP +#define STRCASECMP strcasecmp +#endif +#ifndef STRNCASECMP +#define STRNCASECMP strncasecmp +#endif +#ifndef STRCPY +#define STRCPY(sFilePath, nLength, sPath) strcpy(sFilePath, sPath) +#endif + +#ifndef FOPEN +#define FOPEN(fHandle,filename,mode) (fHandle = fopen(filename, mode)) +#endif +#ifndef FOPEN_FAIL +#define FOPEN_FAIL(result) (result == NULL) +#endif +#ifndef SSCANF +#define SSCANF sscanf +#endif +#ifndef SPRINTF +#define SPRINTF sprintf +#endif +#endif + +#ifndef EXIT_WAIVED +#define EXIT_WAIVED 2 +#endif + +// CUDA Utility Helper Functions +inline int stringRemoveDelimiter(char delimiter, const char *string) +{ + int string_start = 0; + + while (string[string_start] == delimiter) + { + string_start++; + } + + if (string_start >= (int)strlen(string)-1) + { + return 0; + } + + return string_start; +} + +inline int getFileExtension(char *filename, char **extension) +{ + int string_length = (int)strlen(filename); + + while (filename[string_length--] != '.') + { + if (string_length == 0) + break; + } + + if (string_length > 0) string_length += 2; + + if (string_length == 0) + *extension = NULL; + else + *extension = &filename[string_length]; + + return string_length; +} + + +inline bool checkCmdLineFlag(const int argc, const char **argv, const char *string_ref) +{ + bool bFound = false; + + if (argc >= 1) + { + for (int i=1; i < argc; i++) + { + int string_start = stringRemoveDelimiter('-', argv[i]); + const char *string_argv = &argv[i][string_start]; + + const char *equal_pos = strchr(string_argv, '='); + int argv_length = (int)(equal_pos == 0 ? strlen(string_argv) : equal_pos - string_argv); + + int length = (int)strlen(string_ref); + + if (length == argv_length && !STRNCASECMP(string_argv, string_ref, length)) + { + bFound = true; + continue; + } + } + } + + return bFound; +} + +// This function wraps the CUDA Driver API into a template function +template +inline bool getCmdLineArgumentValue(const int argc, const char **argv, const char *string_ref, T *value) +{ + bool bFound = false; + + if (argc >= 1) + { + for (int i=1; i < argc; i++) + { + int string_start = stringRemoveDelimiter('-', argv[i]); + const char *string_argv = &argv[i][string_start]; + int length = (int)strlen(string_ref); + + if (!STRNCASECMP(string_argv, string_ref, length)) + { + if (length+1 <= (int)strlen(string_argv)) + { + int auto_inc = (string_argv[length] == '=') ? 1 : 0; + *value = (T)atoi(&string_argv[length + auto_inc]); + } + + bFound = true; + i=argc; + } + } + } + + return bFound; +} + +inline int getCmdLineArgumentInt(const int argc, const char **argv, const char *string_ref) +{ + bool bFound = false; + int value = -1; + + if (argc >= 1) + { + for (int i=1; i < argc; i++) + { + int string_start = stringRemoveDelimiter('-', argv[i]); + const char *string_argv = &argv[i][string_start]; + int length = (int)strlen(string_ref); + + if (!STRNCASECMP(string_argv, string_ref, length)) + { + if (length+1 <= (int)strlen(string_argv)) + { + int auto_inc = (string_argv[length] == '=') ? 1 : 0; + value = atoi(&string_argv[length + auto_inc]); + } + else + { + value = 0; + } + + bFound = true; + continue; + } + } + } + + if (bFound) + { + return value; + } + else + { + return 0; + } +} + +inline float getCmdLineArgumentFloat(const int argc, const char **argv, const char *string_ref) +{ + bool bFound = false; + float value = -1; + + if (argc >= 1) + { + for (int i=1; i < argc; i++) + { + int string_start = stringRemoveDelimiter('-', argv[i]); + const char *string_argv = &argv[i][string_start]; + int length = (int)strlen(string_ref); + + if (!STRNCASECMP(string_argv, string_ref, length)) + { + if (length+1 <= (int)strlen(string_argv)) + { + int auto_inc = (string_argv[length] == '=') ? 1 : 0; + value = (float)atof(&string_argv[length + auto_inc]); + } + else + { + value = 0.f; + } + + bFound = true; + continue; + } + } + } + + if (bFound) + { + return value; + } + else + { + return 0; + } +} + +inline bool getCmdLineArgumentString(const int argc, const char **argv, + const char *string_ref, char **string_retval) +{ + bool bFound = false; + + if (argc >= 1) + { + for (int i=1; i < argc; i++) + { + int string_start = stringRemoveDelimiter('-', argv[i]); + char *string_argv = (char *)&argv[i][string_start]; + int length = (int)strlen(string_ref); + + if (!STRNCASECMP(string_argv, string_ref, length)) + { + *string_retval = &string_argv[length+1]; + bFound = true; + continue; + } + } + } + + if (!bFound) + { + *string_retval = NULL; + } + + return bFound; +} + +////////////////////////////////////////////////////////////////////////////// +//! Find the path for a file assuming that +//! files are found in the searchPath. +//! +//! @return the path if succeeded, otherwise 0 +//! @param filename name of the file +//! @param executable_path optional absolute path of the executable +////////////////////////////////////////////////////////////////////////////// +inline char *sdkFindFilePath(const char *filename, const char *executable_path) +{ + // defines a variable that is replaced with the name of the executable + + // Typical relative search paths to locate needed companion files (e.g. sample input data, or JIT source files) + // The origin for the relative search may be the .exe file, a .bat file launching an .exe, a browser .exe launching the .exe or .bat, etc + const char *searchPath[] = + { + "./", // same dir + "./common/", // "/common/" subdir + "./common/data/", // "/common/data/" subdir + "./data/", // "/data/" subdir + "./src/", // "/src/" subdir + "./src//data/", // "/src//data/" subdir + "./inc/", // "/inc/" subdir + "./0_Simple/", // "/0_Simple/" subdir + "./1_Utilities/", // "/1_Utilities/" subdir + "./2_Graphics/", // "/2_Graphics/" subdir + "./3_Imaging/", // "/3_Imaging/" subdir + "./4_Finance/", // "/4_Finance/" subdir + "./5_Simulations/", // "/5_Simulations/" subdir + "./6_Advanced/", // "/6_Advanced/" subdir + "./7_CUDALibraries/", // "/7_CUDALibraries/" subdir + "./8_Android/", // "/8_Android/" subdir + "./samples/", // "/samples/" subdir + + "../", // up 1 in tree + "../common/", // up 1 in tree, "/common/" subdir + "../common/data/", // up 1 in tree, "/common/data/" subdir + "../data/", // up 1 in tree, "/data/" subdir + "../src/", // up 1 in tree, "/src/" subdir + "../inc/", // up 1 in tree, "/inc/" subdir + + "../0_Simple//data/", // up 1 in tree, "/0_Simple//" subdir + "../1_Utilities//data/", // up 1 in tree, "/1_Utilities//" subdir + "../2_Graphics//data/", // up 1 in tree, "/2_Graphics//" subdir + "../3_Imaging//data/", // up 1 in tree, "/3_Imaging//" subdir + "../4_Finance//data/", // up 1 in tree, "/4_Finance//" subdir + "../5_Simulations//data/", // up 1 in tree, "/5_Simulations//" subdir + "../6_Advanced//data/", // up 1 in tree, "/6_Advanced//" subdir + "../7_CUDALibraries//data/",// up 1 in tree, "/7_CUDALibraries//" subdir + "../8_Android//data/", // up 1 in tree, "/8_Android//" subdir + "../samples//data/", // up 1 in tree, "/samples//" subdir + "../../", // up 2 in tree + "../../common/", // up 2 in tree, "/common/" subdir + "../../common/data/", // up 2 in tree, "/common/data/" subdir + "../../data/", // up 2 in tree, "/data/" subdir + "../../src/", // up 2 in tree, "/src/" subdir + "../../inc/", // up 2 in tree, "/inc/" subdir + "../../sandbox//data/", // up 2 in tree, "/sandbox//" subdir + "../../0_Simple//data/", // up 2 in tree, "/0_Simple//" subdir + "../../1_Utilities//data/", // up 2 in tree, "/1_Utilities//" subdir + "../../2_Graphics//data/", // up 2 in tree, "/2_Graphics//" subdir + "../../3_Imaging//data/", // up 2 in tree, "/3_Imaging//" subdir + "../../4_Finance//data/", // up 2 in tree, "/4_Finance//" subdir + "../../5_Simulations//data/", // up 2 in tree, "/5_Simulations//" subdir + "../../6_Advanced//data/", // up 2 in tree, "/6_Advanced//" subdir + "../../7_CUDALibraries//data/", // up 2 in tree, "/7_CUDALibraries//" subdir + "../../8_Android//data/", // up 2 in tree, "/8_Android//" subdir + "../../samples//data/", // up 2 in tree, "/samples//" subdir + "../../../", // up 3 in tree + "../../../src//", // up 3 in tree, "/src//" subdir + "../../../src//data/", // up 3 in tree, "/src//data/" subdir + "../../../src//src/", // up 3 in tree, "/src//src/" subdir + "../../../src//inc/", // up 3 in tree, "/src//inc/" subdir + "../../../sandbox//", // up 3 in tree, "/sandbox//" subdir + "../../../sandbox//data/", // up 3 in tree, "/sandbox//data/" subdir + "../../../sandbox//src/", // up 3 in tree, "/sandbox//src/" subdir + "../../../sandbox//inc/", // up 3 in tree, "/sandbox//inc/" subdir + "../../../0_Simple//data/", // up 3 in tree, "/0_Simple//" subdir + "../../../1_Utilities//data/", // up 3 in tree, "/1_Utilities//" subdir + "../../../2_Graphics//data/", // up 3 in tree, "/2_Graphics//" subdir + "../../../3_Imaging//data/", // up 3 in tree, "/3_Imaging//" subdir + "../../../4_Finance//data/", // up 3 in tree, "/4_Finance//" subdir + "../../../5_Simulations//data/", // up 3 in tree, "/5_Simulations//" subdir + "../../../6_Advanced//data/", // up 3 in tree, "/6_Advanced//" subdir + "../../../7_CUDALibraries//data/", // up 3 in tree, "/7_CUDALibraries//" subdir + "../../../8_Android//data/", // up 3 in tree, "/8_Android//" subdir + "../../../0_Simple//", // up 3 in tree, "/0_Simple//" subdir + "../../../1_Utilities//", // up 3 in tree, "/1_Utilities//" subdir + "../../../2_Graphics//", // up 3 in tree, "/2_Graphics//" subdir + "../../../3_Imaging//", // up 3 in tree, "/3_Imaging//" subdir + "../../../4_Finance//", // up 3 in tree, "/4_Finance//" subdir + "../../../5_Simulations//", // up 3 in tree, "/5_Simulations//" subdir + "../../../6_Advanced//", // up 3 in tree, "/6_Advanced//" subdir + "../../../7_CUDALibraries//", // up 3 in tree, "/7_CUDALibraries//" subdir + "../../../8_Android//", // up 3 in tree, "/8_Android//" subdir + "../../../samples//data/", // up 3 in tree, "/samples//" subdir + "../../../common/", // up 3 in tree, "../../../common/" subdir + "../../../common/data/", // up 3 in tree, "../../../common/data/" subdir + "../../../data/", // up 3 in tree, "../../../data/" subdir + "../../../../", // up 4 in tree + "../../../../src//", // up 4 in tree, "/src//" subdir + "../../../../src//data/", // up 4 in tree, "/src//data/" subdir + "../../../../src//src/", // up 4 in tree, "/src//src/" subdir + "../../../../src//inc/", // up 4 in tree, "/src//inc/" subdir + "../../../../sandbox//", // up 4 in tree, "/sandbox//" subdir + "../../../../sandbox//data/", // up 4 in tree, "/sandbox//data/" subdir + "../../../../sandbox//src/", // up 4 in tree, "/sandbox//src/" subdir + "../../../../sandbox//inc/", // up 4 in tree, "/sandbox//inc/" subdir + "../../../../0_Simple//data/", // up 4 in tree, "/0_Simple//" subdir + "../../../../1_Utilities//data/", // up 4 in tree, "/1_Utilities//" subdir + "../../../../2_Graphics//data/", // up 4 in tree, "/2_Graphics//" subdir + "../../../../3_Imaging//data/", // up 4 in tree, "/3_Imaging//" subdir + "../../../../4_Finance//data/", // up 4 in tree, "/4_Finance//" subdir + "../../../../5_Simulations//data/",// up 4 in tree, "/5_Simulations//" subdir + "../../../../6_Advanced//data/", // up 4 in tree, "/6_Advanced//" subdir + "../../../../7_CUDALibraries//data/", // up 4 in tree, "/7_CUDALibraries//" subdir + "../../../../8_Android//data/", // up 4 in tree, "/8_Android//" subdir + "../../../../0_Simple//", // up 4 in tree, "/0_Simple//" subdir + "../../../../1_Utilities//", // up 4 in tree, "/1_Utilities//" subdir + "../../../../2_Graphics//", // up 4 in tree, "/2_Graphics//" subdir + "../../../../3_Imaging//", // up 4 in tree, "/3_Imaging//" subdir + "../../../../4_Finance//", // up 4 in tree, "/4_Finance//" subdir + "../../../../5_Simulations//",// up 4 in tree, "/5_Simulations//" subdir + "../../../../6_Advanced//", // up 4 in tree, "/6_Advanced//" subdir + "../../../../7_CUDALibraries//", // up 4 in tree, "/7_CUDALibraries//" subdir + "../../../../8_Android//", // up 4 in tree, "/8_Android//" subdir + "../../../../samples//data/", // up 4 in tree, "/samples//" subdir + "../../../../common/", // up 4 in tree, "../../../common/" subdir + "../../../../common/data/", // up 4 in tree, "../../../common/data/" subdir + "../../../../data/", // up 4 in tree, "../../../data/" subdir + "../../../../../", // up 5 in tree + "../../../../../src//", // up 5 in tree, "/src//" subdir + "../../../../../src//data/", // up 5 in tree, "/src//data/" subdir + "../../../../../src//src/", // up 5 in tree, "/src//src/" subdir + "../../../../../src//inc/", // up 5 in tree, "/src//inc/" subdir + "../../../../../sandbox//", // up 5 in tree, "/sandbox//" subdir + "../../../../../sandbox//data/", // up 5 in tree, "/sandbox//data/" subdir + "../../../../../sandbox//src/", // up 5 in tree, "/sandbox//src/" subdir + "../../../../../sandbox//inc/", // up 5 in tree, "/sandbox//inc/" subdir + "../../../../../0_Simple//data/", // up 5 in tree, "/0_Simple//" subdir + "../../../../../1_Utilities//data/", // up 5 in tree, "/1_Utilities//" subdir + "../../../../../2_Graphics//data/", // up 5 in tree, "/2_Graphics//" subdir + "../../../../../3_Imaging//data/", // up 5 in tree, "/3_Imaging//" subdir + "../../../../../4_Finance//data/", // up 5 in tree, "/4_Finance//" subdir + "../../../../../5_Simulations//data/",// up 5 in tree, "/5_Simulations//" subdir + "../../../../../6_Advanced//data/", // up 5 in tree, "/6_Advanced//" subdir + "../../../../../7_CUDALibraries//data/", // up 5 in tree, "/7_CUDALibraries//" subdir + "../../../../../8_Android//data/", // up 5 in tree, "/8_Android//" subdir + "../../../../../samples//data/", // up 5 in tree, "/samples//" subdir + "../../../../../common/", // up 5 in tree, "../../../common/" subdir + "../../../../../common/data/", // up 5 in tree, "../../../common/data/" subdir + }; + + // Extract the executable name + std::string executable_name; + + if (executable_path != 0) + { + executable_name = std::string(executable_path); + +#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) + // Windows path delimiter + size_t delimiter_pos = executable_name.find_last_of('\\'); + executable_name.erase(0, delimiter_pos + 1); + + if (executable_name.rfind(".exe") != std::string::npos) + { + // we strip .exe, only if the .exe is found + executable_name.resize(executable_name.size() - 4); + } + +#else + // Linux & OSX path delimiter + size_t delimiter_pos = executable_name.find_last_of('/'); + executable_name.erase(0,delimiter_pos+1); +#endif + } + + // Loop over all search paths and return the first hit + for (unsigned int i = 0; i < sizeof(searchPath)/sizeof(char *); ++i) + { + std::string path(searchPath[i]); + size_t executable_name_pos = path.find(""); + + // If there is executable_name variable in the searchPath + // replace it with the value + if (executable_name_pos != std::string::npos) + { + if (executable_path != 0) + { + path.replace(executable_name_pos, strlen(""), executable_name); + } + else + { + // Skip this path entry if no executable argument is given + continue; + } + } + +#ifdef _DEBUG + printf("sdkFindFilePath <%s> in %s\n", filename, path.c_str()); +#endif + + // Test if the file exists + path.append(filename); + FILE *fp; + FOPEN(fp, path.c_str(), "rb"); + + if (fp != NULL) + { + fclose(fp); + // File found + // returning an allocated array here for backwards compatibility reasons + char *file_path = (char *) malloc(path.length() + 1); + STRCPY(file_path, path.length() + 1, path.c_str()); + return file_path; + } + + if (fp) + { + fclose(fp); + } + } + + // File not found + return 0; +} + +#endif diff --git a/src/algorithms/tracking/libs/cudahelpers/helper_timer.h b/src/algorithms/tracking/libs/cudahelpers/helper_timer.h new file mode 100644 index 000000000..39ddc77f2 --- /dev/null +++ b/src/algorithms/tracking/libs/cudahelpers/helper_timer.h @@ -0,0 +1,499 @@ +/** + * Copyright 1993-2013 NVIDIA Corporation. All rights reserved. + * + * Please refer to the NVIDIA end user license agreement (EULA) associated + * with this source code for terms and conditions that govern your use of + * this software. Any use, reproduction, disclosure, or distribution of + * this software and related documentation outside the terms of the EULA + * is strictly prohibited. + * + */ + +// Helper Timing Functions +#ifndef HELPER_TIMER_H +#define HELPER_TIMER_H + +#ifndef EXIT_WAIVED +#define EXIT_WAIVED 2 +#endif + +// includes, system +#include + +// includes, project +#include + +// Definition of the StopWatch Interface, this is used if we don't want to use the CUT functions +// But rather in a self contained class interface +class StopWatchInterface +{ + public: + StopWatchInterface() {}; + virtual ~StopWatchInterface() {}; + + public: + //! Start time measurement + virtual void start() = 0; + + //! Stop time measurement + virtual void stop() = 0; + + //! Reset time counters to zero + virtual void reset() = 0; + + //! Time in msec. after start. If the stop watch is still running (i.e. there + //! was no call to stop()) then the elapsed time is returned, otherwise the + //! time between the last start() and stop call is returned + virtual float getTime() = 0; + + //! Mean time to date based on the number of times the stopwatch has been + //! _stopped_ (ie finished sessions) and the current total time + virtual float getAverageTime() = 0; +}; + + +////////////////////////////////////////////////////////////////// +// Begin Stopwatch timer class definitions for all OS platforms // +////////////////////////////////////////////////////////////////// +#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) +// includes, system +#define WINDOWS_LEAN_AND_MEAN +#include +#undef min +#undef max + +//! Windows specific implementation of StopWatch +class StopWatchWin : public StopWatchInterface +{ + public: + //! Constructor, default + StopWatchWin() : + start_time(), end_time(), + diff_time(0.0f), total_time(0.0f), + running(false), clock_sessions(0), freq(0), freq_set(false) + { + if (! freq_set) + { + // helper variable + LARGE_INTEGER temp; + + // get the tick frequency from the OS + QueryPerformanceFrequency((LARGE_INTEGER *) &temp); + + // convert to type in which it is needed + freq = ((double) temp.QuadPart) / 1000.0; + + // rememeber query + freq_set = true; + } + }; + + // Destructor + ~StopWatchWin() { }; + + public: + //! Start time measurement + inline void start(); + + //! Stop time measurement + inline void stop(); + + //! Reset time counters to zero + inline void reset(); + + //! Time in msec. after start. If the stop watch is still running (i.e. there + //! was no call to stop()) then the elapsed time is returned, otherwise the + //! time between the last start() and stop call is returned + inline float getTime(); + + //! Mean time to date based on the number of times the stopwatch has been + //! _stopped_ (ie finished sessions) and the current total time + inline float getAverageTime(); + + private: + // member variables + + //! Start of measurement + LARGE_INTEGER start_time; + //! End of measurement + LARGE_INTEGER end_time; + + //! Time difference between the last start and stop + float diff_time; + + //! TOTAL time difference between starts and stops + float total_time; + + //! flag if the stop watch is running + bool running; + + //! Number of times clock has been started + //! and stopped to allow averaging + int clock_sessions; + + //! tick frequency + double freq; + + //! flag if the frequency has been set + bool freq_set; +}; + +// functions, inlined + +//////////////////////////////////////////////////////////////////////////////// +//! Start time measurement +//////////////////////////////////////////////////////////////////////////////// +inline void +StopWatchWin::start() +{ + QueryPerformanceCounter((LARGE_INTEGER *) &start_time); + running = true; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Stop time measurement and increment add to the current diff_time summation +//! variable. Also increment the number of times this clock has been run. +//////////////////////////////////////////////////////////////////////////////// +inline void +StopWatchWin::stop() +{ + QueryPerformanceCounter((LARGE_INTEGER *) &end_time); + diff_time = (float) + (((double) end_time.QuadPart - (double) start_time.QuadPart) / freq); + + total_time += diff_time; + clock_sessions++; + running = false; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Reset the timer to 0. Does not change the timer running state but does +//! recapture this point in time as the current start time if it is running. +//////////////////////////////////////////////////////////////////////////////// +inline void +StopWatchWin::reset() +{ + diff_time = 0; + total_time = 0; + clock_sessions = 0; + + if (running) + { + QueryPerformanceCounter((LARGE_INTEGER *) &start_time); + } +} + + +//////////////////////////////////////////////////////////////////////////////// +//! Time in msec. after start. If the stop watch is still running (i.e. there +//! was no call to stop()) then the elapsed time is returned added to the +//! current diff_time sum, otherwise the current summed time difference alone +//! is returned. +//////////////////////////////////////////////////////////////////////////////// +inline float +StopWatchWin::getTime() +{ + // Return the TOTAL time to date + float retval = total_time; + + if (running) + { + LARGE_INTEGER temp; + QueryPerformanceCounter((LARGE_INTEGER *) &temp); + retval += (float) + (((double)(temp.QuadPart - start_time.QuadPart)) / freq); + } + + return retval; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Time in msec. for a single run based on the total number of COMPLETED runs +//! and the total time. +//////////////////////////////////////////////////////////////////////////////// +inline float +StopWatchWin::getAverageTime() +{ + return (clock_sessions > 0) ? (total_time/clock_sessions) : 0.0f; +} +#else +// Declarations for Stopwatch on Linux and Mac OSX +// includes, system +#include +#include + +//! Windows specific implementation of StopWatch +class StopWatchLinux : public StopWatchInterface +{ + public: + //! Constructor, default + StopWatchLinux() : + start_time(), diff_time(0.0), total_time(0.0), + running(false), clock_sessions(0) + { }; + + // Destructor + virtual ~StopWatchLinux() + { }; + + public: + //! Start time measurement + inline void start(); + + //! Stop time measurement + inline void stop(); + + //! Reset time counters to zero + inline void reset(); + + //! Time in msec. after start. If the stop watch is still running (i.e. there + //! was no call to stop()) then the elapsed time is returned, otherwise the + //! time between the last start() and stop call is returned + inline float getTime(); + + //! Mean time to date based on the number of times the stopwatch has been + //! _stopped_ (ie finished sessions) and the current total time + inline float getAverageTime(); + + private: + + // helper functions + + //! Get difference between start time and current time + inline float getDiffTime(); + + private: + + // member variables + + //! Start of measurement + struct timeval start_time; + + //! Time difference between the last start and stop + float diff_time; + + //! TOTAL time difference between starts and stops + float total_time; + + //! flag if the stop watch is running + bool running; + + //! Number of times clock has been started + //! and stopped to allow averaging + int clock_sessions; +}; + +// functions, inlined + +//////////////////////////////////////////////////////////////////////////////// +//! Start time measurement +//////////////////////////////////////////////////////////////////////////////// +inline void +StopWatchLinux::start() +{ + gettimeofday(&start_time, 0); + running = true; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Stop time measurement and increment add to the current diff_time summation +//! variable. Also increment the number of times this clock has been run. +//////////////////////////////////////////////////////////////////////////////// +inline void +StopWatchLinux::stop() +{ + diff_time = getDiffTime(); + total_time += diff_time; + running = false; + clock_sessions++; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Reset the timer to 0. Does not change the timer running state but does +//! recapture this point in time as the current start time if it is running. +//////////////////////////////////////////////////////////////////////////////// +inline void +StopWatchLinux::reset() +{ + diff_time = 0; + total_time = 0; + clock_sessions = 0; + + if (running) + { + gettimeofday(&start_time, 0); + } +} + +//////////////////////////////////////////////////////////////////////////////// +//! Time in msec. after start. If the stop watch is still running (i.e. there +//! was no call to stop()) then the elapsed time is returned added to the +//! current diff_time sum, otherwise the current summed time difference alone +//! is returned. +//////////////////////////////////////////////////////////////////////////////// +inline float +StopWatchLinux::getTime() +{ + // Return the TOTAL time to date + float retval = total_time; + + if (running) + { + retval += getDiffTime(); + } + + return retval; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Time in msec. for a single run based on the total number of COMPLETED runs +//! and the total time. +//////////////////////////////////////////////////////////////////////////////// +inline float +StopWatchLinux::getAverageTime() +{ + return (clock_sessions > 0) ? (total_time/clock_sessions) : 0.0f; +} +//////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////// +inline float +StopWatchLinux::getDiffTime() +{ + struct timeval t_time; + gettimeofday(&t_time, 0); + + // time difference in milli-seconds + return (float)(1000.0 * (t_time.tv_sec - start_time.tv_sec) + + (0.001 * (t_time.tv_usec - start_time.tv_usec))); +} +#endif // WIN32 + +//////////////////////////////////////////////////////////////////////////////// +//! Timer functionality exported + +//////////////////////////////////////////////////////////////////////////////// +//! Create a new timer +//! @return true if a time has been created, otherwise false +//! @param name of the new timer, 0 if the creation failed +//////////////////////////////////////////////////////////////////////////////// +inline bool +sdkCreateTimer(StopWatchInterface **timer_interface) +{ + //printf("sdkCreateTimer called object %08x\n", (void *)*timer_interface); +#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) + *timer_interface = (StopWatchInterface *)new StopWatchWin(); +#else + *timer_interface = (StopWatchInterface *)new StopWatchLinux(); +#endif + return (*timer_interface != NULL) ? true : false; +} + + +//////////////////////////////////////////////////////////////////////////////// +//! Delete a timer +//! @return true if a time has been deleted, otherwise false +//! @param name of the timer to delete +//////////////////////////////////////////////////////////////////////////////// +inline bool +sdkDeleteTimer(StopWatchInterface **timer_interface) +{ + //printf("sdkDeleteTimer called object %08x\n", (void *)*timer_interface); + if (*timer_interface) + { + delete *timer_interface; + *timer_interface = NULL; + } + + return true; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Start the time with name \a name +//! @param name name of the timer to start +//////////////////////////////////////////////////////////////////////////////// +inline bool +sdkStartTimer(StopWatchInterface **timer_interface) +{ + //printf("sdkStartTimer called object %08x\n", (void *)*timer_interface); + if (*timer_interface) + { + (*timer_interface)->start(); + } + + return true; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Stop the time with name \a name. Does not reset. +//! @param name name of the timer to stop +//////////////////////////////////////////////////////////////////////////////// +inline bool +sdkStopTimer(StopWatchInterface **timer_interface) +{ + // printf("sdkStopTimer called object %08x\n", (void *)*timer_interface); + if (*timer_interface) + { + (*timer_interface)->stop(); + } + + return true; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Resets the timer's counter. +//! @param name name of the timer to reset. +//////////////////////////////////////////////////////////////////////////////// +inline bool +sdkResetTimer(StopWatchInterface **timer_interface) +{ + // printf("sdkResetTimer called object %08x\n", (void *)*timer_interface); + if (*timer_interface) + { + (*timer_interface)->reset(); + } + + return true; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Return the average time for timer execution as the total time +//! for the timer dividied by the number of completed (stopped) runs the timer +//! has made. +//! Excludes the current running time if the timer is currently running. +//! @param name name of the timer to return the time of +//////////////////////////////////////////////////////////////////////////////// +inline float +sdkGetAverageTimerValue(StopWatchInterface **timer_interface) +{ + // printf("sdkGetAverageTimerValue called object %08x\n", (void *)*timer_interface); + if (*timer_interface) + { + return (*timer_interface)->getAverageTime(); + } + else + { + return 0.0f; + } +} + +//////////////////////////////////////////////////////////////////////////////// +//! Total execution time for the timer over all runs since the last reset +//! or timer creation. +//! @param name name of the timer to obtain the value of. +//////////////////////////////////////////////////////////////////////////////// +inline float +sdkGetTimerValue(StopWatchInterface **timer_interface) +{ + // printf("sdkGetTimerValue called object %08x\n", (void *)*timer_interface); + if (*timer_interface) + { + return (*timer_interface)->getTime(); + } + else + { + return 0.0f; + } +} + +#endif // HELPER_TIMER_H From f722f5f8f779cdfa02519a7dd83b89b3aef375ac Mon Sep 17 00:00:00 2001 From: Javier Arribas Date: Thu, 23 Jul 2015 18:07:52 +0200 Subject: [PATCH 2/5] Added GPU tracking block for GPS L1 CA. Not activable yet... --- .../tracking/gnuradio_blocks/CMakeLists.txt | 8 + .../gps_l1_ca_dll_pll_tracking_gpu_cc.cc | 670 ++++++++++++++++++ .../gps_l1_ca_dll_pll_tracking_gpu_cc.h | 191 +++++ src/algorithms/tracking/libs/CMakeLists.txt | 1 + 4 files changed, 870 insertions(+) create mode 100644 src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_gpu_cc.cc create mode 100644 src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_gpu_cc.h diff --git a/src/algorithms/tracking/gnuradio_blocks/CMakeLists.txt b/src/algorithms/tracking/gnuradio_blocks/CMakeLists.txt index 268e1f9b5..d300a9edd 100644 --- a/src/algorithms/tracking/gnuradio_blocks/CMakeLists.txt +++ b/src/algorithms/tracking/gnuradio_blocks/CMakeLists.txt @@ -16,6 +16,11 @@ # along with GNSS-SDR. If not, see . # + +if(ENABLE_CUDA) + FIND_PACKAGE(CUDA REQUIRED) +endif(ENABLE_CUDA) + set(TRACKING_GR_BLOCKS_SOURCES galileo_e1_dll_pll_veml_tracking_cc.cc galileo_volk_e1_dll_pll_veml_tracking_cc.cc @@ -26,6 +31,7 @@ set(TRACKING_GR_BLOCKS_SOURCES gps_l1_ca_tcp_connector_tracking_cc.cc galileo_e5a_dll_pll_tracking_cc.cc gps_l2_m_dll_pll_tracking_cc.cc + gps_l1_ca_dll_pll_tracking_gpu_cc.cc ) include_directories( @@ -40,6 +46,8 @@ include_directories( ${Boost_INCLUDE_DIRS} ${GNURADIO_RUNTIME_INCLUDE_DIRS} ${VOLK_GNSSSDR_INCLUDE_DIRS} + ${CUDA_INCLUDE_DIRS} + ${CMAKE_SOURCE_DIR}/src/algorithms/tracking/libs/cudahelpers ) if(ENABLE_GENERIC_ARCH) diff --git a/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_gpu_cc.cc b/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_gpu_cc.cc new file mode 100644 index 000000000..b443aebf5 --- /dev/null +++ b/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_gpu_cc.cc @@ -0,0 +1,670 @@ +/*! + * \file gps_l1_ca_dll_pll_tracking_gpu_cc.cc + * \brief Implementation of a code DLL + carrier PLL tracking block, GPU ACCELERATED + * \author Javier Arribas, 2015. jarribas(at)cttc.es + * + * Code DLL + carrier PLL according to the algorithms described in: + * [1] K.Borre, D.M.Akos, N.Bertelsen, P.Rinder, and S.H.Jensen, + * A Software-Defined GPS and Galileo Receiver. A Single-Frequency + * Approach, Birkhauser, 2007 + * + * ------------------------------------------------------------------------- + * + * Copyright (C) 2010-2015 (see AUTHORS file for a list of contributors) + * + * GNSS-SDR is a software defined Global Navigation + * Satellite Systems receiver + * + * This file is part of GNSS-SDR. + * + * GNSS-SDR is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * GNSS-SDR is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNSS-SDR. If not, see . + * + * ------------------------------------------------------------------------- + */ + +#include "gps_l1_ca_dll_pll_tracking_gpu_cc.h" +#include +#include +#include +#include +#include +#include +#include +#include "gnss_synchro.h" +#include "gps_sdr_signal_processing.h" +#include "tracking_discriminators.h" +#include "lock_detectors.h" +#include "GPS_L1_CA.h" +#include "control_message_factory.h" +#include //volk_alignement + +#include +// CUDA runtime +#include +// includes +#include +#include // helper for shared functions common to CUDA Samples +#include // helper functions for CUDA error checking and initialization + + + +/*! + * \todo Include in definition header file + */ +#define CN0_ESTIMATION_SAMPLES 20 +#define MINIMUM_VALID_CN0 25 +#define MAXIMUM_LOCK_FAIL_COUNTER 50 +#define CARRIER_LOCK_THRESHOLD 0.85 + + +using google::LogMessage; + +gps_l1_ca_dll_pll_tracking_gpu_cc_sptr +gps_l1_ca_dll_pll_make_tracking_gpu_cc( + long if_freq, + long fs_in, + unsigned int vector_length, + boost::shared_ptr queue, + bool dump, + std::string dump_filename, + float pll_bw_hz, + float dll_bw_hz, + float early_late_space_chips) +{ + return gps_l1_ca_dll_pll_tracking_gpu_cc_sptr(new Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc(if_freq, + fs_in, vector_length, queue, dump, dump_filename, pll_bw_hz, dll_bw_hz, early_late_space_chips)); +} + + + +void Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::forecast (int noutput_items, + gr_vector_int &ninput_items_required) +{ + ninput_items_required[0] = static_cast(d_vector_length) * 2; //set the required available samples in each call +} + + + +Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc( + long if_freq, + long fs_in, + unsigned int vector_length, + boost::shared_ptr queue, + bool dump, + std::string dump_filename, + float pll_bw_hz, + float dll_bw_hz, + float early_late_space_chips) : + gr::block("Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc", gr::io_signature::make(1, 1, sizeof(gr_complex)), + gr::io_signature::make(1, 1, sizeof(Gnss_Synchro))) +{ + // initialize internal vars + d_queue = queue; + d_dump = dump; + d_if_freq = if_freq; + d_fs_in = fs_in; + d_vector_length = vector_length; + d_dump_filename = dump_filename; + + // Initialize tracking ========================================== + d_code_loop_filter.set_DLL_BW(dll_bw_hz); + d_carrier_loop_filter.set_PLL_BW(pll_bw_hz); + + //--- DLL variables -------------------------------------------------------- + d_early_late_spc_chips = early_late_space_chips; // Define early-late offset (in chips) + + // Initialization of local code replica + // Get space for a vector with the C/A code replica sampled 1x/chip + d_ca_code = static_cast(volk_malloc((GPS_L1_CA_CODE_LENGTH_CHIPS + 2) * sizeof(gr_complex), volk_get_alignment())); + + + multicorrelator_gpu = new cuda_multicorrelator(); + // Get space for the resampled early / prompt / late local replicas + int N_CORRELATORS=3; + checkCudaErrors(cudaHostAlloc((void**)&d_local_code_shift_samples, N_CORRELATORS * sizeof(int), cudaHostAllocMapped )); + + + //allocate host memory + //pinned memory mode - use special function to get OS-pinned memory + checkCudaErrors(cudaHostAlloc((void**)&in_gpu, 2 * d_vector_length * sizeof(gr_complex), cudaHostAllocMapped )); + + //old local codes vector + //checkCudaErrors(cudaHostAlloc((void**)&d_local_codes_gpu, (V_LEN * sizeof(gr_complex))*N_CORRELATORS, cudaHostAllocWriteCombined )); + + //new integrated shifts + checkCudaErrors(cudaHostAlloc((void**)&d_local_codes_gpu, (2 * d_vector_length * sizeof(gr_complex)), cudaHostAllocWriteCombined )); + + // correlator outputs (scalar) + checkCudaErrors(cudaHostAlloc((void**)&d_corr_outs_gpu ,sizeof(gr_complex)*N_CORRELATORS, cudaHostAllocWriteCombined )); + //map to EPL pointers + d_Early = &d_corr_outs_gpu[0]; + d_Prompt = &d_corr_outs_gpu[1]; + d_Late = &d_corr_outs_gpu[2]; + + //--- Perform initializations ------------------------------ + // define initial code frequency basis of NCO + d_code_freq_chips = GPS_L1_CA_CODE_RATE_HZ; + // define residual code phase (in chips) + d_rem_code_phase_samples = 0.0; + // define residual carrier phase + d_rem_carr_phase_rad = 0.0; + + // sample synchronization + d_sample_counter = 0; + //d_sample_counter_seconds = 0; + d_acq_sample_stamp = 0; + + d_enable_tracking = false; + d_pull_in = false; + d_last_seg = 0; + + d_current_prn_length_samples = static_cast(d_vector_length); + + // CN0 estimation and lock detector buffers + d_cn0_estimation_counter = 0; + d_Prompt_buffer = new gr_complex[CN0_ESTIMATION_SAMPLES]; + d_carrier_lock_test = 1; + d_CN0_SNV_dB_Hz = 0; + d_carrier_lock_fail_counter = 0; + d_carrier_lock_threshold = CARRIER_LOCK_THRESHOLD; + + systemName["G"] = std::string("GPS"); + systemName["S"] = std::string("SBAS"); + + + set_relative_rate(1.0/((double)d_vector_length*2)); + + d_channel_internal_queue = 0; + d_acquisition_gnss_synchro = 0; + d_channel = 0; + d_acq_code_phase_samples = 0.0; + d_acq_carrier_doppler_hz = 0.0; + d_carrier_doppler_hz = 0.0; + d_acc_carrier_phase_rad = 0.0; + d_code_phase_samples = 0.0; + d_acc_code_phase_secs = 0.0; + //set_min_output_buffer((long int)300); +} + + +void Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::start_tracking() +{ + /* + * correct the code phase according to the delay between acq and trk + */ + d_acq_code_phase_samples = d_acquisition_gnss_synchro->Acq_delay_samples; + d_acq_carrier_doppler_hz = d_acquisition_gnss_synchro->Acq_doppler_hz; + d_acq_sample_stamp = d_acquisition_gnss_synchro->Acq_samplestamp_samples; + + long int acq_trk_diff_samples; + float acq_trk_diff_seconds; + acq_trk_diff_samples = static_cast(d_sample_counter) - static_cast(d_acq_sample_stamp);//-d_vector_length; + DLOG(INFO) << "Number of samples between Acquisition and Tracking =" << acq_trk_diff_samples; + acq_trk_diff_seconds = static_cast(acq_trk_diff_samples) / static_cast(d_fs_in); + //doppler effect + // Fd=(C/(C+Vr))*F + float radial_velocity = (GPS_L1_FREQ_HZ + d_acq_carrier_doppler_hz) / GPS_L1_FREQ_HZ; + // new chip and prn sequence periods based on acq Doppler + float T_chip_mod_seconds; + float T_prn_mod_seconds; + float T_prn_mod_samples; + d_code_freq_chips = radial_velocity * GPS_L1_CA_CODE_RATE_HZ; + T_chip_mod_seconds = 1/d_code_freq_chips; + T_prn_mod_seconds = T_chip_mod_seconds * GPS_L1_CA_CODE_LENGTH_CHIPS; + T_prn_mod_samples = T_prn_mod_seconds * static_cast(d_fs_in); + + d_current_prn_length_samples = round(T_prn_mod_samples); + + float T_prn_true_seconds = GPS_L1_CA_CODE_LENGTH_CHIPS / GPS_L1_CA_CODE_RATE_HZ; + float T_prn_true_samples = T_prn_true_seconds * static_cast(d_fs_in); + float T_prn_diff_seconds= T_prn_true_seconds - T_prn_mod_seconds; + float N_prn_diff = acq_trk_diff_seconds / T_prn_true_seconds; + float corrected_acq_phase_samples, delay_correction_samples; + corrected_acq_phase_samples = fmod((d_acq_code_phase_samples + T_prn_diff_seconds * N_prn_diff * static_cast(d_fs_in)), T_prn_true_samples); + if (corrected_acq_phase_samples < 0) + { + corrected_acq_phase_samples = T_prn_mod_samples + corrected_acq_phase_samples; + } + delay_correction_samples = d_acq_code_phase_samples - corrected_acq_phase_samples; + + d_acq_code_phase_samples = corrected_acq_phase_samples; + + d_carrier_doppler_hz = d_acq_carrier_doppler_hz; + + // DLL/PLL filter initialization + d_carrier_loop_filter.initialize(); // initialize the carrier filter + d_code_loop_filter.initialize(); // initialize the code filter + + // generate local reference ALWAYS starting at chip 1 (1 sample per chip) + gps_l1_ca_code_gen_complex(&d_ca_code[1], d_acquisition_gnss_synchro->PRN, 0); + d_ca_code[0] = d_ca_code[static_cast(GPS_L1_CA_CODE_LENGTH_CHIPS)]; + d_ca_code[static_cast(GPS_L1_CA_CODE_LENGTH_CHIPS) + 1] = d_ca_code[1]; + + d_carrier_lock_fail_counter = 0; + d_rem_code_phase_samples = 0; + d_rem_carr_phase_rad = 0; + d_acc_carrier_phase_rad = 0; + d_acc_code_phase_secs = 0; + + d_code_phase_samples = d_acq_code_phase_samples; + + std::string sys_ = &d_acquisition_gnss_synchro->System; + sys = sys_.substr(0,1); + + // DEBUG OUTPUT + std::cout << "Tracking start on channel " << d_channel << " for satellite " << Gnss_Satellite(systemName[sys], d_acquisition_gnss_synchro->PRN) << std::endl; + LOG(INFO) << "Starting tracking of satellite " << Gnss_Satellite(systemName[sys], d_acquisition_gnss_synchro->PRN) << " on channel " << d_channel; + + + // enable tracking + d_pull_in = true; + d_enable_tracking = true; + + LOG(INFO) << "PULL-IN Doppler [Hz]=" << d_carrier_doppler_hz + << " Code Phase correction [samples]=" << delay_correction_samples + << " PULL-IN Code Phase [samples]=" << d_acq_code_phase_samples; +} + + + + + +void Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::update_local_code() +{ + double tcode_chips; + double rem_code_phase_chips; + int associated_chip_index; + int code_length_chips = static_cast(GPS_L1_CA_CODE_LENGTH_CHIPS); + double code_phase_step_chips; + int epl_loop_length_samples; + + // unified loop for E, P, L code vectors + code_phase_step_chips = static_cast(d_code_freq_chips) / static_cast(d_fs_in); + rem_code_phase_chips = d_rem_code_phase_samples * (d_code_freq_chips / d_fs_in); + tcode_chips = -rem_code_phase_chips; + + // Alternative EPL code generation (40% of speed improvement!) + d_local_code_shift_samples[0]=0; + d_local_code_shift_samples[1]=round(d_early_late_spc_chips / code_phase_step_chips); + d_local_code_shift_samples[2]=round((2*d_early_late_spc_chips) / code_phase_step_chips); + + epl_loop_length_samples = d_current_prn_length_samples + d_local_code_shift_samples[2]; //maximum length + + for (int i = 0; i < epl_loop_length_samples; i++) + { + associated_chip_index = 1 + round(fmod(tcode_chips - d_early_late_spc_chips, code_length_chips)); + d_local_codes_gpu[i] = d_ca_code[associated_chip_index]; + tcode_chips = tcode_chips + code_phase_step_chips; + } + +} + + +Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::~Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc() +{ + d_dump_file.close(); + + cudaFreeHost(in_gpu); + cudaFreeHost(d_carr_sign_gpu); + cudaFreeHost(d_corr_outs_gpu); + cudaFreeHost(d_local_codes_gpu); + + multicorrelator_gpu->free_cuda(); + delete(multicorrelator_gpu); + + volk_free(d_Early); + volk_free(d_Prompt); + volk_free(d_Late); + volk_free(d_ca_code); + + delete[] d_Prompt_buffer; +} + + + +int Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::general_work (int noutput_items, gr_vector_int &ninput_items, + gr_vector_const_void_star &input_items, gr_vector_void_star &output_items) +{ + // process vars + float carr_error_hz; + float carr_error_filt_hz; + float code_error_chips; + float code_error_filt_chips; + + // Block input data and block output stream pointers + const gr_complex* in = (gr_complex*) input_items[0]; + Gnss_Synchro **out = (Gnss_Synchro **) &output_items[0]; + + // GNSS_SYNCHRO OBJECT to interchange data between tracking->telemetry_decoder + Gnss_Synchro current_synchro_data = Gnss_Synchro(); + + + if (d_enable_tracking == true) + { + // Receiver signal alignment + if (d_pull_in == true) + { + int samples_offset; + float acq_trk_shif_correction_samples; + int acq_to_trk_delay_samples; + acq_to_trk_delay_samples = d_sample_counter - d_acq_sample_stamp; + acq_trk_shif_correction_samples = d_current_prn_length_samples - fmod(static_cast(acq_to_trk_delay_samples), static_cast(d_current_prn_length_samples)); + samples_offset = round(d_acq_code_phase_samples + acq_trk_shif_correction_samples); + // /todo: Check if the sample counter sent to the next block as a time reference should be incremented AFTER sended or BEFORE + //d_sample_counter_seconds = d_sample_counter_seconds + (((double)samples_offset) / static_cast(d_fs_in)); + d_sample_counter = d_sample_counter + samples_offset; //count for the processed samples + d_pull_in = false; + //std::cout<<" samples_offset="<(GPS_TWO_PI) * d_carrier_doppler_hz / static_cast(d_fs_in); + + // perform carrier wipe-off and compute Early, Prompt and Late correlation + multicorrelator_gpu->Carrier_wipeoff_multicorrelator_cuda( + d_corr_outs_gpu, + in,//in_gpu, + d_local_codes_gpu, + d_rem_carr_phase_rad, + phase_step_rad, + d_local_code_shift_samples, + d_current_prn_length_samples, + 3); + + // check for samples consistency (this should be done before in the receiver / here only if the source is a file) + if (std::isnan((*d_Prompt).real()) == true or std::isnan((*d_Prompt).imag()) == true ) // or std::isinf(in[i].real())==true or std::isinf(in[i].imag())==true) + { + const int samples_available = ninput_items[0]; + d_sample_counter = d_sample_counter + samples_available; + LOG(WARNING) << "Detected NaN samples at sample number " << d_sample_counter; + consume_each(samples_available); + + // make an output to not stop the rest of the processing blocks + current_synchro_data.Prompt_I = 0.0; + current_synchro_data.Prompt_Q = 0.0; + current_synchro_data.Tracking_timestamp_secs = static_cast(d_sample_counter) / static_cast(d_fs_in); + current_synchro_data.Carrier_phase_rads = 0.0; + current_synchro_data.Code_phase_secs = 0.0; + current_synchro_data.CN0_dB_hz = 0.0; + current_synchro_data.Flag_valid_tracking = false; + current_synchro_data.Flag_valid_pseudorange = false; + + *out[0] = current_synchro_data; + return 1; + } + + // ################## PLL ########################################################## + // PLL discriminator + carr_error_hz = pll_cloop_two_quadrant_atan(*d_Prompt) / static_cast(GPS_TWO_PI); + // Carrier discriminator filter + carr_error_filt_hz = d_carrier_loop_filter.get_carrier_nco(carr_error_hz); + // New carrier Doppler frequency estimation + d_carrier_doppler_hz = d_acq_carrier_doppler_hz + carr_error_filt_hz; + // New code Doppler frequency estimation + d_code_freq_chips = GPS_L1_CA_CODE_RATE_HZ + ((d_carrier_doppler_hz * GPS_L1_CA_CODE_RATE_HZ) / GPS_L1_FREQ_HZ); + //carrier phase accumulator for (K) doppler estimation + d_acc_carrier_phase_rad = d_acc_carrier_phase_rad + GPS_TWO_PI * d_carrier_doppler_hz * GPS_L1_CA_CODE_PERIOD; + //remanent carrier phase to prevent overflow in the code NCO + d_rem_carr_phase_rad = d_rem_carr_phase_rad + GPS_TWO_PI * d_carrier_doppler_hz * GPS_L1_CA_CODE_PERIOD; + d_rem_carr_phase_rad = fmod(d_rem_carr_phase_rad, GPS_TWO_PI); + + // ################## DLL ########################################################## + // DLL discriminator + code_error_chips = dll_nc_e_minus_l_normalized(*d_Early, *d_Late); //[chips/Ti] + // Code discriminator filter + code_error_filt_chips = d_code_loop_filter.get_code_nco(code_error_chips); //[chips/second] + //Code phase accumulator + float code_error_filt_secs; + code_error_filt_secs = (GPS_L1_CA_CODE_PERIOD * code_error_filt_chips) / GPS_L1_CA_CODE_RATE_HZ; //[seconds] + d_acc_code_phase_secs = d_acc_code_phase_secs + code_error_filt_secs; + + // ################## CARRIER AND CODE NCO BUFFER ALIGNEMENT ####################### + // keep alignment parameters for the next input buffer + double T_chip_seconds; + double T_prn_seconds; + double T_prn_samples; + double K_blk_samples; + // Compute the next buffer length based in the new period of the PRN sequence and the code phase error estimation + T_chip_seconds = 1 / static_cast(d_code_freq_chips); + T_prn_seconds = T_chip_seconds * GPS_L1_CA_CODE_LENGTH_CHIPS; + T_prn_samples = T_prn_seconds * static_cast(d_fs_in); + K_blk_samples = T_prn_samples + d_rem_code_phase_samples + code_error_filt_secs * static_cast(d_fs_in); + d_current_prn_length_samples = round(K_blk_samples); //round to a discrete samples + //d_rem_code_phase_samples = K_blk_samples - d_current_prn_length_samples; //rounding error < 1 sample + + // ####### CN0 ESTIMATION AND LOCK DETECTORS ###### + if (d_cn0_estimation_counter < CN0_ESTIMATION_SAMPLES) + { + // fill buffer with prompt correlator output values + d_Prompt_buffer[d_cn0_estimation_counter] = *d_Prompt; + d_cn0_estimation_counter++; + } + else + { + d_cn0_estimation_counter = 0; + // Code lock indicator + d_CN0_SNV_dB_Hz = cn0_svn_estimator(d_Prompt_buffer, CN0_ESTIMATION_SAMPLES, d_fs_in, GPS_L1_CA_CODE_LENGTH_CHIPS); + // Carrier lock indicator + d_carrier_lock_test = carrier_lock_detector(d_Prompt_buffer, CN0_ESTIMATION_SAMPLES); + // Loss of lock detection + if (d_carrier_lock_test < d_carrier_lock_threshold or d_CN0_SNV_dB_Hz < MINIMUM_VALID_CN0) + { + d_carrier_lock_fail_counter++; + } + else + { + if (d_carrier_lock_fail_counter > 0) d_carrier_lock_fail_counter--; + } + if (d_carrier_lock_fail_counter > MAXIMUM_LOCK_FAIL_COUNTER) + { + std::cout << "Loss of lock in channel " << d_channel << "!" << std::endl; + LOG(INFO) << "Loss of lock in channel " << d_channel << "!"; + std::unique_ptr cmf(new ControlMessageFactory()); + if (d_queue != gr::msg_queue::sptr()) + { + d_queue->handle(cmf->GetQueueMessage(d_channel, 2)); + } + d_carrier_lock_fail_counter = 0; + d_enable_tracking = false; // TODO: check if disabling tracking is consistent with the channel state machine + } + } + // ########### Output the tracking data to navigation and PVT ########## + current_synchro_data.Prompt_I = static_cast((*d_Prompt).real()); + current_synchro_data.Prompt_Q = static_cast((*d_Prompt).imag()); + + // Tracking_timestamp_secs is aligned with the NEXT PRN start sample (Hybridization problem!) + //compute remnant code phase samples BEFORE the Tracking timestamp + //d_rem_code_phase_samples = K_blk_samples - d_current_prn_length_samples; //rounding error < 1 sample + //current_synchro_data.Tracking_timestamp_secs = ((double)d_sample_counter + (double)d_current_prn_length_samples + (double)d_rem_code_phase_samples)/static_cast(d_fs_in); + + // Tracking_timestamp_secs is aligned with the CURRENT PRN start sample (Hybridization OK!, but some glitches??) + current_synchro_data.Tracking_timestamp_secs = (static_cast(d_sample_counter) + static_cast(d_rem_code_phase_samples)) / static_cast(d_fs_in); + //compute remnant code phase samples AFTER the Tracking timestamp + d_rem_code_phase_samples = K_blk_samples - d_current_prn_length_samples; //rounding error < 1 sample + + //current_synchro_data.Tracking_timestamp_secs = ((double)d_sample_counter)/static_cast(d_fs_in); + // This tracking block aligns the Tracking_timestamp_secs with the start sample of the PRN, thus, Code_phase_secs=0 + current_synchro_data.Code_phase_secs = 0; + current_synchro_data.Carrier_phase_rads = static_cast(d_acc_carrier_phase_rad); + current_synchro_data.Carrier_Doppler_hz = static_cast(d_carrier_doppler_hz); + current_synchro_data.CN0_dB_hz = static_cast(d_CN0_SNV_dB_Hz); + current_synchro_data.Flag_valid_pseudorange = false; + *out[0] = current_synchro_data; + + // ########## DEBUG OUTPUT + /*! + * \todo The stop timer has to be moved to the signal source! + */ + // debug: Second counter in channel 0 + if (d_channel == 0) + { + if (floor(d_sample_counter / d_fs_in) != d_last_seg) + { + d_last_seg = floor(d_sample_counter / d_fs_in); + std::cout << "Current input signal time = " << d_last_seg << " [s]" << std::endl; + DLOG(INFO) << "GPS L1 C/A Tracking CH " << d_channel << ": Satellite " << Gnss_Satellite(systemName[sys], d_acquisition_gnss_synchro->PRN) + << ", CN0 = " << d_CN0_SNV_dB_Hz << " [dB-Hz]" << std::endl; + //if (d_last_seg==5) d_carrier_lock_fail_counter=500; //DEBUG: force unlock! + } + } + else + { + if (floor(d_sample_counter / d_fs_in) != d_last_seg) + { + d_last_seg = floor(d_sample_counter / d_fs_in); + DLOG(INFO) << "Tracking CH " << d_channel << ": Satellite " << Gnss_Satellite(systemName[sys], d_acquisition_gnss_synchro->PRN) + << ", CN0 = " << d_CN0_SNV_dB_Hz << " [dB-Hz]"; + } + } + } + else + { + // ########## DEBUG OUTPUT (TIME ONLY for channel 0 when tracking is disabled) + /*! + * \todo The stop timer has to be moved to the signal source! + */ + // stream to collect cout calls to improve thread safety + std::stringstream tmp_str_stream; + if (floor(d_sample_counter / d_fs_in) != d_last_seg) + { + d_last_seg = floor(d_sample_counter / d_fs_in); + + if (d_channel == 0) + { + // debug: Second counter in channel 0 + tmp_str_stream << "Current input signal time = " << d_last_seg << " [s]" << std::endl << std::flush; + std::cout << tmp_str_stream.rdbuf() << std::flush; + } + } + *d_Early = gr_complex(0,0); + *d_Prompt = gr_complex(0,0); + *d_Late = gr_complex(0,0); + + current_synchro_data.System = {'G'}; + current_synchro_data.Flag_valid_pseudorange = false; + *out[0] = current_synchro_data; + } + + if(d_dump) + { + // MULTIPLEXED FILE RECORDING - Record results to file + float prompt_I; + float prompt_Q; + float tmp_E, tmp_P, tmp_L; + float tmp_float; + double tmp_double; + prompt_I = (*d_Prompt).real(); + prompt_Q = (*d_Prompt).imag(); + tmp_E = std::abs(*d_Early); + tmp_P = std::abs(*d_Prompt); + tmp_L = std::abs(*d_Late); + try + { + // EPR + d_dump_file.write(reinterpret_cast(&tmp_E), sizeof(float)); + d_dump_file.write(reinterpret_cast(&tmp_P), sizeof(float)); + d_dump_file.write(reinterpret_cast(&tmp_L), sizeof(float)); + // PROMPT I and Q (to analyze navigation symbols) + d_dump_file.write(reinterpret_cast(&prompt_I), sizeof(float)); + d_dump_file.write(reinterpret_cast(&prompt_Q), sizeof(float)); + // PRN start sample stamp + //tmp_float=(float)d_sample_counter; + d_dump_file.write(reinterpret_cast(&d_sample_counter), sizeof(unsigned long int)); + // accumulated carrier phase + d_dump_file.write(reinterpret_cast(&d_acc_carrier_phase_rad), sizeof(float)); + + // carrier and code frequency + d_dump_file.write(reinterpret_cast(&d_carrier_doppler_hz), sizeof(float)); + d_dump_file.write(reinterpret_cast(&d_code_freq_chips), sizeof(float)); + + //PLL commands + d_dump_file.write(reinterpret_cast(&carr_error_hz), sizeof(float)); + d_dump_file.write(reinterpret_cast(&carr_error_filt_hz), sizeof(float)); + + //DLL commands + d_dump_file.write(reinterpret_cast(&code_error_chips), sizeof(float)); + d_dump_file.write(reinterpret_cast(&code_error_filt_chips), sizeof(float)); + + // CN0 and carrier lock test + d_dump_file.write(reinterpret_cast(&d_CN0_SNV_dB_Hz), sizeof(float)); + d_dump_file.write(reinterpret_cast(&d_carrier_lock_test), sizeof(float)); + + // AUX vars (for debug purposes) + tmp_float = d_rem_code_phase_samples; + d_dump_file.write(reinterpret_cast(&tmp_float), sizeof(float)); + tmp_double = static_cast(d_sample_counter + d_current_prn_length_samples); + d_dump_file.write(reinterpret_cast(&tmp_double), sizeof(double)); + } + catch (std::ifstream::failure e) + { + LOG(WARNING) << "Exception writing trk dump file " << e.what(); + } + } + + consume_each(d_current_prn_length_samples); // this is necessary in gr::block derivates + d_sample_counter += d_current_prn_length_samples; //count for the processed samples + //LOG(INFO)<<"GPS tracking output end on CH="<d_channel << " SAMPLE STAMP="<(d_channel)); + d_dump_filename.append(".dat"); + d_dump_file.exceptions (std::ifstream::failbit | std::ifstream::badbit); + d_dump_file.open(d_dump_filename.c_str(), std::ios::out | std::ios::binary); + LOG(INFO) << "Tracking dump enabled on channel " << d_channel << " Log file: " << d_dump_filename.c_str() << std::endl; + } + catch (std::ifstream::failure e) + { + LOG(WARNING) << "channel " << d_channel << " Exception opening trk dump file " << e.what() << std::endl; + } + } + } +} + + + +void Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::set_channel_queue(concurrent_queue *channel_internal_queue) +{ + d_channel_internal_queue = channel_internal_queue; +} + + +void Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::set_gnss_synchro(Gnss_Synchro* p_gnss_synchro) +{ + d_acquisition_gnss_synchro = p_gnss_synchro; +} diff --git a/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_gpu_cc.h b/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_gpu_cc.h new file mode 100644 index 000000000..cf166fcff --- /dev/null +++ b/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_gpu_cc.h @@ -0,0 +1,191 @@ +/*! + * \file gps_l1_ca_dll_pll_tracking_gpu_cc.h + * \brief Implementation of a code DLL + carrier PLL tracking block, GPU ACCELERATED + * \author Javier Arribas, 2015. jarribas(at)cttc.es + * + * Code DLL + carrier PLL according to the algorithms described in: + * K.Borre, D.M.Akos, N.Bertelsen, P.Rinder, and S.H.Jensen, + * A Software-Defined GPS and Galileo Receiver. A Single-Frequency Approach, + * Birkhauser, 2007 + * + * ------------------------------------------------------------------------- + * + * Copyright (C) 2010-2015 (see AUTHORS file for a list of contributors) + * + * GNSS-SDR is a software defined Global Navigation + * Satellite Systems receiver + * + * This file is part of GNSS-SDR. + * + * GNSS-SDR is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * GNSS-SDR is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNSS-SDR. If not, see . + * + * ------------------------------------------------------------------------- + */ + +#ifndef GNSS_SDR_GPS_L1_CA_DLL_PLL_TRACKING_GPU_CC_H +#define GNSS_SDR_GPS_L1_CA_DLL_PLL_TRACKING_GPU_CC_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include "concurrent_queue.h" +#include "gps_sdr_signal_processing.h" +#include "gnss_synchro.h" +#include "tracking_2nd_DLL_filter.h" +#include "tracking_2nd_PLL_filter.h" +#include "cuda_multicorrelator.h" + +class Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc; + +typedef boost::shared_ptr + gps_l1_ca_dll_pll_tracking_gpu_cc_sptr; + +gps_l1_ca_dll_pll_tracking_gpu_cc_sptr +gps_l1_ca_dll_pll_make_tracking_gpu_cc(long if_freq, + long fs_in, unsigned + int vector_length, + boost::shared_ptr queue, + bool dump, + std::string dump_filename, + float pll_bw_hz, + float dll_bw_hz, + float early_late_space_chips); + + + +/*! + * \brief This class implements a DLL + PLL tracking loop block + */ +class Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc: public gr::block +{ +public: + ~Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc(); + + void set_channel(unsigned int channel); + void set_gnss_synchro(Gnss_Synchro* p_gnss_synchro); + void start_tracking(); + void set_channel_queue(concurrent_queue *channel_internal_queue); + + int general_work (int noutput_items, gr_vector_int &ninput_items, + gr_vector_const_void_star &input_items, gr_vector_void_star &output_items); + + void forecast (int noutput_items, gr_vector_int &ninput_items_required); + +private: + friend gps_l1_ca_dll_pll_tracking_gpu_cc_sptr + gps_l1_ca_dll_pll_make_tracking_gpu_cc(long if_freq, + long fs_in, unsigned + int vector_length, + boost::shared_ptr queue, + bool dump, + std::string dump_filename, + float pll_bw_hz, + float dll_bw_hz, + float early_late_space_chips); + + Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc(long if_freq, + long fs_in, unsigned + int vector_length, + boost::shared_ptr queue, + bool dump, + std::string dump_filename, + float pll_bw_hz, + float dll_bw_hz, + float early_late_space_chips); + void update_local_code(); + void update_local_carrier(); + + // tracking configuration vars + boost::shared_ptr d_queue; + concurrent_queue *d_channel_internal_queue; + unsigned int d_vector_length; + bool d_dump; + + Gnss_Synchro* d_acquisition_gnss_synchro; + unsigned int d_channel; + int d_last_seg; + long d_if_freq; + long d_fs_in; + + double d_early_late_spc_chips; + + + //GPU HOST PINNED MEMORY IN/OUT VECTORS + gr_complex* in_gpu; + gr_complex* d_carr_sign_gpu; + gr_complex* d_local_codes_gpu; + int* d_local_code_shift_samples; + gr_complex* d_corr_outs_gpu; + cuda_multicorrelator *multicorrelator_gpu; + + + gr_complex* d_ca_code; + + gr_complex *d_Early; + gr_complex *d_Prompt; + gr_complex *d_Late; + + + // remaining code phase and carrier phase between tracking loops + double d_rem_code_phase_samples; + float d_rem_carr_phase_rad; + + // PLL and DLL filter library + Tracking_2nd_DLL_filter d_code_loop_filter; + Tracking_2nd_PLL_filter d_carrier_loop_filter; + + // acquisition + float d_acq_code_phase_samples; + float d_acq_carrier_doppler_hz; + + // tracking vars + double d_code_freq_chips; + float d_carrier_doppler_hz; + float d_acc_carrier_phase_rad; + float d_code_phase_samples; + float d_acc_code_phase_secs; + + //PRN period in samples + int d_current_prn_length_samples; + + //processing samples counters + unsigned long int d_sample_counter; + unsigned long int d_acq_sample_stamp; + + // CN0 estimation and lock detector + int d_cn0_estimation_counter; + gr_complex* d_Prompt_buffer; + float d_carrier_lock_test; + float d_CN0_SNV_dB_Hz; + float d_carrier_lock_threshold; + int d_carrier_lock_fail_counter; + + // control vars + bool d_enable_tracking; + bool d_pull_in; + + // file dump + std::string d_dump_filename; + std::ofstream d_dump_file; + + std::map systemName; + std::string sys; +}; + +#endif //GNSS_SDR_GPS_L1_CA_DLL_PLL_TRACKING_GPU_CC_H diff --git a/src/algorithms/tracking/libs/CMakeLists.txt b/src/algorithms/tracking/libs/CMakeLists.txt index aa5fb0036..bb1a36c29 100644 --- a/src/algorithms/tracking/libs/CMakeLists.txt +++ b/src/algorithms/tracking/libs/CMakeLists.txt @@ -56,6 +56,7 @@ include_directories( ${CMAKE_SOURCE_DIR}/src/core/interfaces ${CMAKE_SOURCE_DIR}/src/core/receiver ${VOLK_INCLUDE_DIRS} + ${CUDA_INCLUDE_DIRS} ) if(ENABLE_GENERIC_ARCH) From 26cf90cdd488620b3b63ddc2002fe1be35abcd84 Mon Sep 17 00:00:00 2001 From: Javier Arribas Date: Fri, 24 Jul 2015 17:21:25 +0200 Subject: [PATCH 3/5] First working version of the GPU GPS tracking block (it requires NVIDIA CUDA 3.0 GPU hardware) --- conf/gnss-sdr_GPS_L1_gr_complex_gpu.conf | 305 ++++++++++++++++++ .../tracking/adapters/CMakeLists.txt | 5 + .../gps_l1_ca_dll_pll_tracking_gpu.cc | 158 +++++++++ .../adapters/gps_l1_ca_dll_pll_tracking_gpu.h | 113 +++++++ .../tracking/gnuradio_blocks/CMakeLists.txt | 3 +- .../gps_l1_ca_dll_pll_tracking_gpu_cc.cc | 20 +- src/algorithms/tracking/libs/CMakeLists.txt | 8 +- .../tracking/libs/cuda_multicorrelator.cu | 31 +- .../tracking/libs/cuda_multicorrelator.h | 9 +- src/core/receiver/CMakeLists.txt | 7 +- src/core/receiver/gnss_block_factory.cc | 7 + src/main/CMakeLists.txt | 1 + 12 files changed, 631 insertions(+), 36 deletions(-) create mode 100644 conf/gnss-sdr_GPS_L1_gr_complex_gpu.conf create mode 100644 src/algorithms/tracking/adapters/gps_l1_ca_dll_pll_tracking_gpu.cc create mode 100644 src/algorithms/tracking/adapters/gps_l1_ca_dll_pll_tracking_gpu.h diff --git a/conf/gnss-sdr_GPS_L1_gr_complex_gpu.conf b/conf/gnss-sdr_GPS_L1_gr_complex_gpu.conf new file mode 100644 index 000000000..a8e576df7 --- /dev/null +++ b/conf/gnss-sdr_GPS_L1_gr_complex_gpu.conf @@ -0,0 +1,305 @@ +; Default configuration file +; You can define your own receiver and invoke it by doing +; gnss-sdr --config_file=my_GNSS_SDR_configuration.conf +; + +[GNSS-SDR] + +;######### GLOBAL OPTIONS ################## +;internal_fs_hz: Internal signal sampling frequency after the signal conditioning stage [Hz]. +GNSS-SDR.internal_fs_hz=4000000 + +;######### CONTROL_THREAD CONFIG ############ +ControlThread.wait_for_flowgraph=false + +;######### SIGNAL_SOURCE CONFIG ############ +;#implementation: Use [File_Signal_Source] or [UHD_Signal_Source] or [GN3S_Signal_Source] (experimental) +SignalSource.implementation=File_Signal_Source + +;#filename: path to file with the captured GNSS signal samples to be processed +SignalSource.filename=/home/javier/signals/4msps.dat + +;#item_type: Type and resolution for each of the signal samples. Use only gr_complex in this version. +SignalSource.item_type=gr_complex + +;#sampling_frequency: Original Signal sampling frequency in [Hz] +SignalSource.sampling_frequency=4000000 + +;#freq: RF front-end center frequency in [Hz] +SignalSource.freq=1575420000 + +;#gain: Front-end Gain in [dB] +SignalSource.gain=60 + +;#subdevice: UHD subdevice specification (for USRP1 use A:0 or B:0) +SignalSource.subdevice=B:0 + +;#samples: Number of samples to be processed. Notice that 0 indicates the entire file. +SignalSource.samples=0 + +;#repeat: Repeat the processing file. Disable this option in this version +SignalSource.repeat=false + +;#dump: Dump the Signal source data to a file. Disable this option in this version +SignalSource.dump=false + +SignalSource.dump_filename=../data/signal_source.dat + + +;#enable_throttle_control: Enabling this option tells the signal source to keep the delay between samples in post processing. +; it helps to not overload the CPU, but the processing time will be longer. +SignalSource.enable_throttle_control=false + + +;######### SIGNAL_CONDITIONER CONFIG ############ +;## It holds blocks to change data type, filter and resample input data. + +;#implementation: Use [Pass_Through] or [Signal_Conditioner] +;#[Pass_Through] disables this block and the [DataTypeAdapter], [InputFilter] and [Resampler] blocks +;#[Signal_Conditioner] enables this block. Then you have to configure [DataTypeAdapter], [InputFilter] and [Resampler] blocks +;SignalConditioner.implementation=Signal_Conditioner +SignalConditioner.implementation=Pass_Through + +;######### DATA_TYPE_ADAPTER CONFIG ############ +;## Changes the type of input data. Please disable it in this version. +;#implementation: [Pass_Through] disables this block +DataTypeAdapter.implementation=Pass_Through + +;######### INPUT_FILTER CONFIG ############ +;## Filter the input data. Can be combined with frequency translation for IF signals + +;#implementation: Use [Pass_Through] or [Fir_Filter] or [Freq_Xlating_Fir_Filter] +;#[Pass_Through] disables this block +;#[Fir_Filter] enables a FIR Filter +;#[Freq_Xlating_Fir_Filter] enables FIR filter and a composite frequency translation that shifts IF down to zero Hz. + +;InputFilter.implementation=Fir_Filter +;InputFilter.implementation=Freq_Xlating_Fir_Filter +InputFilter.implementation=Pass_Through + +;#dump: Dump the filtered data to a file. +InputFilter.dump=false + +;#dump_filename: Log path and filename. +InputFilter.dump_filename=../data/input_filter.dat + +;#The following options are used in the filter design of Fir_Filter and Freq_Xlating_Fir_Filter implementation. +;#These options are based on parameters of gnuradio's function: gr_remez. +;#These function calculates the optimal (in the Chebyshev/minimax sense) FIR filter inpulse reponse given a set of band edges, the desired reponse on those bands, and the weight given to the error in those bands. + +;#input_item_type: Type and resolution for input signal samples. Use only gr_complex in this version. +InputFilter.input_item_type=gr_complex + +;#outut_item_type: Type and resolution for output filtered signal samples. Use only gr_complex in this version. +InputFilter.output_item_type=gr_complex + +;#taps_item_type: Type and resolution for the taps of the filter. Use only float in this version. +InputFilter.taps_item_type=float + +;#number_of_taps: Number of taps in the filter. Increasing this parameter increases the processing time +InputFilter.number_of_taps=5 + +;#number_of _bands: Number of frequency bands in the filter. +InputFilter.number_of_bands=2 + +;#bands: frequency at the band edges [ b1 e1 b2 e2 b3 e3 ...]. +;#Frequency is in the range [0, 1], with 1 being the Nyquist frequency (Fs/2) +;#The number of band_begin and band_end elements must match the number of bands + +InputFilter.band1_begin=0.0 +InputFilter.band1_end=0.45 +InputFilter.band2_begin=0.55 +InputFilter.band2_end=1.0 + +;#ampl: desired amplitude at the band edges [ a(b1) a(e1) a(b2) a(e2) ...]. +;#The number of ampl_begin and ampl_end elements must match the number of bands + +InputFilter.ampl1_begin=1.0 +InputFilter.ampl1_end=1.0 +InputFilter.ampl2_begin=0.0 +InputFilter.ampl2_end=0.0 + +;#band_error: weighting applied to each band (usually 1). +;#The number of band_error elements must match the number of bands +InputFilter.band1_error=1.0 +InputFilter.band2_error=1.0 + +;#filter_type: one of "bandpass", "hilbert" or "differentiator" +InputFilter.filter_type=bandpass + +;#grid_density: determines how accurately the filter will be constructed. +;The minimum value is 16; higher values are slower to compute the filter. +InputFilter.grid_density=16 + +;#The following options are used only in Freq_Xlating_Fir_Filter implementation. +;#InputFilter.IF is the intermediate frequency (in Hz) shifted down to zero Hz + +InputFilter.sampling_frequency=4000000 +InputFilter.IF=0 + + + +;######### RESAMPLER CONFIG ############ +;## Resamples the input data. + +;#implementation: Use [Pass_Through] or [Direct_Resampler] +;#[Pass_Through] disables this block +;#[Direct_Resampler] enables a resampler that implements a nearest neigbourhood interpolation +;Resampler.implementation=Direct_Resampler +Resampler.implementation=Pass_Through + +;#dump: Dump the resamplered data to a file. +Resampler.dump=false +;#dump_filename: Log path and filename. +Resampler.dump_filename=../data/resampler.dat + +;#item_type: Type and resolution for each of the signal samples. Use only gr_complex in this version. +Resampler.item_type=gr_complex + +;#sample_freq_in: the sample frequency of the input signal +Resampler.sample_freq_in=8000000 + +;#sample_freq_out: the desired sample frequency of the output signal +Resampler.sample_freq_out=4000000 + + +;######### CHANNELS GLOBAL CONFIG ############ +;#count: Number of available GPS satellite channels. +Channels_GPS.count=8 +;#count: Number of available Galileo satellite channels. +Channels_Galileo.count=0 +;#in_acquisition: Number of channels simultaneously acquiring for the whole receiver +Channels.in_acquisition=1 +;#system: GPS, GLONASS, GALILEO, SBAS or COMPASS +;#if the option is disabled by default is assigned GPS +Channel.system=GPS + +;#if the option is disabled by default is assigned "1C" GPS L1 C/A +Channel.signal=1C + + +;######### SPECIFIC CHANNELS CONFIG ###### +;#The following options are specific to each channel and overwrite the generic options + +;######### CHANNEL 0 CONFIG ############ + +;Channel0.system=GPS +;Channel0.signal=1C + +;#satellite: Satellite PRN ID for this channel. Disable this option to random search +;Channel0.satellite=11 + +;######### CHANNEL 1 CONFIG ############ + +;Channel1.system=GPS +;Channel1.signal=1C +;Channel1.satellite=18 + +;######### ACQUISITION GLOBAL CONFIG ############ + +;#dump: Enable or disable the acquisition internal data file logging [true] or [false] +Acquisition_GPS.dump=false +;#filename: Log path and filename +Acquisition_GPS.dump_filename=./acq_dump.dat +;#item_type: Type and resolution for each of the signal samples. Use only gr_complex in this version. +Acquisition_GPS.item_type=gr_complex +;#if: Signal intermediate frequency in [Hz] +Acquisition_GPS.if=0 +;#sampled_ms: Signal block duration for the acquisition signal detection [ms] +Acquisition_GPS.sampled_ms=1 +;#implementation: Acquisition algorithm selection for this channel: [GPS_L1_CA_PCPS_Acquisition] or [Galileo_E1_PCPS_Ambiguous_Acquisition] +Acquisition_GPS.implementation=GPS_L1_CA_PCPS_Acquisition +;#threshold: Acquisition threshold +Acquisition_GPS.threshold=0.005 +;#pfa: Acquisition false alarm probability. This option overrides the threshold option. Only use with implementations: [GPS_L1_CA_PCPS_Acquisition] or [Galileo_E1_PCPS_Ambiguous_Acquisition] +;Acquisition_GPS.pfa=0.01 +;#doppler_max: Maximum expected Doppler shift [Hz] +Acquisition_GPS.doppler_max=10000 +;#doppler_max: Doppler step in the grid search [Hz] +Acquisition_GPS.doppler_step=500 + +;######### TRACKING GLOBAL CONFIG ############ + +;#implementation: Selected tracking algorithm: [GPS_L1_CA_DLL_PLL_Tracking] or [GPS_L1_CA_DLL_FLL_PLL_Tracking] or [GPS_L1_CA_TCP_CONNECTOR_Tracking] or [Galileo_E1_DLL_PLL_VEML_Tracking] +Tracking_GPS.implementation=GPS_L1_CA_DLL_PLL_Tracking_GPU +;#item_type: Type and resolution for each of the signal samples. Use only [gr_complex] in this version. +Tracking_GPS.item_type=gr_complex + +;#sampling_frequency: Signal Intermediate Frequency in [Hz] +Tracking_GPS.if=0 + +;#dump: Enable or disable the Tracking internal binary data file logging [true] or [false] +Tracking_GPS.dump=false + +;#dump_filename: Log path and filename. Notice that the tracking channel will add "x.dat" where x is the channel number. +Tracking_GPS.dump_filename=../data/epl_tracking_ch_ + +;#pll_bw_hz: PLL loop filter bandwidth [Hz] +Tracking_GPS.pll_bw_hz=45.0; + +;#dll_bw_hz: DLL loop filter bandwidth [Hz] +Tracking_GPS.dll_bw_hz=2.0; + +;#fll_bw_hz: FLL loop filter bandwidth [Hz] +Tracking_GPS.fll_bw_hz=10.0; + +;#order: PLL/DLL loop filter order [2] or [3] +Tracking_GPS.order=3; + +;######### TELEMETRY DECODER GPS CONFIG ############ +;#implementation: Use [GPS_L1_CA_Telemetry_Decoder] for GPS L1 C/A +TelemetryDecoder_GPS.implementation=GPS_L1_CA_Telemetry_Decoder +TelemetryDecoder_GPS.dump=false +;#decimation factor +TelemetryDecoder_GPS.decimation_factor=1; + +;######### OBSERVABLES CONFIG ############ +;#implementation: Use [GPS_L1_CA_Observables] for GPS L1 C/A. +Observables.implementation=GPS_L1_CA_Observables + +;#dump: Enable or disable the Observables internal binary data file logging [true] or [false] +Observables.dump=false + +;#dump_filename: Log path and filename. +Observables.dump_filename=./observables.dat + + +;######### PVT CONFIG ############ +;#implementation: Position Velocity and Time (PVT) implementation algorithm: Use [GPS_L1_CA_PVT] in this version. +PVT.implementation=GPS_L1_CA_PVT + +;#averaging_depth: Number of PVT observations in the moving average algorithm +PVT.averaging_depth=100 + +;#flag_average: Enables the PVT averaging between output intervals (arithmetic mean) [true] or [false] +PVT.flag_averaging=false + +;#output_rate_ms: Period between two PVT outputs. Notice that the minimum period is equal to the tracking integration time (for GPS CA L1 is 1ms) [ms] +PVT.output_rate_ms=10 + +;#display_rate_ms: Position console print (std::out) interval [ms]. Notice that output_rate_ms<=display_rate_ms. +PVT.display_rate_ms=500 + +;# RINEX, KML, and NMEA output configuration + +;#dump_filename: Log path and filename without extension. Notice that PVT will add ".dat" to the binary dump and ".kml" to GoogleEarth dump. +PVT.dump_filename=./PVT + +;#nmea_dump_filename: NMEA log path and filename +PVT.nmea_dump_filename=./gnss_sdr_pvt.nmea; + +;#flag_nmea_tty_port: Enable or disable the NMEA log to a serial TTY port (Can be used with real hardware or virtual one) +PVT.flag_nmea_tty_port=false; + +;#nmea_dump_devname: serial device descriptor for NMEA logging +PVT.nmea_dump_devname=/dev/pts/4 + + +;#dump: Enable or disable the PVT internal binary data file logging [true] or [false] +PVT.dump=false + +;######### OUTPUT_FILTER CONFIG ############ +;# Receiver output filter: Leave this block disabled in this version +OutputFilter.implementation=Null_Sink_Output_Filter +OutputFilter.filename=data/gnss-sdr.dat +OutputFilter.item_type=gr_complex diff --git a/src/algorithms/tracking/adapters/CMakeLists.txt b/src/algorithms/tracking/adapters/CMakeLists.txt index fda0724a7..3e92ac60e 100644 --- a/src/algorithms/tracking/adapters/CMakeLists.txt +++ b/src/algorithms/tracking/adapters/CMakeLists.txt @@ -16,6 +16,9 @@ # along with GNSS-SDR. If not, see . # +if(ENABLE_CUDA) + FIND_PACKAGE(CUDA REQUIRED) +endif(ENABLE_CUDA) set(TRACKING_ADAPTER_SOURCES galileo_e1_dll_pll_veml_tracking.cc @@ -27,6 +30,7 @@ set(TRACKING_ADAPTER_SOURCES gps_l1_ca_tcp_connector_tracking.cc galileo_e5a_dll_pll_tracking.cc gps_l2_m_dll_pll_tracking.cc + gps_l1_ca_dll_pll_tracking_gpu.cc ) include_directories( @@ -40,6 +44,7 @@ include_directories( ${GLOG_INCLUDE_DIRS} ${GFlags_INCLUDE_DIRS} ${GNURADIO_RUNTIME_INCLUDE_DIRS} + ${CUDA_INCLUDE_DIRS} ) file(GLOB TRACKING_ADAPTER_HEADERS "*.h") diff --git a/src/algorithms/tracking/adapters/gps_l1_ca_dll_pll_tracking_gpu.cc b/src/algorithms/tracking/adapters/gps_l1_ca_dll_pll_tracking_gpu.cc new file mode 100644 index 000000000..4ae8864b0 --- /dev/null +++ b/src/algorithms/tracking/adapters/gps_l1_ca_dll_pll_tracking_gpu.cc @@ -0,0 +1,158 @@ +/*! + * \file gps_l1_ca_dll_pll_tracking_gpu.cc + * \brief Implementation of an adapter of a DLL+PLL tracking loop block using GPU accelerated functions + * for GPS L1 C/A to a TrackingInterface + * \author Javier Arribas, 2015. jarribas(at)cttc.es + * + * Code DLL + carrier PLL according to the algorithms described in: + * K.Borre, D.M.Akos, N.Bertelsen, P.Rinder, and S.H.Jensen, + * A Software-Defined GPS and Galileo Receiver. A Single-Frequency + * Approach, Birkhauser, 2007 + * + * ------------------------------------------------------------------------- + * + * Copyright (C) 2010-2015 (see AUTHORS file for a list of contributors) + * + * GNSS-SDR is a software defined Global Navigation + * Satellite Systems receiver + * + * This file is part of GNSS-SDR. + * + * GNSS-SDR is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * GNSS-SDR is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNSS-SDR. If not, see . + * + * ------------------------------------------------------------------------- + */ + + +#include "gps_l1_ca_dll_pll_tracking_gpu.h" +#include +#include "GPS_L1_CA.h" +#include "configuration_interface.h" + + +using google::LogMessage; + +GpsL1CaDllPllTrackingGPU::GpsL1CaDllPllTrackingGPU( + ConfigurationInterface* configuration, std::string role, + unsigned int in_streams, unsigned int out_streams, + boost::shared_ptr queue) : + role_(role), in_streams_(in_streams), out_streams_(out_streams), + queue_(queue) +{ + DLOG(INFO) << "role " << role; + //################# CONFIGURATION PARAMETERS ######################## + int fs_in; + int vector_length; + int f_if; + bool dump; + std::string dump_filename; + std::string item_type; + std::string default_item_type = "gr_complex"; + float pll_bw_hz; + float dll_bw_hz; + float early_late_space_chips; + item_type = configuration->property(role + ".item_type", default_item_type); + //vector_length = configuration->property(role + ".vector_length", 2048); + fs_in = configuration->property("GNSS-SDR.internal_fs_hz", 2048000); + f_if = configuration->property(role + ".if", 0); + dump = configuration->property(role + ".dump", false); + pll_bw_hz = configuration->property(role + ".pll_bw_hz", 50.0); + dll_bw_hz = configuration->property(role + ".dll_bw_hz", 2.0); + early_late_space_chips = configuration->property(role + ".early_late_space_chips", 0.5); + std::string default_dump_filename = "./track_ch"; + dump_filename = configuration->property(role + ".dump_filename", + default_dump_filename); //unused! + vector_length = std::round(fs_in / (GPS_L1_CA_CODE_RATE_HZ / GPS_L1_CA_CODE_LENGTH_CHIPS)); + + //################# MAKE TRACKING GNURadio object ################### + if (item_type.compare("gr_complex") == 0) + { + item_size_ = sizeof(gr_complex); + tracking_ = gps_l1_ca_dll_pll_make_tracking_gpu_cc( + f_if, + fs_in, + vector_length, + queue_, + dump, + dump_filename, + pll_bw_hz, + dll_bw_hz, + early_late_space_chips); + } + else + { + item_size_ = sizeof(gr_complex); + LOG(WARNING) << item_type << " unknown tracking item type."; + } + channel_ = 0; + channel_internal_queue_ = 0; + DLOG(INFO) << "tracking(" << tracking_->unique_id() << ")"; +} + + +GpsL1CaDllPllTrackingGPU::~GpsL1CaDllPllTrackingGPU() +{} + + +void GpsL1CaDllPllTrackingGPU::start_tracking() +{ + tracking_->start_tracking(); +} + +/* + * Set tracking channel unique ID + */ +void GpsL1CaDllPllTrackingGPU::set_channel(unsigned int channel) +{ + channel_ = channel; + tracking_->set_channel(channel); +} + +/* + * Set tracking channel internal queue + */ +void GpsL1CaDllPllTrackingGPU::set_channel_queue( + concurrent_queue *channel_internal_queue) +{ + channel_internal_queue_ = channel_internal_queue; + tracking_->set_channel_queue(channel_internal_queue_); +} + +void GpsL1CaDllPllTrackingGPU::set_gnss_synchro(Gnss_Synchro* p_gnss_synchro) +{ + tracking_->set_gnss_synchro(p_gnss_synchro); +} + +void GpsL1CaDllPllTrackingGPU::connect(gr::top_block_sptr top_block) +{ + if(top_block) { /* top_block is not null */}; + //nothing to connect, now the tracking uses gr_sync_decimator +} + +void GpsL1CaDllPllTrackingGPU::disconnect(gr::top_block_sptr top_block) +{ + if(top_block) { /* top_block is not null */}; + //nothing to disconnect, now the tracking uses gr_sync_decimator +} + +gr::basic_block_sptr GpsL1CaDllPllTrackingGPU::get_left_block() +{ + return tracking_; +} + +gr::basic_block_sptr GpsL1CaDllPllTrackingGPU::get_right_block() +{ + return tracking_; +} + diff --git a/src/algorithms/tracking/adapters/gps_l1_ca_dll_pll_tracking_gpu.h b/src/algorithms/tracking/adapters/gps_l1_ca_dll_pll_tracking_gpu.h new file mode 100644 index 000000000..22bbe5d15 --- /dev/null +++ b/src/algorithms/tracking/adapters/gps_l1_ca_dll_pll_tracking_gpu.h @@ -0,0 +1,113 @@ +/*! + * \file gps_l1_ca_dll_pll_tracking_gpu.h + * \brief Implementation of an adapter of a DLL+PLL tracking loop block using GPU accelerated functions + * for GPS L1 C/A to a TrackingInterface + * \author Javier Arribas, 2015. jarribas(at)cttc.es + * + * Code DLL + carrier PLL according to the algorithms described in: + * K.Borre, D.M.Akos, N.Bertelsen, P.Rinder, and S.H.Jensen, + * A Software-Defined GPS and Galileo Receiver. A Single-Frequency + * Approach, Birkha user, 2007 + * + * ------------------------------------------------------------------------- + * + * Copyright (C) 2010-2015 (see AUTHORS file for a list of contributors) + * + * GNSS-SDR is a software defined Global Navigation + * Satellite Systems receiver + * + * This file is part of GNSS-SDR. + * + * GNSS-SDR is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * GNSS-SDR is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNSS-SDR. If not, see . + * + * ------------------------------------------------------------------------- + */ + +#ifndef GNSS_SDR_GPS_L1_CA_DLL_PLL_TRACKING_GPU_H_ +#define GNSS_SDR_GPS_L1_CA_DLL_PLL_TRACKING_GPU_H_ + +#include +#include +#include "tracking_interface.h" +#include "gps_l1_ca_dll_pll_tracking_gpu_cc.h" + + +class ConfigurationInterface; + +/*! + * \brief This class implements a code DLL + carrier PLL tracking loop using GPU accelerated functions + */ +class GpsL1CaDllPllTrackingGPU : public TrackingInterface +{ +public: + + GpsL1CaDllPllTrackingGPU(ConfigurationInterface* configuration, + std::string role, + unsigned int in_streams, + unsigned int out_streams, + boost::shared_ptr queue); + + virtual ~GpsL1CaDllPllTrackingGPU(); + + std::string role() + { + return role_; + } + + //! Returns "GPS_L1_CA_DLL_PLL_Tracking" + std::string implementation() + { + return "GPS_L1_CA_DLL_PLL_Tracking_GPU"; + } + size_t item_size() + { + return item_size_; + } + + void connect(gr::top_block_sptr top_block); + void disconnect(gr::top_block_sptr top_block); + gr::basic_block_sptr get_left_block(); + gr::basic_block_sptr get_right_block(); + + + /*! + * \brief Set tracking channel unique ID + */ + void set_channel(unsigned int channel); + + /*! + * \brief Set acquisition/tracking common Gnss_Synchro object pointer + * to efficiently exchange synchronization data between acquisition and tracking blocks + */ + void set_gnss_synchro(Gnss_Synchro* p_gnss_synchro); + + /*! + * \brief Set tracking channel internal queue + */ + void set_channel_queue(concurrent_queue *channel_internal_queue); + + void start_tracking(); + +private: + gps_l1_ca_dll_pll_tracking_gpu_cc_sptr tracking_; + size_t item_size_; + unsigned int channel_; + std::string role_; + unsigned int in_streams_; + unsigned int out_streams_; + boost::shared_ptr queue_; + concurrent_queue *channel_internal_queue_; +}; + +#endif // GNSS_SDR_GPS_L1_CA_DLL_PLL_TRACKING_GPU_H_ diff --git a/src/algorithms/tracking/gnuradio_blocks/CMakeLists.txt b/src/algorithms/tracking/gnuradio_blocks/CMakeLists.txt index d300a9edd..8589832a7 100644 --- a/src/algorithms/tracking/gnuradio_blocks/CMakeLists.txt +++ b/src/algorithms/tracking/gnuradio_blocks/CMakeLists.txt @@ -57,7 +57,8 @@ endif(ENABLE_GENERIC_ARCH) file(GLOB TRACKING_GR_BLOCKS_HEADERS "*.h") add_library(tracking_gr_blocks ${TRACKING_GR_BLOCKS_SOURCES} ${TRACKING_GR_BLOCKS_HEADERS}) source_group(Headers FILES ${TRACKING_GR_BLOCKS_HEADERS}) -target_link_libraries(tracking_gr_blocks tracking_lib ${GNURADIO_RUNTIME_LIBRARIES} gnss_sp_libs ${Boost_LIBRARIES} ${VOLK_GNSSSDR_LIBRARIES} ${ORC_LIBRARIES} ) + +target_link_libraries(tracking_gr_blocks tracking_lib ${GNURADIO_RUNTIME_LIBRARIES} gnss_sp_libs ${Boost_LIBRARIES} ${VOLK_GNSSSDR_LIBRARIES} ${ORC_LIBRARIES} ${CUDA_LIBRARIES}) if(NOT VOLK_GNSSSDR_FOUND) add_dependencies(tracking_gr_blocks volk_gnsssdr_module) endif(NOT VOLK_GNSSSDR_FOUND) diff --git a/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_gpu_cc.cc b/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_gpu_cc.cc index b443aebf5..3de611135 100644 --- a/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_gpu_cc.cc +++ b/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_gpu_cc.cc @@ -48,17 +48,11 @@ #include "GPS_L1_CA.h" #include "control_message_factory.h" #include //volk_alignement - -#include -// CUDA runtime -#include // includes #include #include // helper for shared functions common to CUDA Samples #include // helper functions for CUDA error checking and initialization - - /*! * \todo Include in definition header file */ @@ -130,8 +124,9 @@ Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc( multicorrelator_gpu = new cuda_multicorrelator(); - // Get space for the resampled early / prompt / late local replicas int N_CORRELATORS=3; + multicorrelator_gpu->init_cuda(0, NULL, 2 * d_vector_length , 2 * d_vector_length , N_CORRELATORS); + // Get space for the resampled early / prompt / late local replicas checkCudaErrors(cudaHostAlloc((void**)&d_local_code_shift_samples, N_CORRELATORS * sizeof(int), cudaHostAllocMapped )); @@ -323,9 +318,6 @@ Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::~Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc() multicorrelator_gpu->free_cuda(); delete(multicorrelator_gpu); - volk_free(d_Early); - volk_free(d_Prompt); - volk_free(d_Late); volk_free(d_ca_code); delete[] d_Prompt_buffer; @@ -381,18 +373,20 @@ int Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::general_work (int noutput_items, gr_vecto // UPDATE NCO COMMAND float phase_step_rad = static_cast(GPS_TWO_PI) * d_carrier_doppler_hz / static_cast(d_fs_in); - + //std::cout<<"d_current_prn_length_samples="<Carrier_wipeoff_multicorrelator_cuda( d_corr_outs_gpu, - in,//in_gpu, + in, d_local_codes_gpu, d_rem_carr_phase_rad, phase_step_rad, d_local_code_shift_samples, d_current_prn_length_samples, 3); - + cudaProfilerStop(); + //std::cout<<"d_Prompt="<<*d_Prompt<<"d_Early="<<*d_Early<<"d_Late="<<*d_Late<); // input signal CPU -> GPU copy memory + checkCudaErrors(cudaMemcpyAsync(d_sig_in, sig_in, memSize, cudaMemcpyHostToDevice, stream1)); @@ -362,7 +361,9 @@ bool cuda_multicorrelator::Carrier_wipeoff_multicorrelator_cuda( //printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock); //wait for Doppler wipeoff end... - checkCudaErrors(cudaDeviceSynchronize()); + checkCudaErrors(cudaStreamSynchronize(stream1)); + checkCudaErrors(cudaStreamSynchronize(stream2)); + //checkCudaErrors(cudaDeviceSynchronize()); //old // scalarProdGPUCPXxN<<>>( @@ -385,16 +386,13 @@ bool cuda_multicorrelator::Carrier_wipeoff_multicorrelator_cuda( ); checkCudaErrors(cudaGetLastError()); //wait for correlators end... - checkCudaErrors(cudaDeviceSynchronize()); + checkCudaErrors(cudaStreamSynchronize(stream2)); // Copy the device result vector in device memory to the host result vector // in host memory. //scalar products (correlators outputs) - checkCudaErrors(cudaMemcpyAsync(corr_out, d_corr_out, sizeof(std::complex)*n_correlators, - cudaMemcpyDeviceToHost, 0)); - - cudaStreamDestroy(stream1) ; - cudaStreamDestroy(stream2) ; + checkCudaErrors(cudaMemcpy(corr_out, d_corr_out, sizeof(std::complex)*n_correlators, + cudaMemcpyDeviceToHost)); return true; } @@ -406,13 +404,16 @@ bool cuda_multicorrelator::free_cuda() cudaFree(d_local_codes_in); cudaFree(d_corr_out); + cudaStreamDestroy(stream1) ; + cudaStreamDestroy(stream2) ; + // Reset the device and exit // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits - checkCudaErrors(cudaDeviceReset()); + //checkCudaErrors(cudaDeviceReset()); return true; } diff --git a/src/algorithms/tracking/libs/cuda_multicorrelator.h b/src/algorithms/tracking/libs/cuda_multicorrelator.h index 8819edf9a..e29cba53a 100644 --- a/src/algorithms/tracking/libs/cuda_multicorrelator.h +++ b/src/algorithms/tracking/libs/cuda_multicorrelator.h @@ -46,6 +46,10 @@ #include +#include +// CUDA runtime +#include + // GPU new internal data types for complex numbers struct GPU_Complex { @@ -109,7 +113,7 @@ struct GPU_Complex_Short { class cuda_multicorrelator { public: - bool init_cuda(const int argc, const char **argv, int signal_length_samples, int *shifts_samples, int n_correlators); + bool init_cuda(const int argc, const char **argv, int signal_length_samples, int local_codes_length_samples, int n_correlators); bool free_cuda(); bool Carrier_wipeoff_multicorrelator_cuda( @@ -132,6 +136,9 @@ private: int threadsPerBlock; int blocksPerGrid; + cudaStream_t stream1; + cudaStream_t stream2; + }; diff --git a/src/core/receiver/CMakeLists.txt b/src/core/receiver/CMakeLists.txt index 677efd793..cf4d18335 100644 --- a/src/core/receiver/CMakeLists.txt +++ b/src/core/receiver/CMakeLists.txt @@ -16,6 +16,10 @@ # along with GNSS-SDR. If not, see . # +if(ENABLE_CUDA) + FIND_PACKAGE(CUDA REQUIRED) +endif(ENABLE_CUDA) + set(GNSS_RECEIVER_SOURCES control_thread.cc control_message_factory.cc @@ -64,8 +68,9 @@ include_directories( ${GFlags_INCLUDE_DIRS} ${Boost_INCLUDE_DIRS} ${GNURADIO_RUNTIME_INCLUDE_DIRS} + ${CUDA_INCLUDE_DIRS} ) - + if(Boost_VERSION LESS 105000) add_definitions(-DOLD_BOOST=1) endif(Boost_VERSION LESS 105000) diff --git a/src/core/receiver/gnss_block_factory.cc b/src/core/receiver/gnss_block_factory.cc index 770c1a6d3..35e4c8360 100644 --- a/src/core/receiver/gnss_block_factory.cc +++ b/src/core/receiver/gnss_block_factory.cc @@ -77,6 +77,7 @@ #include "galileo_e1_pcps_quicksync_ambiguous_acquisition.h" #include "galileo_e5a_noncoherent_iq_acquisition_caf.h" #include "gps_l1_ca_dll_pll_tracking.h" +#include "gps_l1_ca_dll_pll_tracking_gpu.h" #include "gps_l1_ca_dll_pll_optim_tracking.h" #include "gps_l1_ca_dll_fll_pll_tracking.h" #include "gps_l1_ca_tcp_connector_tracking.h" @@ -1609,6 +1610,12 @@ std::unique_ptr GNSSBlockFactory::GetTrkBlock( out_streams, queue)); block = std::move(block_); } + else if (implementation.compare("GPS_L1_CA_DLL_PLL_Tracking_GPU") == 0) + { + std::unique_ptr block_(new GpsL1CaDllPllTrackingGPU(configuration.get(), role, in_streams, + out_streams, queue)); + block = std::move(block_); + } else { // Log fatal. This causes execution to stop. diff --git a/src/main/CMakeLists.txt b/src/main/CMakeLists.txt index af8e7bcd0..02758105a 100644 --- a/src/main/CMakeLists.txt +++ b/src/main/CMakeLists.txt @@ -16,6 +16,7 @@ # along with GNSS-SDR. If not, see . # + set(GNSS_SDR_OPTIONAL_LIBS "") set(GNSS_SDR_OPTIONAL_HEADERS "") From fb2b12403a551669a70456918b9ddc066365e765 Mon Sep 17 00:00:00 2001 From: Javier Date: Fri, 24 Jul 2015 18:07:33 +0200 Subject: [PATCH 4/5] Multi-GPU device suport (splits the tracking channels between all the availables GPUs (CUDA only!)) --- .../tracking/libs/cuda_multicorrelator.cu | 65 ++++++++++++++----- .../tracking/libs/cuda_multicorrelator.h | 2 + 2 files changed, 51 insertions(+), 16 deletions(-) diff --git a/src/algorithms/tracking/libs/cuda_multicorrelator.cu b/src/algorithms/tracking/libs/cuda_multicorrelator.cu index 3f027cb3d..5f97ee280 100644 --- a/src/algorithms/tracking/libs/cuda_multicorrelator.cu +++ b/src/algorithms/tracking/libs/cuda_multicorrelator.cu @@ -264,23 +264,54 @@ CUDA_32fc_x2_add_32fc( GPU_Complex *A, GPU_Complex *B, GPU_Complex *C, int bool cuda_multicorrelator::init_cuda(const int argc, const char **argv, int signal_length_samples, int local_codes_length_samples, int n_correlators) { // use command-line specified CUDA device, otherwise use device with highest Gflops/s - findCudaDevice(argc, (const char **)argv); +// findCudaDevice(argc, (const char **)argv); + cudaDeviceProp prop; + int num_devices, device; + cudaGetDeviceCount(&num_devices); + num_gpu_devices=num_devices; + if (num_devices > 1) { + int max_multiprocessors = 0, max_device = 0; + for (device = 0; device < num_devices; device++) { + cudaDeviceProp properties; + cudaGetDeviceProperties(&properties, device); + if (max_multiprocessors < properties.multiProcessorCount) { + max_multiprocessors = properties.multiProcessorCount; + max_device = device; + } + printf("Found GPU device # %i\n",device); + } + //set random device! + selected_device=(rand() % num_devices); + printf("selected_device=%i\n",selected_device); + cudaGetDeviceProperties( &prop, selected_device ); + //debug code + if (prop.canMapHostMemory != 1) { + printf( "Device can not map memory.\n" ); + } + printf("L2 Cache size= %u \n",prop.l2CacheSize); + printf("maxThreadsPerBlock= %u \n",prop.maxThreadsPerBlock); + printf("maxGridSize= %i \n",prop.maxGridSize[0]); + printf("sharedMemPerBlock= %lu \n",prop.sharedMemPerBlock); + printf("deviceOverlap= %i \n",prop.deviceOverlap); + printf("multiProcessorCount= %i \n",prop.multiProcessorCount); + }else{ + selected_device=0; + int whichDevice; + cudaGetDevice( &whichDevice ); + cudaGetDeviceProperties( &prop, whichDevice ); + //debug code + if (prop.canMapHostMemory != 1) { + printf( "Device can not map memory.\n" ); + } - cudaDeviceProp prop; - int whichDevice; - cudaGetDevice( &whichDevice ); - cudaGetDeviceProperties( &prop, whichDevice ); - //debug code - if (prop.canMapHostMemory != 1) { - printf( "Device can not map memory.\n" ); + printf("L2 Cache size= %u \n",prop.l2CacheSize); + printf("maxThreadsPerBlock= %u \n",prop.maxThreadsPerBlock); + printf("maxGridSize= %i \n",prop.maxGridSize[0]); + printf("sharedMemPerBlock= %lu \n",prop.sharedMemPerBlock); + printf("deviceOverlap= %i \n",prop.deviceOverlap); + printf("multiProcessorCount= %i \n",prop.multiProcessorCount); } - printf("L2 Cache size= %u \n",prop.l2CacheSize); - printf("maxThreadsPerBlock= %u \n",prop.maxThreadsPerBlock); - printf("maxGridSize= %i \n",prop.maxGridSize[0]); - printf("sharedMemPerBlock= %lu \n",prop.sharedMemPerBlock); - printf("deviceOverlap= %i \n",prop.deviceOverlap); - //end debug code //checkCudaErrors(cudaFuncSetCacheConfig(CUDA_32fc_x2_multiply_x2_dot_prod_32fc_, cudaFuncCachePreferShared)); @@ -288,7 +319,8 @@ bool cuda_multicorrelator::init_cuda(const int argc, const char **argv, int sign // ALLOCATE GPU MEMORY FOR INPUT/OUTPUT and INTERNAL vectors size_t size = signal_length_samples * sizeof(GPU_Complex); - + cudaSetDevice(selected_device); //generates a random number between 0 and num_devices to split the threads between GPUs + checkCudaErrors(cudaMalloc((void **)&d_sig_in, size)); //checkCudaErrors(cudaMalloc((void **)&d_nco_in, size)); checkCudaErrors(cudaMalloc((void **)&d_sig_doppler_wiped, size)); @@ -327,7 +359,7 @@ bool cuda_multicorrelator::Carrier_wipeoff_multicorrelator_cuda( { size_t memSize = signal_length_samples * sizeof(std::complex); - + cudaSetDevice(selected_device); //generates a random number between 0 and num_devices to split the threads between GPUs // input signal CPU -> GPU copy memory checkCudaErrors(cudaMemcpyAsync(d_sig_in, sig_in, memSize, @@ -398,6 +430,7 @@ bool cuda_multicorrelator::Carrier_wipeoff_multicorrelator_cuda( bool cuda_multicorrelator::free_cuda() { + cudaSetDevice(selected_device); //generates a random number between 0 and num_devices to split the threads between GPUs // Free device global memory cudaFree(d_sig_in); //cudaFree(d_nco_in); diff --git a/src/algorithms/tracking/libs/cuda_multicorrelator.h b/src/algorithms/tracking/libs/cuda_multicorrelator.h index e29cba53a..1a0f61356 100644 --- a/src/algorithms/tracking/libs/cuda_multicorrelator.h +++ b/src/algorithms/tracking/libs/cuda_multicorrelator.h @@ -138,6 +138,8 @@ private: cudaStream_t stream1; cudaStream_t stream2; + int num_gpu_devices; + int selected_device; }; From 1aa84cd1c44440e58a8a126388ba8e05c52c37eb Mon Sep 17 00:00:00 2001 From: Javier Arribas Date: Thu, 6 Aug 2015 17:05:15 +0200 Subject: [PATCH 5/5] Updated CUDA kernels and several GPU tracking optimizations. Bug fix in GPS_L1_CA_DLL_PLL binary dump --- conf/gnss-sdr_GPS_L1_gr_complex_gpu.conf | 8 +- ...nnel_GPS_L1_Flexiband_bin_file_III_1a.conf | 4 +- ...Galileo_E1B_Flexiband_realtime_III_1b.conf | 30 +- ...el_GPS_L2_M_Flexiband_bin_file_III_1a.conf | 3 +- .../signal_source/adapters/CMakeLists.txt | 2 +- .../tracking/adapters/CMakeLists.txt | 3 +- .../tracking/gnuradio_blocks/CMakeLists.txt | 3 +- .../gps_l1_ca_dll_pll_tracking_cc.cc | 3 +- .../gps_l1_ca_dll_pll_tracking_gpu_cc.cc | 124 ++---- .../gps_l1_ca_dll_pll_tracking_gpu_cc.h | 2 +- src/algorithms/tracking/libs/CMakeLists.txt | 1 - .../tracking/libs/cuda_multicorrelator.cu | 367 ++++++++++++++++-- .../tracking/libs/cuda_multicorrelator.h | 26 +- .../tracking/libs/tracking_2nd_PLL_filter.cc | 2 +- src/core/receiver/CMakeLists.txt | 1 + src/core/receiver/gnss_block_factory.cc | 2 + src/main/CMakeLists.txt | 8 + src/main/main.cc | 16 + 18 files changed, 450 insertions(+), 155 deletions(-) diff --git a/conf/gnss-sdr_GPS_L1_gr_complex_gpu.conf b/conf/gnss-sdr_GPS_L1_gr_complex_gpu.conf index a8e576df7..6bfc9bb6c 100644 --- a/conf/gnss-sdr_GPS_L1_gr_complex_gpu.conf +++ b/conf/gnss-sdr_GPS_L1_gr_complex_gpu.conf @@ -165,7 +165,7 @@ Resampler.sample_freq_out=4000000 ;######### CHANNELS GLOBAL CONFIG ############ ;#count: Number of available GPS satellite channels. -Channels_GPS.count=8 +Channels_GPS.count=1 ;#count: Number of available Galileo satellite channels. Channels_Galileo.count=0 ;#in_acquisition: Number of channels simultaneously acquiring for the whole receiver @@ -229,16 +229,16 @@ Tracking_GPS.item_type=gr_complex Tracking_GPS.if=0 ;#dump: Enable or disable the Tracking internal binary data file logging [true] or [false] -Tracking_GPS.dump=false +Tracking_GPS.dump=true ;#dump_filename: Log path and filename. Notice that the tracking channel will add "x.dat" where x is the channel number. Tracking_GPS.dump_filename=../data/epl_tracking_ch_ ;#pll_bw_hz: PLL loop filter bandwidth [Hz] -Tracking_GPS.pll_bw_hz=45.0; +Tracking_GPS.pll_bw_hz=55.0; ;#dll_bw_hz: DLL loop filter bandwidth [Hz] -Tracking_GPS.dll_bw_hz=2.0; +Tracking_GPS.dll_bw_hz=1.5 ;#fll_bw_hz: FLL loop filter bandwidth [Hz] Tracking_GPS.fll_bw_hz=10.0; diff --git a/conf/gnss-sdr_multichannel_GPS_L1_Flexiband_bin_file_III_1a.conf b/conf/gnss-sdr_multichannel_GPS_L1_Flexiband_bin_file_III_1a.conf index d412bb82d..3e835d2d3 100644 --- a/conf/gnss-sdr_multichannel_GPS_L1_Flexiband_bin_file_III_1a.conf +++ b/conf/gnss-sdr_multichannel_GPS_L1_Flexiband_bin_file_III_1a.conf @@ -29,13 +29,13 @@ GNSS-SDR.SUPL_CI=0x31b0 SignalSource.implementation=Flexiband_Signal_Source SignalSource.flag_read_file=true -SignalSource.signal_file=/datalogger/captures/eclipse/eclipse_IIIa_2.bin +SignalSource.signal_file=/datalogger/L125_III1b_210s.usb ;#item_type: Type and resolution for each of the signal samples. Use only gr_complex in this version. SignalSource.item_type=gr_complex ;# FPGA firmware file -SignalSource.firmware_file=flexiband_III-1a.bit +SignalSource.firmware_file=flexiband_III-1b.bit ;#RF_channels: Number of RF channels present in the frontend device, must agree the FPGA firmware file SignalSource.RF_channels=1 diff --git a/conf/gnss-sdr_multichannel_GPS_L1_L2_Galileo_E1B_Flexiband_realtime_III_1b.conf b/conf/gnss-sdr_multichannel_GPS_L1_L2_Galileo_E1B_Flexiband_realtime_III_1b.conf index 0177fef62..019c0abcc 100644 --- a/conf/gnss-sdr_multichannel_GPS_L1_L2_Galileo_E1B_Flexiband_realtime_III_1b.conf +++ b/conf/gnss-sdr_multichannel_GPS_L1_L2_Galileo_E1B_Flexiband_realtime_III_1b.conf @@ -28,9 +28,9 @@ GNSS-SDR.SUPL_CI=0x31b0 ;#implementation: Use [File_Signal_Source] or [UHD_Signal_Source] or [GN3S_Signal_Source] (experimental) SignalSource.implementation=Flexiband_Signal_Source -SignalSource.flag_read_file=false -#SignalSource.signal_file=/datalogger/signals/Fraunhofer/L125_III1b_210s.usb -SignalSource.signal_file=/datalogger/captures/flexiband_III_1b_cap1.usb +SignalSource.flag_read_file=true +SignalSource.signal_file=/datalogger/L125_III1b_210s.usb +#SignalSource.signal_file=/datalogger/captures/flexiband_III_1b_cap1.usb ;#item_type: Type and resolution for each of the signal samples. Use only gr_complex in this version. SignalSource.item_type=gr_complex @@ -136,8 +136,8 @@ InputFilter0.grid_density=16 InputFilter0.sampling_frequency=20000000 ;# IF deviation due to front-end LO inaccuracies [HZ] ;# WARNING: Fraunhofer front-end hardwareconfigurations can difer. Signals available on http://www.iis.fraunhofer.de/de/ff/lok/leist/test/flexiband.html are centered on 0 Hz, ALL BANDS. -InputFilter0.IF=-205000 -;#InputFilter0.IF=0 +;#InputFilter0.IF=-205000 +InputFilter0.IF=0 ;# Decimation factor after the frequency tranaslating block InputFilter0.decimation_factor=8 @@ -230,8 +230,8 @@ InputFilter1.grid_density=16 InputFilter1.sampling_frequency=20000000 ;# IF deviation due to front-end LO inaccuracies [HZ] ;# WARNING: Fraunhofer front-end hardwareconfigurations can difer. Signals available on http://www.iis.fraunhofer.de/de/ff/lok/leist/test/flexiband.html are centered on 0 Hz, ALL BANDS. -InputFilter1.IF=100000 -;#InputFilter1.IF=0 +;#InputFilter1.IF=100000 +InputFilter1.IF=0 ;# Decimation factor after the frequency tranaslating block InputFilter1.decimation_factor=8 @@ -272,7 +272,7 @@ Resampler2.implementation=Pass_Through ;#count: Number of available GPS satellite channels. Channels_1C.count=8 Channels_1B.count=1 -Channels_2S.count=8 +Channels_2S.count=1 ;#count: Number of available Galileo satellite channels. ;Channels_Galileo.count=0 ;#in_acquisition: Number of channels simultaneously acquiring for the whole receiver @@ -378,13 +378,13 @@ Acquisition_1C.max_dwells=1 ;#implementation: Selected tracking algorithm: [GPS_L1_CA_DLL_PLL_Tracking] or [GPS_L1_CA_DLL_FLL_PLL_Tracking] -Tracking_1C.implementation=GPS_L1_CA_DLL_PLL_Tracking +Tracking_1C.implementation=GPS_L1_CA_DLL_PLL_Tracking_GPU Tracking_1C.item_type=gr_complex Tracking_1C.if=0 -Tracking_1C.dump=true -Tracking_1C.dump_filename=./tracking_ch_ +Tracking_1C.dump=false +Tracking_1C.dump_filename=../data/epl_tracking_ch_ Tracking_1C.pll_bw_hz=40.0; -Tracking_1C.dll_bw_hz=3.0; +Tracking_1C.dll_bw_hz=1.5; Tracking_1C.fll_bw_hz=10.0; Tracking_1C.order=3; Tracking_1C.early_late_space_chips=0.5; @@ -405,7 +405,7 @@ Acquisition_2S.max_dwells=1 Tracking_2S.implementation=GPS_L2_M_DLL_PLL_Tracking Tracking_2S.item_type=gr_complex Tracking_2S.if=0 -Tracking_2S.dump=true +Tracking_2S.dump=false Tracking_2S.dump_filename=./tracking_ch_ Tracking_2S.pll_bw_hz=1.5; Tracking_2S.dll_bw_hz=0.3; @@ -447,7 +447,7 @@ Tracking_1B.item_type=gr_complex Tracking_1B.if=0 ;#dump: Enable or disable the Tracking internal binary data file logging [true] or [false] -Tracking_1B.dump=true +Tracking_1B.dump=false ;#dump_filename: Log path and filename. Notice that the tracking channel will add "x.dat" where x is the channel number. Tracking_1B.dump_filename=./veml_tracking_ch_ @@ -497,7 +497,7 @@ TelemetryDecoder_1B.decimation_factor=5; Observables.implementation=Mixed_Observables ;#dump: Enable or disable the Observables internal binary data file logging [true] or [false] -Observables.dump=true +Observables.dump=false ;#dump_filename: Log path and filename. Observables.dump_filename=./observables.dat diff --git a/conf/gnss-sdr_multichannel_GPS_L2_M_Flexiband_bin_file_III_1a.conf b/conf/gnss-sdr_multichannel_GPS_L2_M_Flexiband_bin_file_III_1a.conf index 4b26ea9e4..debff1ae1 100644 --- a/conf/gnss-sdr_multichannel_GPS_L2_M_Flexiband_bin_file_III_1a.conf +++ b/conf/gnss-sdr_multichannel_GPS_L2_M_Flexiband_bin_file_III_1a.conf @@ -135,7 +135,8 @@ InputFilter0.grid_density=16 ; i.e. using front-end-cal as reported here:http://www.cttc.es/publication/turning-a-television-into-a-gnss-receiver/ InputFilter0.sampling_frequency=20000000 ;# IF deviation due to front-end LO inaccuracies [HZ] -InputFilter0.IF=-205000 +;#InputFilter0.IF=-205000 +InputFilter0.IF=0 ;# Decimation factor after the frequency tranaslating block InputFilter0.decimation_factor=4 diff --git a/src/algorithms/signal_source/adapters/CMakeLists.txt b/src/algorithms/signal_source/adapters/CMakeLists.txt index e2775865e..f7b533b40 100644 --- a/src/algorithms/signal_source/adapters/CMakeLists.txt +++ b/src/algorithms/signal_source/adapters/CMakeLists.txt @@ -58,7 +58,7 @@ if(ENABLE_FLEXIBAND) if(OS_IS_MACOSX) set(MACOSX_ARGS "-DCMAKE_CXX_COMPILER=/usr/bin/clang++") endif(OS_IS_MACOSX) - find_package(teleorbit REQUIRED) + find_package(Teleorbit REQUIRED) if(NOT TELEORBIT_FOUND) message(FATAL_ERROR "Teleorbit Flexiband GNURadio driver required to build gnss-sdr with the optional FLEXIBAND adapter") endif(NOT TELEORBIT_FOUND) diff --git a/src/algorithms/tracking/adapters/CMakeLists.txt b/src/algorithms/tracking/adapters/CMakeLists.txt index 3e92ac60e..c712cd981 100644 --- a/src/algorithms/tracking/adapters/CMakeLists.txt +++ b/src/algorithms/tracking/adapters/CMakeLists.txt @@ -18,6 +18,7 @@ if(ENABLE_CUDA) FIND_PACKAGE(CUDA REQUIRED) + set(OPT_TRACKING_ADAPTERS ${OPT_TRACKING_ADAPTERS} gps_l1_ca_dll_pll_tracking_gpu.cc) endif(ENABLE_CUDA) set(TRACKING_ADAPTER_SOURCES @@ -30,7 +31,7 @@ set(TRACKING_ADAPTER_SOURCES gps_l1_ca_tcp_connector_tracking.cc galileo_e5a_dll_pll_tracking.cc gps_l2_m_dll_pll_tracking.cc - gps_l1_ca_dll_pll_tracking_gpu.cc + ${OPT_TRACKING_ADAPTERS} ) include_directories( diff --git a/src/algorithms/tracking/gnuradio_blocks/CMakeLists.txt b/src/algorithms/tracking/gnuradio_blocks/CMakeLists.txt index 8589832a7..a018fe10e 100644 --- a/src/algorithms/tracking/gnuradio_blocks/CMakeLists.txt +++ b/src/algorithms/tracking/gnuradio_blocks/CMakeLists.txt @@ -19,6 +19,7 @@ if(ENABLE_CUDA) FIND_PACKAGE(CUDA REQUIRED) + set(OPT_TRACKING_BLOCKS ${OPT_TRACKING_BLOCKS} gps_l1_ca_dll_pll_tracking_gpu_cc.cc) endif(ENABLE_CUDA) set(TRACKING_GR_BLOCKS_SOURCES @@ -31,7 +32,7 @@ set(TRACKING_GR_BLOCKS_SOURCES gps_l1_ca_tcp_connector_tracking_cc.cc galileo_e5a_dll_pll_tracking_cc.cc gps_l2_m_dll_pll_tracking_cc.cc - gps_l1_ca_dll_pll_tracking_gpu_cc.cc + ${OPT_TRACKING_BLOCKS} ) include_directories( diff --git a/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_cc.cc b/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_cc.cc index 85d63f54c..3193d335e 100644 --- a/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_cc.cc +++ b/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_cc.cc @@ -594,7 +594,8 @@ int Gps_L1_Ca_Dll_Pll_Tracking_cc::general_work (int noutput_items, gr_vector_in // carrier and code frequency d_dump_file.write(reinterpret_cast(&d_carrier_doppler_hz), sizeof(float)); - d_dump_file.write(reinterpret_cast(&d_code_freq_chips), sizeof(float)); + tmp_float=d_code_freq_chips; + d_dump_file.write(reinterpret_cast(&tmp_float), sizeof(float)); //PLL commands d_dump_file.write(reinterpret_cast(&carr_error_hz), sizeof(float)); diff --git a/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_gpu_cc.cc b/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_gpu_cc.cc index 3de611135..1cf5d038e 100644 --- a/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_gpu_cc.cc +++ b/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_gpu_cc.cc @@ -81,7 +81,6 @@ gps_l1_ca_dll_pll_make_tracking_gpu_cc( } - void Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::forecast (int noutput_items, gr_vector_int &ninput_items_required) { @@ -120,14 +119,19 @@ Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc( // Initialization of local code replica // Get space for a vector with the C/A code replica sampled 1x/chip - d_ca_code = static_cast(volk_malloc((GPS_L1_CA_CODE_LENGTH_CHIPS + 2) * sizeof(gr_complex), volk_get_alignment())); - + //d_ca_code = static_cast(volk_malloc((GPS_L1_CA_CODE_LENGTH_CHIPS + 2) * sizeof(gr_complex), volk_get_alignment())); + d_ca_code = static_cast(volk_malloc((GPS_L1_CA_CODE_LENGTH_CHIPS) * sizeof(gr_complex), volk_get_alignment())); multicorrelator_gpu = new cuda_multicorrelator(); int N_CORRELATORS=3; - multicorrelator_gpu->init_cuda(0, NULL, 2 * d_vector_length , 2 * d_vector_length , N_CORRELATORS); + //local code resampler on CPU (old) + //multicorrelator_gpu->init_cuda(0, NULL, 2 * d_vector_length , 2 * d_vector_length , N_CORRELATORS); + + //local code resampler on GPU (new) + multicorrelator_gpu->init_cuda_integrated_resampler(0, NULL, 2 * d_vector_length , GPS_L1_CA_CODE_LENGTH_CHIPS , N_CORRELATORS); + // Get space for the resampled early / prompt / late local replicas - checkCudaErrors(cudaHostAlloc((void**)&d_local_code_shift_samples, N_CORRELATORS * sizeof(int), cudaHostAllocMapped )); + checkCudaErrors(cudaHostAlloc((void**)&d_local_code_shift_chips, N_CORRELATORS * sizeof(float), cudaHostAllocMapped )); //allocate host memory @@ -138,7 +142,7 @@ Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc( //checkCudaErrors(cudaHostAlloc((void**)&d_local_codes_gpu, (V_LEN * sizeof(gr_complex))*N_CORRELATORS, cudaHostAllocWriteCombined )); //new integrated shifts - checkCudaErrors(cudaHostAlloc((void**)&d_local_codes_gpu, (2 * d_vector_length * sizeof(gr_complex)), cudaHostAllocWriteCombined )); + //checkCudaErrors(cudaHostAlloc((void**)&d_local_codes_gpu, (2 * d_vector_length * sizeof(gr_complex)), cudaHostAllocWriteCombined )); // correlator outputs (scalar) checkCudaErrors(cudaHostAlloc((void**)&d_corr_outs_gpu ,sizeof(gr_complex)*N_CORRELATORS, cudaHostAllocWriteCombined )); @@ -242,9 +246,13 @@ void Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::start_tracking() d_code_loop_filter.initialize(); // initialize the code filter // generate local reference ALWAYS starting at chip 1 (1 sample per chip) - gps_l1_ca_code_gen_complex(&d_ca_code[1], d_acquisition_gnss_synchro->PRN, 0); - d_ca_code[0] = d_ca_code[static_cast(GPS_L1_CA_CODE_LENGTH_CHIPS)]; - d_ca_code[static_cast(GPS_L1_CA_CODE_LENGTH_CHIPS) + 1] = d_ca_code[1]; + gps_l1_ca_code_gen_complex(d_ca_code, d_acquisition_gnss_synchro->PRN, 0); + + d_local_code_shift_chips[0]=-d_early_late_spc_chips; + d_local_code_shift_chips[1]=0.0; + d_local_code_shift_chips[2]=d_early_late_spc_chips; + + multicorrelator_gpu->set_local_code_and_taps(GPS_L1_CA_CODE_LENGTH_CHIPS,d_ca_code, d_local_code_shift_chips,3); d_carrier_lock_fail_counter = 0; d_rem_code_phase_samples = 0; @@ -272,40 +280,6 @@ void Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::start_tracking() } - - - -void Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::update_local_code() -{ - double tcode_chips; - double rem_code_phase_chips; - int associated_chip_index; - int code_length_chips = static_cast(GPS_L1_CA_CODE_LENGTH_CHIPS); - double code_phase_step_chips; - int epl_loop_length_samples; - - // unified loop for E, P, L code vectors - code_phase_step_chips = static_cast(d_code_freq_chips) / static_cast(d_fs_in); - rem_code_phase_chips = d_rem_code_phase_samples * (d_code_freq_chips / d_fs_in); - tcode_chips = -rem_code_phase_chips; - - // Alternative EPL code generation (40% of speed improvement!) - d_local_code_shift_samples[0]=0; - d_local_code_shift_samples[1]=round(d_early_late_spc_chips / code_phase_step_chips); - d_local_code_shift_samples[2]=round((2*d_early_late_spc_chips) / code_phase_step_chips); - - epl_loop_length_samples = d_current_prn_length_samples + d_local_code_shift_samples[2]; //maximum length - - for (int i = 0; i < epl_loop_length_samples; i++) - { - associated_chip_index = 1 + round(fmod(tcode_chips - d_early_late_spc_chips, code_length_chips)); - d_local_codes_gpu[i] = d_ca_code[associated_chip_index]; - tcode_chips = tcode_chips + code_phase_step_chips; - } - -} - - Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::~Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc() { d_dump_file.close(); @@ -313,7 +287,7 @@ Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::~Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc() cudaFreeHost(in_gpu); cudaFreeHost(d_carr_sign_gpu); cudaFreeHost(d_corr_outs_gpu); - cudaFreeHost(d_local_codes_gpu); + cudaFreeHost(d_local_code_shift_chips); multicorrelator_gpu->free_cuda(); delete(multicorrelator_gpu); @@ -329,10 +303,10 @@ int Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::general_work (int noutput_items, gr_vecto gr_vector_const_void_star &input_items, gr_vector_void_star &output_items) { // process vars - float carr_error_hz; - float carr_error_filt_hz; - float code_error_chips; - float code_error_filt_chips; + float carr_error_hz=0.0; + float carr_error_filt_hz=0.0; + float code_error_chips=0.0; + float code_error_filt_chips=0.0; // Block input data and block output stream pointers const gr_complex* in = (gr_complex*) input_items[0]; @@ -341,23 +315,17 @@ int Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::general_work (int noutput_items, gr_vecto // GNSS_SYNCHRO OBJECT to interchange data between tracking->telemetry_decoder Gnss_Synchro current_synchro_data = Gnss_Synchro(); - if (d_enable_tracking == true) { // Receiver signal alignment if (d_pull_in == true) { int samples_offset; - float acq_trk_shif_correction_samples; int acq_to_trk_delay_samples; acq_to_trk_delay_samples = d_sample_counter - d_acq_sample_stamp; - acq_trk_shif_correction_samples = d_current_prn_length_samples - fmod(static_cast(acq_to_trk_delay_samples), static_cast(d_current_prn_length_samples)); - samples_offset = round(d_acq_code_phase_samples + acq_trk_shif_correction_samples); - // /todo: Check if the sample counter sent to the next block as a time reference should be incremented AFTER sended or BEFORE - //d_sample_counter_seconds = d_sample_counter_seconds + (((double)samples_offset) / static_cast(d_fs_in)); + samples_offset = round(d_acq_code_phase_samples)+d_current_prn_length_samples - acq_to_trk_delay_samples%d_current_prn_length_samples; d_sample_counter = d_sample_counter + samples_offset; //count for the processed samples d_pull_in = false; - //std::cout<<" samples_offset="<(GPS_TWO_PI) * d_carrier_doppler_hz / static_cast(d_fs_in); - //std::cout<<"d_current_prn_length_samples="<Carrier_wipeoff_multicorrelator_cuda( + + //code resampler on GPU (new) + float code_phase_step_chips = static_cast(d_code_freq_chips) / static_cast(d_fs_in); + float rem_code_phase_chips = d_rem_code_phase_samples * (d_code_freq_chips / d_fs_in); + + cudaProfilerStart(); + multicorrelator_gpu->Carrier_wipeoff_multicorrelator_resampler_cuda( d_corr_outs_gpu, in, - d_local_codes_gpu, d_rem_carr_phase_rad, phase_step_rad, - d_local_code_shift_samples, + code_phase_step_chips, + rem_code_phase_chips, d_current_prn_length_samples, 3); cudaProfilerStop(); - //std::cout<<"d_Prompt="<<*d_Prompt<<"d_Early="<<*d_Early<<"d_Late="<<*d_Late<(d_sample_counter) / static_cast(d_fs_in); - current_synchro_data.Carrier_phase_rads = 0.0; - current_synchro_data.Code_phase_secs = 0.0; - current_synchro_data.CN0_dB_hz = 0.0; - current_synchro_data.Flag_valid_tracking = false; - current_synchro_data.Flag_valid_pseudorange = false; - - *out[0] = current_synchro_data; - return 1; - } // ################## PLL ########################################################## // PLL discriminator @@ -444,8 +390,7 @@ int Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::general_work (int noutput_items, gr_vecto T_chip_seconds = 1 / static_cast(d_code_freq_chips); T_prn_seconds = T_chip_seconds * GPS_L1_CA_CODE_LENGTH_CHIPS; T_prn_samples = T_prn_seconds * static_cast(d_fs_in); - K_blk_samples = T_prn_samples + d_rem_code_phase_samples + code_error_filt_secs * static_cast(d_fs_in); - d_current_prn_length_samples = round(K_blk_samples); //round to a discrete samples + K_blk_samples = T_prn_samples + d_rem_code_phase_samples + static_cast(code_error_filt_secs) * static_cast(d_fs_in); //d_rem_code_phase_samples = K_blk_samples - d_current_prn_length_samples; //rounding error < 1 sample // ####### CN0 ESTIMATION AND LOCK DETECTORS ###### @@ -591,7 +536,8 @@ int Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::general_work (int noutput_items, gr_vecto // carrier and code frequency d_dump_file.write(reinterpret_cast(&d_carrier_doppler_hz), sizeof(float)); - d_dump_file.write(reinterpret_cast(&d_code_freq_chips), sizeof(float)); + tmp_float=d_code_freq_chips; + d_dump_file.write(reinterpret_cast(&tmp_float), sizeof(float)); //PLL commands d_dump_file.write(reinterpret_cast(&carr_error_hz), sizeof(float)); diff --git a/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_gpu_cc.h b/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_gpu_cc.h index cf166fcff..644751ed7 100644 --- a/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_gpu_cc.h +++ b/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_gpu_cc.h @@ -130,7 +130,7 @@ private: gr_complex* in_gpu; gr_complex* d_carr_sign_gpu; gr_complex* d_local_codes_gpu; - int* d_local_code_shift_samples; + float* d_local_code_shift_chips; gr_complex* d_corr_outs_gpu; cuda_multicorrelator *multicorrelator_gpu; diff --git a/src/algorithms/tracking/libs/CMakeLists.txt b/src/algorithms/tracking/libs/CMakeLists.txt index 175646aaf..e6f66fa68 100644 --- a/src/algorithms/tracking/libs/CMakeLists.txt +++ b/src/algorithms/tracking/libs/CMakeLists.txt @@ -33,7 +33,6 @@ if(ENABLE_CUDA) SET(LIB_TYPE STATIC) #set the lib type CUDA_ADD_LIBRARY(CUDA_CORRELATOR_LIB ${LIB_TYPE} cuda_multicorrelator.h cuda_multicorrelator.cu) - endif(ENABLE_CUDA) diff --git a/src/algorithms/tracking/libs/cuda_multicorrelator.cu b/src/algorithms/tracking/libs/cuda_multicorrelator.cu index 3f027cb3d..166bca3c9 100644 --- a/src/algorithms/tracking/libs/cuda_multicorrelator.cu +++ b/src/algorithms/tracking/libs/cuda_multicorrelator.cu @@ -53,7 +53,83 @@ #include #include -#define ACCUM_N 1024 +#define ACCUM_N 256 + + +__global__ void scalarProdGPUCPXxN_shifts_chips( + GPU_Complex *d_corr_out, + GPU_Complex *d_sig_in, + GPU_Complex *d_local_code_in, + float *d_shifts_chips, + float code_length_chips, + float code_phase_step_chips, + float rem_code_phase_chips, + int vectorN, + int elementN +) +{ + //Accumulators cache + __shared__ GPU_Complex accumResult[ACCUM_N]; + + //////////////////////////////////////////////////////////////////////////// + // Cycle through every pair of vectors, + // taking into account that vector counts can be different + // from total number of thread blocks + //////////////////////////////////////////////////////////////////////////// + for (int vec = blockIdx.x; vec < vectorN; vec += gridDim.x) + { + //int vectorBase = IMUL(elementN, vec); + //int vectorEnd = elementN; + + //////////////////////////////////////////////////////////////////////// + // Each accumulator cycles through vectors with + // stride equal to number of total number of accumulators ACCUM_N + // At this stage ACCUM_N is only preferred be a multiple of warp size + // to meet memory coalescing alignment constraints. + //////////////////////////////////////////////////////////////////////// + for (int iAccum = threadIdx.x; iAccum < ACCUM_N; iAccum += blockDim.x) + { + GPU_Complex sum = GPU_Complex(0,0); + + for (int pos = iAccum; pos < elementN; pos += ACCUM_N) + { + //sum = sum + d_sig_in[pos-vectorBase] * d_nco_in[pos-vectorBase] * d_local_codes_in[pos]; + //sum = sum + d_sig_in[pos-vectorBase] * d_local_codes_in[pos]; + //sum.multiply_acc(d_sig_in[pos],d_local_codes_in[pos+d_shifts_samples[vec]]); + + // 1.resample local code for the current shift + float local_code_chip_index= fmod(code_phase_step_chips*(float)pos + d_shifts_chips[vec] - rem_code_phase_chips, code_length_chips); + //TODO: Take into account that in multitap correlators, the shifts can be negative! + if (local_code_chip_index<0.0) local_code_chip_index+=code_length_chips; + + // 2.correlate + sum.multiply_acc(d_sig_in[pos],d_local_code_in[__float2int_rd(local_code_chip_index)]); + + } + accumResult[iAccum] = sum; + } + + //////////////////////////////////////////////////////////////////////// + // Perform tree-like reduction of accumulators' results. + // ACCUM_N has to be power of two at this stage + //////////////////////////////////////////////////////////////////////// + for (int stride = ACCUM_N / 2; stride > 0; stride >>= 1) + { + __syncthreads(); + + for (int iAccum = threadIdx.x; iAccum < stride; iAccum += blockDim.x) + { + accumResult[iAccum] += accumResult[stride + iAccum]; + } + } + + if (threadIdx.x == 0) + { + d_corr_out[vec] = accumResult[0]; + } + } +} + /////////////////////////////////////////////////////////////////////////////// // Calculate scalar products of VectorN vectors of ElementN elements on GPU @@ -145,8 +221,9 @@ __global__ void scalarProdGPUCPXxN( //////////////////////////////////////////////////////////////////////////// for (int vec = blockIdx.x; vec < vectorN; vec += gridDim.x) { - int vectorBase = IMUL(elementN, vec); - int vectorEnd = vectorBase + elementN; + //int vectorBase = IMUL(elementN, vec); + //int vectorEnd = vectorBase + elementN; + //////////////////////////////////////////////////////////////////////// // Each accumulator cycles through vectors with @@ -158,11 +235,13 @@ __global__ void scalarProdGPUCPXxN( { GPU_Complex sum = GPU_Complex(0,0); - for (int pos = vectorBase + iAccum; pos < vectorEnd; pos += ACCUM_N) + //for (int pos = vectorBase + iAccum; pos < vectorEnd; pos += ACCUM_N) + for (int pos = iAccum; pos < elementN; pos += ACCUM_N) { //sum = sum + d_sig_in[pos-vectorBase] * d_nco_in[pos-vectorBase] * d_local_codes_in[pos]; //sum = sum + d_sig_in[pos-vectorBase] * d_local_codes_in[pos]; - sum.multiply_acc(d_sig_in[pos-vectorBase],d_local_codes_in[pos]); + //sum.multiply_acc(d_sig_in[pos-vectorBase],d_local_codes_in[pos]); + sum.multiply_acc(d_sig_in[pos],d_local_codes_in[pos]); } accumResult[iAccum] = sum; } @@ -200,9 +279,9 @@ __global__ void scalarProdGPUCPXxN( */ __global__ void CUDA_32fc_x2_multiply_32fc( GPU_Complex *A, GPU_Complex *B, GPU_Complex *C, int numElements) { - int i = blockDim.x * blockIdx.x + threadIdx.x; - - if (i < numElements) + for (int i = blockIdx.x * blockDim.x + threadIdx.x; + i < numElements; + i += blockDim.x * gridDim.x) { C[i] = A[i] * B[i]; } @@ -232,10 +311,11 @@ CUDA_32fc_Doppler_wipeoff( GPU_Complex *sig_out, GPU_Complex *sig_in, float rem // CUDA version of floating point NCO and vector dot product integrated - int i = blockDim.x * blockIdx.x + threadIdx.x; float sin; float cos; - if (i < numElements) + for (int i = blockIdx.x * blockDim.x + threadIdx.x; + i < numElements; + i += blockDim.x * gridDim.x) { __sincosf(rem_carrier_phase_in_rad + i*phase_step_rad, &sin, &cos); sig_out[i] = sig_in[i] * GPU_Complex(cos,-sin); @@ -252,11 +332,11 @@ CUDA_32fc_Doppler_wipeoff( GPU_Complex *sig_out, GPU_Complex *sig_in, float rem __global__ void CUDA_32fc_x2_add_32fc( GPU_Complex *A, GPU_Complex *B, GPU_Complex *C, int numElements) { - int i = blockDim.x * blockIdx.x + threadIdx.x; - - if (i < numElements) + for (int i = blockIdx.x * blockDim.x + threadIdx.x; + i < numElements; + i += blockDim.x * gridDim.x) { - C[i] = A[i] * B[i]; + C[i] = A[i] + B[i]; } } @@ -264,23 +344,53 @@ CUDA_32fc_x2_add_32fc( GPU_Complex *A, GPU_Complex *B, GPU_Complex *C, int bool cuda_multicorrelator::init_cuda(const int argc, const char **argv, int signal_length_samples, int local_codes_length_samples, int n_correlators) { // use command-line specified CUDA device, otherwise use device with highest Gflops/s - findCudaDevice(argc, (const char **)argv); - - cudaDeviceProp prop; - int whichDevice; - cudaGetDevice( &whichDevice ); - cudaGetDeviceProperties( &prop, whichDevice ); - //debug code - if (prop.canMapHostMemory != 1) { - printf( "Device can not map memory.\n" ); - } - - printf("L2 Cache size= %u \n",prop.l2CacheSize); - printf("maxThreadsPerBlock= %u \n",prop.maxThreadsPerBlock); - printf("maxGridSize= %i \n",prop.maxGridSize[0]); - printf("sharedMemPerBlock= %lu \n",prop.sharedMemPerBlock); - printf("deviceOverlap= %i \n",prop.deviceOverlap); - //end debug code +// findCudaDevice(argc, (const char **)argv); +// cudaDeviceProp prop; +// int num_devices, device; +// cudaGetDeviceCount(&num_devices); +// if (num_devices > 1) { +// int max_multiprocessors = 0, max_device = 0; +// for (device = 0; device < num_devices; device++) { +// cudaDeviceProp properties; +// cudaGetDeviceProperties(&properties, device); +// if (max_multiprocessors < properties.multiProcessorCount) { +// max_multiprocessors = properties.multiProcessorCount; +// max_device = device; +// } +// printf("Found GPU device # %i\n",device); +// } +// //cudaSetDevice(max_device); +// +// //set random device! +// cudaSetDevice(rand() % num_devices); //generates a random number between 0 and num_devices to split the threads between GPUs +// +// cudaGetDeviceProperties( &prop, max_device ); +// //debug code +// if (prop.canMapHostMemory != 1) { +// printf( "Device can not map memory.\n" ); +// } +// printf("L2 Cache size= %u \n",prop.l2CacheSize); +// printf("maxThreadsPerBlock= %u \n",prop.maxThreadsPerBlock); +// printf("maxGridSize= %i \n",prop.maxGridSize[0]); +// printf("sharedMemPerBlock= %lu \n",prop.sharedMemPerBlock); +// printf("deviceOverlap= %i \n",prop.deviceOverlap); +// printf("multiProcessorCount= %i \n",prop.multiProcessorCount); +// }else{ +// int whichDevice; +// cudaGetDevice( &whichDevice ); +// cudaGetDeviceProperties( &prop, whichDevice ); +// //debug code +// if (prop.canMapHostMemory != 1) { +// printf( "Device can not map memory.\n" ); +// } +// +// printf("L2 Cache size= %u \n",prop.l2CacheSize); +// printf("maxThreadsPerBlock= %u \n",prop.maxThreadsPerBlock); +// printf("maxGridSize= %i \n",prop.maxGridSize[0]); +// printf("sharedMemPerBlock= %lu \n",prop.sharedMemPerBlock); +// printf("deviceOverlap= %i \n",prop.deviceOverlap); +// printf("multiProcessorCount= %i \n",prop.multiProcessorCount); +// } //checkCudaErrors(cudaFuncSetCacheConfig(CUDA_32fc_x2_multiply_x2_dot_prod_32fc_, cudaFuncCachePreferShared)); @@ -300,7 +410,7 @@ bool cuda_multicorrelator::init_cuda(const int argc, const char **argv, int sign // Required: The last correlator tap in d_shifts_samples has the largest sample shift size_t size_local_code_bytes = local_codes_length_samples * sizeof(GPU_Complex); checkCudaErrors(cudaMalloc((void **)&d_local_codes_in, size_local_code_bytes)); - checkCudaErrors(cudaMalloc((void **)&d_shifts_samples, size+sizeof(int)*n_correlators)); + checkCudaErrors(cudaMalloc((void **)&d_shifts_samples, sizeof(int)*n_correlators)); //scalars checkCudaErrors(cudaMalloc((void **)&d_corr_out, sizeof(std::complex)*n_correlators)); @@ -315,6 +425,116 @@ bool cuda_multicorrelator::init_cuda(const int argc, const char **argv, int sign } +bool cuda_multicorrelator::init_cuda_integrated_resampler( + const int argc, const char **argv, + int signal_length_samples, + int code_length_chips, + int n_correlators + ) +{ + // use command-line specified CUDA device, otherwise use device with highest Gflops/s +// findCudaDevice(argc, (const char **)argv); +// cudaDeviceProp prop; +// int num_devices, device; +// cudaGetDeviceCount(&num_devices); +// if (num_devices > 1) { +// int max_multiprocessors = 0, max_device = 0; +// for (device = 0; device < num_devices; device++) { +// cudaDeviceProp properties; +// cudaGetDeviceProperties(&properties, device); +// if (max_multiprocessors < properties.multiProcessorCount) { +// max_multiprocessors = properties.multiProcessorCount; +// max_device = device; +// } +// printf("Found GPU device # %i\n",device); +// } +// //cudaSetDevice(max_device); +// +// //set random device! +// cudaSetDevice(rand() % num_devices); //generates a random number between 0 and num_devices to split the threads between GPUs +// +// cudaGetDeviceProperties( &prop, max_device ); +// //debug code +// if (prop.canMapHostMemory != 1) { +// printf( "Device can not map memory.\n" ); +// } +// printf("L2 Cache size= %u \n",prop.l2CacheSize); +// printf("maxThreadsPerBlock= %u \n",prop.maxThreadsPerBlock); +// printf("maxGridSize= %i \n",prop.maxGridSize[0]); +// printf("sharedMemPerBlock= %lu \n",prop.sharedMemPerBlock); +// printf("deviceOverlap= %i \n",prop.deviceOverlap); +// printf("multiProcessorCount= %i \n",prop.multiProcessorCount); +// }else{ +// int whichDevice; +// cudaGetDevice( &whichDevice ); +// cudaGetDeviceProperties( &prop, whichDevice ); +// //debug code +// if (prop.canMapHostMemory != 1) { +// printf( "Device can not map memory.\n" ); +// } +// +// printf("L2 Cache size= %u \n",prop.l2CacheSize); +// printf("maxThreadsPerBlock= %u \n",prop.maxThreadsPerBlock); +// printf("maxGridSize= %i \n",prop.maxGridSize[0]); +// printf("sharedMemPerBlock= %lu \n",prop.sharedMemPerBlock); +// printf("deviceOverlap= %i \n",prop.deviceOverlap); +// printf("multiProcessorCount= %i \n",prop.multiProcessorCount); +// } + + //checkCudaErrors(cudaFuncSetCacheConfig(CUDA_32fc_x2_multiply_x2_dot_prod_32fc_, cudaFuncCachePreferShared)); + + // ALLOCATE GPU MEMORY FOR INPUT/OUTPUT and INTERNAL vectors + + size_t size = signal_length_samples * sizeof(GPU_Complex); + + checkCudaErrors(cudaMalloc((void **)&d_sig_in, size)); + checkCudaErrors(cudaMemset(d_sig_in,0,size)); + + //checkCudaErrors(cudaMalloc((void **)&d_nco_in, size)); + checkCudaErrors(cudaMalloc((void **)&d_sig_doppler_wiped, size)); + checkCudaErrors(cudaMemset(d_sig_doppler_wiped,0,size)); + + checkCudaErrors(cudaMalloc((void **)&d_local_codes_in, sizeof(std::complex)*code_length_chips)); + checkCudaErrors(cudaMemset(d_local_codes_in,0,sizeof(std::complex)*code_length_chips)); + + d_code_length_chips=code_length_chips; + + checkCudaErrors(cudaMalloc((void **)&d_shifts_chips, sizeof(float)*n_correlators)); + checkCudaErrors(cudaMemset(d_shifts_chips,0,sizeof(float)*n_correlators)); + + //scalars + checkCudaErrors(cudaMalloc((void **)&d_corr_out, sizeof(std::complex)*n_correlators)); + checkCudaErrors(cudaMemset(d_corr_out,0,sizeof(std::complex)*n_correlators)); + + // Launch the Vector Add CUDA Kernel + threadsPerBlock = 256; + blocksPerGrid =(int)(signal_length_samples+threadsPerBlock-1)/threadsPerBlock; + + cudaStreamCreate (&stream1) ; + cudaStreamCreate (&stream2) ; + return true; +} + +bool cuda_multicorrelator::set_local_code_and_taps( + int code_length_chips, + const std::complex* local_codes_in, + float *shifts_chips, + int n_correlators + ) +{ + // local code CPU -> GPU copy memory + checkCudaErrors(cudaMemcpyAsync(d_local_codes_in, local_codes_in, sizeof(GPU_Complex)*code_length_chips, cudaMemcpyHostToDevice,stream1)); + d_code_length_chips=(float)code_length_chips; + + // Correlator shifts vector CPU -> GPU copy memory (fractional chip shifts are allowed!) + checkCudaErrors(cudaMemcpyAsync(d_shifts_chips, shifts_chips, sizeof(float)*n_correlators, + cudaMemcpyHostToDevice,stream1)); + + return true; +} + + + bool cuda_multicorrelator::Carrier_wipeoff_multicorrelator_cuda( std::complex* corr_out, const std::complex* sig_in, @@ -396,13 +616,88 @@ bool cuda_multicorrelator::Carrier_wipeoff_multicorrelator_cuda( return true; } +bool cuda_multicorrelator::Carrier_wipeoff_multicorrelator_resampler_cuda( + std::complex* corr_out, + const std::complex* sig_in, + float rem_carrier_phase_in_rad, + float phase_step_rad, + float code_phase_step_chips, + float rem_code_phase_chips, + int signal_length_samples, + int n_correlators) + { + + size_t memSize = signal_length_samples * sizeof(std::complex); + // input signal CPU -> GPU copy memory + checkCudaErrors(cudaMemcpyAsync(d_sig_in, sig_in, memSize, + cudaMemcpyHostToDevice, stream2)); + + //***** NOTICE: NCO is computed on-the-fly, not need to copy NCO into GPU! **** + + //Launch carrier wipe-off kernel here, while local codes are being copied to GPU! + checkCudaErrors(cudaStreamSynchronize(stream2)); + + CUDA_32fc_Doppler_wipeoff<<>>(d_sig_doppler_wiped, d_sig_in,rem_carrier_phase_in_rad,phase_step_rad, signal_length_samples); + + //wait for Doppler wipeoff end... + checkCudaErrors(cudaStreamSynchronize(stream1)); + checkCudaErrors(cudaStreamSynchronize(stream2)); + + //launch the multitap correlator with integrated local code resampler! + + scalarProdGPUCPXxN_shifts_chips<<>>( + d_corr_out, + d_sig_doppler_wiped, + d_local_codes_in, + d_shifts_chips, + d_code_length_chips, + code_phase_step_chips, + rem_code_phase_chips, + n_correlators, + signal_length_samples + ); + + checkCudaErrors(cudaGetLastError()); + //wait for correlators end... + checkCudaErrors(cudaStreamSynchronize(stream1)); + // Copy the device result vector in device memory to the host result vector + // in host memory. + + //scalar products (correlators outputs) + checkCudaErrors(cudaMemcpyAsync(corr_out, d_corr_out, sizeof(std::complex)*n_correlators, + cudaMemcpyDeviceToHost,stream1)); + checkCudaErrors(cudaStreamSynchronize(stream1)); + return true; +} + + +cuda_multicorrelator::cuda_multicorrelator() +{ + d_sig_in=NULL; + d_nco_in=NULL; + d_sig_doppler_wiped=NULL; + d_local_codes_in=NULL; + d_shifts_samples=NULL; + d_shifts_chips=NULL; + d_corr_out=NULL; + threadsPerBlock=0; + blocksPerGrid=0; + d_code_length_chips=0; +} + bool cuda_multicorrelator::free_cuda() { // Free device global memory - cudaFree(d_sig_in); - //cudaFree(d_nco_in); - cudaFree(d_local_codes_in); - cudaFree(d_corr_out); + if (d_sig_in!=NULL) cudaFree(d_sig_in); + if (d_nco_in!=NULL) cudaFree(d_nco_in); + if (d_sig_doppler_wiped!=NULL) cudaFree(d_sig_doppler_wiped); + if (d_local_codes_in!=NULL) cudaFree(d_local_codes_in); + if (d_corr_out!=NULL) cudaFree(d_corr_out); + + + if (d_shifts_samples!=NULL) cudaFree(d_shifts_samples); + if (d_shifts_chips!=NULL) cudaFree(d_shifts_chips); + cudaStreamDestroy(stream1) ; cudaStreamDestroy(stream2) ; diff --git a/src/algorithms/tracking/libs/cuda_multicorrelator.h b/src/algorithms/tracking/libs/cuda_multicorrelator.h index e29cba53a..e9bd1357f 100644 --- a/src/algorithms/tracking/libs/cuda_multicorrelator.h +++ b/src/algorithms/tracking/libs/cuda_multicorrelator.h @@ -113,8 +113,20 @@ struct GPU_Complex_Short { class cuda_multicorrelator { public: + cuda_multicorrelator(); bool init_cuda(const int argc, const char **argv, int signal_length_samples, int local_codes_length_samples, int n_correlators); - + bool init_cuda_integrated_resampler( + const int argc, const char **argv, + int signal_length_samples, + int code_length_chips, + int n_correlators + ); + bool set_local_code_and_taps( + int code_length_chips, + const std::complex* local_codes_in, + float *shifts_chips, + int n_correlators + ); bool free_cuda(); bool Carrier_wipeoff_multicorrelator_cuda( std::complex* corr_out, @@ -125,6 +137,15 @@ public: const int *shifts_samples, int signal_length_samples, int n_correlators); + bool Carrier_wipeoff_multicorrelator_resampler_cuda( + std::complex* corr_out, + const std::complex* sig_in, + float rem_carrier_phase_in_rad, + float phase_step_rad, + float code_phase_step_chips, + float rem_code_phase_chips, + int signal_length_samples, + int n_correlators); private: // Allocate the device input vectors GPU_Complex *d_sig_in; @@ -133,6 +154,9 @@ private: GPU_Complex *d_local_codes_in; GPU_Complex *d_corr_out; int *d_shifts_samples; + float *d_shifts_chips; + float d_code_length_chips; + int threadsPerBlock; int blocksPerGrid; diff --git a/src/algorithms/tracking/libs/tracking_2nd_PLL_filter.cc b/src/algorithms/tracking/libs/tracking_2nd_PLL_filter.cc index 043d370da..55a706a33 100644 --- a/src/algorithms/tracking/libs/tracking_2nd_PLL_filter.cc +++ b/src/algorithms/tracking/libs/tracking_2nd_PLL_filter.cc @@ -94,7 +94,7 @@ Tracking_2nd_PLL_filter::Tracking_2nd_PLL_filter () { //--- PLL variables -------------------------------------------------------- d_pdi_carr = 0.001;// Summation interval for carrier - d_plldampingratio = 0.65; + d_plldampingratio = 0.7; } diff --git a/src/core/receiver/CMakeLists.txt b/src/core/receiver/CMakeLists.txt index cf4d18335..a3c91b432 100644 --- a/src/core/receiver/CMakeLists.txt +++ b/src/core/receiver/CMakeLists.txt @@ -18,6 +18,7 @@ if(ENABLE_CUDA) FIND_PACKAGE(CUDA REQUIRED) + add_definitions(-DCUDA_GPU_ACCEL=1) endif(ENABLE_CUDA) set(GNSS_RECEIVER_SOURCES diff --git a/src/core/receiver/gnss_block_factory.cc b/src/core/receiver/gnss_block_factory.cc index 35e4c8360..e521160c2 100644 --- a/src/core/receiver/gnss_block_factory.cc +++ b/src/core/receiver/gnss_block_factory.cc @@ -1610,12 +1610,14 @@ std::unique_ptr GNSSBlockFactory::GetTrkBlock( out_streams, queue)); block = std::move(block_); } +#if CUDA_GPU_ACCEL else if (implementation.compare("GPS_L1_CA_DLL_PLL_Tracking_GPU") == 0) { std::unique_ptr block_(new GpsL1CaDllPllTrackingGPU(configuration.get(), role, in_streams, out_streams, queue)); block = std::move(block_); } +#endif else { // Log fatal. This causes execution to stop. diff --git a/src/main/CMakeLists.txt b/src/main/CMakeLists.txt index 02758105a..4fbafb1fb 100644 --- a/src/main/CMakeLists.txt +++ b/src/main/CMakeLists.txt @@ -33,6 +33,12 @@ if(ENABLE_UHD) set(GNSS_SDR_OPTIONAL_HEADERS ${GNSS_SDR_OPTIONAL_HEADERS} ${UHD_INCLUDE_DIRS}) endif(ENABLE_UHD) +if(ENABLE_CUDA) + FIND_PACKAGE(CUDA REQUIRED) + add_definitions(-DCUDA_GPU_ACCEL=1) +endif(ENABLE_CUDA) + + include_directories( ${CMAKE_SOURCE_DIR}/src/core/system_parameters ${CMAKE_SOURCE_DIR}/src/core/interfaces @@ -48,6 +54,7 @@ include_directories( ${GNURADIO_RUNTIME_INCLUDE_DIRS} ${GNSS_SDR_OPTIONAL_HEADERS} ${VOLK_GNSSSDR_INCLUDE_DIRS} + ${CUDA_INCLUDE_DIRS} ) add_definitions( -DGNSS_SDR_VERSION="${VERSION}" ) @@ -79,6 +86,7 @@ target_link_libraries(gnss-sdr ${MAC_LIBRARIES} ${GNSS_SDR_OPTIONAL_LIBS} gnss_sp_libs gnss_rx + ${CUDA_LIBRARIES} ) diff --git a/src/main/main.cc b/src/main/main.cc index 4659034fb..ce4094ed9 100644 --- a/src/main/main.cc +++ b/src/main/main.cc @@ -68,6 +68,11 @@ #include "sbas_ephemeris.h" #include "sbas_time.h" +#if CUDA_GPU_ACCEL + // For the CUDA runtime routines (prefixed with "cuda_") + #include +#endif + using google::LogMessage; @@ -143,6 +148,17 @@ int main(int argc, char** argv) google::ParseCommandLineFlags(&argc, &argv, true); std::cout << "Initializing GNSS-SDR v" << gnss_sdr_version << " ... Please wait." << std::endl; + #if CUDA_GPU_ACCEL + // Reset the device + // cudaDeviceReset causes the driver to clean up all state. While + // not mandatory in normal operation, it is good practice. It is also + // needed to ensure correct operation when the application is being + // profiled. Calling cudaDeviceReset causes all profile data to be + // flushed before the application exits + cudaDeviceReset(); + std::cout << "Reset CUDA device done " << std::endl; + #endif + if(GOOGLE_STRIP_LOG == 0) { google::InitGoogleLogging(argv[0]);