mirror of https://github.com/gnss-sdr/gnss-sdr
Adding cuda ultra-fast correlator library. Not used yet, but optionally
compiled. All CMAKEs ready!
This commit is contained in:
parent
0927394351
commit
4fc61af172
|
@ -957,6 +957,20 @@ else(ENABLE_OSMOSDR)
|
||||||
message(STATUS "Enable it with 'cmake -DENABLE_OSMOSDR=ON ../' to add support for OsmoSDR and other front-ends (HackRF, bladeRF, Realtek's RTL2832U-based USB dongles, etc.)" )
|
message(STATUS "Enable it with 'cmake -DENABLE_OSMOSDR=ON ../' to add support for OsmoSDR and other front-ends (HackRF, bladeRF, Realtek's RTL2832U-based USB dongles, etc.)" )
|
||||||
endif(ENABLE_OSMOSDR)
|
endif(ENABLE_OSMOSDR)
|
||||||
|
|
||||||
|
if($ENV{CUDA_GPU_ACCEL})
|
||||||
|
message(STATUS "CUDA_GPU_ACCEL environment variable found." )
|
||||||
|
set(ENABLE_CUDA ON)
|
||||||
|
endif($ENV{CUDA_GPU_ACCEL})
|
||||||
|
|
||||||
|
if(ENABLE_CUDA)
|
||||||
|
message(STATUS "NVIDIA CUDA GPU Acceleration will be enabled." )
|
||||||
|
message(STATUS "You can disable it with 'cmake -DENABLE_CUDA=OFF ../'" )
|
||||||
|
else(ENABLE_CUDA)
|
||||||
|
message(STATUS "NVIDIA CUDA GPU Acceleration will is not enabled." )
|
||||||
|
message(STATUS "Enable it with 'cmake -DENABLE_CUDA=ON ../' to add support for the Teleorbit Flexiband front-end." )
|
||||||
|
endif(ENABLE_CUDA)
|
||||||
|
|
||||||
|
|
||||||
if($ENV{FLEXIBAND_DRIVER})
|
if($ENV{FLEXIBAND_DRIVER})
|
||||||
message(STATUS "FLEXIBAND_DRIVER environment variable found." )
|
message(STATUS "FLEXIBAND_DRIVER environment variable found." )
|
||||||
set(ENABLE_FLEXIBAND ON)
|
set(ENABLE_FLEXIBAND ON)
|
||||||
|
|
|
@ -16,6 +16,29 @@
|
||||||
# along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
|
# along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
|
||||||
#
|
#
|
||||||
|
|
||||||
|
|
||||||
|
if(ENABLE_CUDA)
|
||||||
|
FIND_PACKAGE(CUDA REQUIRED)
|
||||||
|
|
||||||
|
# Append current NVCC flags by something, eg comput capability
|
||||||
|
# set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} --gpu-architecture sm_30 --default-stream-per-thread)
|
||||||
|
|
||||||
|
list(APPEND CUDA_NVCC_FLAGS "-gencode arch=compute_30,code=sm_30; -std=c++11;-O3; -use_fast_math")
|
||||||
|
SET(CUDA_PROPAGATE_HOST_FLAGS OFF)
|
||||||
|
|
||||||
|
CUDA_INCLUDE_DIRECTORIES(
|
||||||
|
${CMAKE_CURRENT_SOURCE_DIR}
|
||||||
|
${CMAKE_CURRENT_SOURCE_DIR}/cudahelpers
|
||||||
|
)
|
||||||
|
|
||||||
|
SET(LIB_TYPE STATIC) #set the lib type
|
||||||
|
CUDA_ADD_LIBRARY(CUDA_CORRELATOR_LIB ${LIB_TYPE} cuda_multicorrelator.h cuda_multicorrelator.cu)
|
||||||
|
|
||||||
|
set(OPT_LIBRARIES ${OPT_LIBRARIES} ${CUDA_CORRELATOR_LIB})
|
||||||
|
|
||||||
|
endif(ENABLE_CUDA)
|
||||||
|
|
||||||
|
|
||||||
set(TRACKING_LIB_SOURCES
|
set(TRACKING_LIB_SOURCES
|
||||||
correlator.cc
|
correlator.cc
|
||||||
lock_detectors.cc
|
lock_detectors.cc
|
||||||
|
@ -24,7 +47,7 @@ set(TRACKING_LIB_SOURCES
|
||||||
tracking_2nd_DLL_filter.cc
|
tracking_2nd_DLL_filter.cc
|
||||||
tracking_2nd_PLL_filter.cc
|
tracking_2nd_PLL_filter.cc
|
||||||
tracking_discriminators.cc
|
tracking_discriminators.cc
|
||||||
tracking_FLL_PLL_filter.cc
|
tracking_FLL_PLL_filter.cc
|
||||||
)
|
)
|
||||||
|
|
||||||
include_directories(
|
include_directories(
|
||||||
|
@ -43,7 +66,8 @@ if (SSE3_AVAILABLE)
|
||||||
add_definitions( -DHAVE_SSE3=1 )
|
add_definitions( -DHAVE_SSE3=1 )
|
||||||
endif(SSE3_AVAILABLE)
|
endif(SSE3_AVAILABLE)
|
||||||
|
|
||||||
|
|
||||||
file(GLOB TRACKING_LIB_HEADERS "*.h")
|
file(GLOB TRACKING_LIB_HEADERS "*.h")
|
||||||
add_library(tracking_lib ${TRACKING_LIB_SOURCES} ${TRACKING_LIB_HEADERS})
|
add_library(tracking_lib ${TRACKING_LIB_SOURCES} ${TRACKING_LIB_HEADERS})
|
||||||
source_group(Headers FILES ${TRACKING_LIB_HEADERS})
|
source_group(Headers FILES ${TRACKING_LIB_HEADERS})
|
||||||
target_link_libraries(tracking_lib ${VOLK_LIBRARIES} ${GNURADIO_RUNTIME_LIBRARIES})
|
target_link_libraries(tracking_lib ${VOLK_LIBRARIES} ${GNURADIO_RUNTIME_LIBRARIES} ${OPT_LIBRARIES})
|
|
@ -0,0 +1,418 @@
|
||||||
|
/*!
|
||||||
|
* \file cuda_multicorrelator.cu
|
||||||
|
* \brief High optimized CUDA GPU vector multiTAP correlator class
|
||||||
|
* \authors <ul>
|
||||||
|
* <li> Javier Arribas, 2015. jarribas(at)cttc.es
|
||||||
|
* </ul>
|
||||||
|
*
|
||||||
|
* Class that implements a high optimized vector multiTAP correlator class for NVIDIA CUDA GPUs
|
||||||
|
*
|
||||||
|
* -------------------------------------------------------------------------
|
||||||
|
*
|
||||||
|
* Copyright (C) 2010-2015 (see AUTHORS file for a list of contributors)
|
||||||
|
*
|
||||||
|
* GNSS-SDR is a software defined Global Navigation
|
||||||
|
* Satellite Systems receiver
|
||||||
|
*
|
||||||
|
* This file is part of GNSS-SDR.
|
||||||
|
*
|
||||||
|
* GNSS-SDR is free software: you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation, either version 3 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* GNSS-SDR is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*
|
||||||
|
* -------------------------------------------------------------------------
|
||||||
|
*/
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
|
// On G80-class hardware 24-bit multiplication takes 4 clocks per warp
|
||||||
|
// (the same as for floating point multiplication and addition),
|
||||||
|
// whereas full 32-bit multiplication takes 16 clocks per warp.
|
||||||
|
// So if integer multiplication operands are guaranteed to fit into 24 bits
|
||||||
|
// (always lie withtin [-8M, 8M - 1] range in signed case),
|
||||||
|
// explicit 24-bit multiplication is preferred for performance.
|
||||||
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
|
#define IMUL(a, b) __mul24(a, b)
|
||||||
|
|
||||||
|
#include "cuda_multicorrelator.h"
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
|
||||||
|
// For the CUDA runtime routines (prefixed with "cuda_")
|
||||||
|
#include <cuda_runtime.h>
|
||||||
|
|
||||||
|
// helper functions and utilities to work with CUDA
|
||||||
|
#include <helper_cuda.h>
|
||||||
|
#include <helper_functions.h>
|
||||||
|
|
||||||
|
#define ACCUM_N 1024
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Calculate scalar products of VectorN vectors of ElementN elements on GPU
|
||||||
|
// Parameters restrictions:
|
||||||
|
// 1) ElementN is strongly preferred to be a multiple of warp size to
|
||||||
|
// meet alignment constraints of memory coalescing.
|
||||||
|
// 2) ACCUM_N must be a power of two.
|
||||||
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
|
||||||
|
__global__ void scalarProdGPUCPXxN_shifts(
|
||||||
|
GPU_Complex *d_corr_out,
|
||||||
|
GPU_Complex *d_sig_in,
|
||||||
|
GPU_Complex *d_local_codes_in,
|
||||||
|
int *d_shifts_samples,
|
||||||
|
int vectorN,
|
||||||
|
int elementN
|
||||||
|
)
|
||||||
|
{
|
||||||
|
//Accumulators cache
|
||||||
|
__shared__ GPU_Complex accumResult[ACCUM_N];
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Cycle through every pair of vectors,
|
||||||
|
// taking into account that vector counts can be different
|
||||||
|
// from total number of thread blocks
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
for (int vec = blockIdx.x; vec < vectorN; vec += gridDim.x)
|
||||||
|
{
|
||||||
|
int vectorBase = IMUL(elementN, vec);
|
||||||
|
int vectorEnd = vectorBase + elementN;
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////
|
||||||
|
// Each accumulator cycles through vectors with
|
||||||
|
// stride equal to number of total number of accumulators ACCUM_N
|
||||||
|
// At this stage ACCUM_N is only preferred be a multiple of warp size
|
||||||
|
// to meet memory coalescing alignment constraints.
|
||||||
|
////////////////////////////////////////////////////////////////////////
|
||||||
|
for (int iAccum = threadIdx.x; iAccum < ACCUM_N; iAccum += blockDim.x)
|
||||||
|
{
|
||||||
|
GPU_Complex sum = GPU_Complex(0,0);
|
||||||
|
|
||||||
|
for (int pos = vectorBase + iAccum; pos < vectorEnd; pos += ACCUM_N)
|
||||||
|
{
|
||||||
|
//sum = sum + d_sig_in[pos-vectorBase] * d_nco_in[pos-vectorBase] * d_local_codes_in[pos];
|
||||||
|
//sum = sum + d_sig_in[pos-vectorBase] * d_local_codes_in[pos];
|
||||||
|
sum.multiply_acc(d_sig_in[pos-vectorBase],d_local_codes_in[pos-vectorBase+d_shifts_samples[vec]]);
|
||||||
|
}
|
||||||
|
accumResult[iAccum] = sum;
|
||||||
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////
|
||||||
|
// Perform tree-like reduction of accumulators' results.
|
||||||
|
// ACCUM_N has to be power of two at this stage
|
||||||
|
////////////////////////////////////////////////////////////////////////
|
||||||
|
for (int stride = ACCUM_N / 2; stride > 0; stride >>= 1)
|
||||||
|
{
|
||||||
|
__syncthreads();
|
||||||
|
|
||||||
|
for (int iAccum = threadIdx.x; iAccum < stride; iAccum += blockDim.x)
|
||||||
|
{
|
||||||
|
accumResult[iAccum] += accumResult[stride + iAccum];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (threadIdx.x == 0)
|
||||||
|
{
|
||||||
|
d_corr_out[vec] = accumResult[0];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
__global__ void scalarProdGPUCPXxN(
|
||||||
|
GPU_Complex *d_corr_out,
|
||||||
|
GPU_Complex *d_sig_in,
|
||||||
|
GPU_Complex *d_local_codes_in,
|
||||||
|
int vectorN,
|
||||||
|
int elementN
|
||||||
|
)
|
||||||
|
{
|
||||||
|
//Accumulators cache
|
||||||
|
__shared__ GPU_Complex accumResult[ACCUM_N];
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Cycle through every pair of vectors,
|
||||||
|
// taking into account that vector counts can be different
|
||||||
|
// from total number of thread blocks
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
for (int vec = blockIdx.x; vec < vectorN; vec += gridDim.x)
|
||||||
|
{
|
||||||
|
int vectorBase = IMUL(elementN, vec);
|
||||||
|
int vectorEnd = vectorBase + elementN;
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////
|
||||||
|
// Each accumulator cycles through vectors with
|
||||||
|
// stride equal to number of total number of accumulators ACCUM_N
|
||||||
|
// At this stage ACCUM_N is only preferred be a multiple of warp size
|
||||||
|
// to meet memory coalescing alignment constraints.
|
||||||
|
////////////////////////////////////////////////////////////////////////
|
||||||
|
for (int iAccum = threadIdx.x; iAccum < ACCUM_N; iAccum += blockDim.x)
|
||||||
|
{
|
||||||
|
GPU_Complex sum = GPU_Complex(0,0);
|
||||||
|
|
||||||
|
for (int pos = vectorBase + iAccum; pos < vectorEnd; pos += ACCUM_N)
|
||||||
|
{
|
||||||
|
//sum = sum + d_sig_in[pos-vectorBase] * d_nco_in[pos-vectorBase] * d_local_codes_in[pos];
|
||||||
|
//sum = sum + d_sig_in[pos-vectorBase] * d_local_codes_in[pos];
|
||||||
|
sum.multiply_acc(d_sig_in[pos-vectorBase],d_local_codes_in[pos]);
|
||||||
|
}
|
||||||
|
accumResult[iAccum] = sum;
|
||||||
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////
|
||||||
|
// Perform tree-like reduction of accumulators' results.
|
||||||
|
// ACCUM_N has to be power of two at this stage
|
||||||
|
////////////////////////////////////////////////////////////////////////
|
||||||
|
for (int stride = ACCUM_N / 2; stride > 0; stride >>= 1)
|
||||||
|
{
|
||||||
|
__syncthreads();
|
||||||
|
|
||||||
|
for (int iAccum = threadIdx.x; iAccum < stride; iAccum += blockDim.x)
|
||||||
|
{
|
||||||
|
accumResult[iAccum] += accumResult[stride + iAccum];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (threadIdx.x == 0)
|
||||||
|
{
|
||||||
|
d_corr_out[vec] = accumResult[0];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
//*********** CUDA processing **************
|
||||||
|
// Treads: a minimal parallel execution code on GPU
|
||||||
|
// Blocks: a set of N threads
|
||||||
|
/**
|
||||||
|
* CUDA Kernel Device code
|
||||||
|
*
|
||||||
|
* Computes the vectorial product of A and B into C. The 3 vectors have the same
|
||||||
|
* number of elements numElements.
|
||||||
|
*/
|
||||||
|
__global__ void CUDA_32fc_x2_multiply_32fc( GPU_Complex *A, GPU_Complex *B, GPU_Complex *C, int numElements)
|
||||||
|
{
|
||||||
|
int i = blockDim.x * blockIdx.x + threadIdx.x;
|
||||||
|
|
||||||
|
if (i < numElements)
|
||||||
|
{
|
||||||
|
C[i] = A[i] * B[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* CUDA Kernel Device code
|
||||||
|
*
|
||||||
|
* Computes the carrier Doppler wipe-off by integrating the NCO in the CUDA kernel
|
||||||
|
*/
|
||||||
|
__global__ void
|
||||||
|
CUDA_32fc_Doppler_wipeoff( GPU_Complex *sig_out, GPU_Complex *sig_in, float rem_carrier_phase_in_rad, float phase_step_rad, int numElements)
|
||||||
|
{
|
||||||
|
//*** NCO CPU code (GNURadio FXP NCO)
|
||||||
|
//float sin_f, cos_f;
|
||||||
|
//float phase_step_rad = static_cast<float>(2 * GALILEO_PI) * d_carrier_doppler_hz / static_cast<float>(d_fs_in);
|
||||||
|
//int phase_step_rad_i = gr::fxpt::float_to_fixed(phase_step_rad);
|
||||||
|
//int phase_rad_i = gr::fxpt::float_to_fixed(d_rem_carr_phase_rad);
|
||||||
|
//
|
||||||
|
//for(int i = 0; i < d_current_prn_length_samples; i++)
|
||||||
|
// {
|
||||||
|
// gr::fxpt::sincos(phase_rad_i, &sin_f, &cos_f);
|
||||||
|
// d_carr_sign[i] = std::complex<float>(cos_f, -sin_f);
|
||||||
|
// phase_rad_i += phase_step_rad_i;
|
||||||
|
// }
|
||||||
|
|
||||||
|
// CUDA version of floating point NCO and vector dot product integrated
|
||||||
|
|
||||||
|
int i = blockDim.x * blockIdx.x + threadIdx.x;
|
||||||
|
float sin;
|
||||||
|
float cos;
|
||||||
|
if (i < numElements)
|
||||||
|
{
|
||||||
|
__sincosf(rem_carrier_phase_in_rad + i*phase_step_rad, &sin, &cos);
|
||||||
|
sig_out[i] = sig_in[i] * GPU_Complex(cos,-sin);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* CUDA Kernel Device code
|
||||||
|
*
|
||||||
|
* Computes the vectorial product of A and B into C. The 3 vectors have the same
|
||||||
|
* number of elements numElements.
|
||||||
|
*/
|
||||||
|
__global__ void
|
||||||
|
CUDA_32fc_x2_add_32fc( GPU_Complex *A, GPU_Complex *B, GPU_Complex *C, int numElements)
|
||||||
|
{
|
||||||
|
int i = blockDim.x * blockIdx.x + threadIdx.x;
|
||||||
|
|
||||||
|
if (i < numElements)
|
||||||
|
{
|
||||||
|
C[i] = A[i] * B[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
bool cuda_multicorrelator::init_cuda(const int argc, const char **argv, int signal_length_samples, int *shifts_samples, int n_correlators)
|
||||||
|
{
|
||||||
|
// use command-line specified CUDA device, otherwise use device with highest Gflops/s
|
||||||
|
findCudaDevice(argc, (const char **)argv);
|
||||||
|
|
||||||
|
cudaDeviceProp prop;
|
||||||
|
int whichDevice;
|
||||||
|
cudaGetDevice( &whichDevice );
|
||||||
|
cudaGetDeviceProperties( &prop, whichDevice );
|
||||||
|
//debug code
|
||||||
|
if (prop.canMapHostMemory != 1) {
|
||||||
|
printf( "Device can not map memory.\n" );
|
||||||
|
}
|
||||||
|
|
||||||
|
printf("L2 Cache size= %u \n",prop.l2CacheSize);
|
||||||
|
printf("maxThreadsPerBlock= %u \n",prop.maxThreadsPerBlock);
|
||||||
|
printf("maxGridSize= %i \n",prop.maxGridSize[0]);
|
||||||
|
printf("sharedMemPerBlock= %lu \n",prop.sharedMemPerBlock);
|
||||||
|
printf("deviceOverlap= %i \n",prop.deviceOverlap);
|
||||||
|
//end debug code
|
||||||
|
|
||||||
|
//checkCudaErrors(cudaFuncSetCacheConfig(CUDA_32fc_x2_multiply_x2_dot_prod_32fc_, cudaFuncCachePreferShared));
|
||||||
|
|
||||||
|
|
||||||
|
// ALLOCATE GPU MEMORY FOR INPUT/OUTPUT and INTERNAL vectors
|
||||||
|
|
||||||
|
size_t size = signal_length_samples * sizeof(GPU_Complex);
|
||||||
|
|
||||||
|
checkCudaErrors(cudaMalloc((void **)&d_sig_in, size));
|
||||||
|
//checkCudaErrors(cudaMalloc((void **)&d_nco_in, size));
|
||||||
|
checkCudaErrors(cudaMalloc((void **)&d_sig_doppler_wiped, size));
|
||||||
|
|
||||||
|
// old version: all local codes are independent vectors
|
||||||
|
//checkCudaErrors(cudaMalloc((void **)&d_local_codes_in, size*n_correlators));
|
||||||
|
|
||||||
|
// new version: only one vector with extra samples to shift the local code for the correlator set
|
||||||
|
// Required: The last correlator tap in d_shifts_samples has the largest sample shift
|
||||||
|
checkCudaErrors(cudaMalloc((void **)&d_local_codes_in, size+sizeof(GPU_Complex)*shifts_samples[n_correlators-1]));
|
||||||
|
checkCudaErrors(cudaMalloc((void **)&d_shifts_samples, size+sizeof(int)*n_correlators));
|
||||||
|
|
||||||
|
//scalars
|
||||||
|
checkCudaErrors(cudaMalloc((void **)&d_corr_out, sizeof(std::complex<float>)*n_correlators));
|
||||||
|
|
||||||
|
// Launch the Vector Add CUDA Kernel
|
||||||
|
threadsPerBlock = 256;
|
||||||
|
blocksPerGrid =(int)(signal_length_samples+threadsPerBlock-1)/threadsPerBlock;
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
bool cuda_multicorrelator::Carrier_wipeoff_multicorrelator_cuda(
|
||||||
|
std::complex<float>* corr_out,
|
||||||
|
const std::complex<float>* sig_in,
|
||||||
|
const std::complex<float>* local_codes_in,
|
||||||
|
float rem_carrier_phase_in_rad,
|
||||||
|
float phase_step_rad,
|
||||||
|
const int *shifts_samples,
|
||||||
|
int signal_length_samples,
|
||||||
|
int n_correlators)
|
||||||
|
{
|
||||||
|
|
||||||
|
cudaStream_t stream1;
|
||||||
|
cudaStream_t stream2;
|
||||||
|
cudaStreamCreate ( &stream1) ;
|
||||||
|
cudaStreamCreate ( &stream2) ;
|
||||||
|
|
||||||
|
size_t memSize = signal_length_samples * sizeof(std::complex<float>);
|
||||||
|
|
||||||
|
// input signal CPU -> GPU copy memory
|
||||||
|
checkCudaErrors(cudaMemcpyAsync(d_sig_in, sig_in, memSize,
|
||||||
|
cudaMemcpyHostToDevice, stream1));
|
||||||
|
|
||||||
|
//***** NOTICE: NCO is computed on-the-fly, not need to copy NCO into GPU! ****
|
||||||
|
//checkCudaErrors(cudaMemcpyAsync(d_nco_in, nco_in, memSize,
|
||||||
|
// cudaMemcpyHostToDevice, stream1));
|
||||||
|
|
||||||
|
|
||||||
|
// old version: all local codes are independent vectors
|
||||||
|
//checkCudaErrors(cudaMemcpyAsync(d_local_codes_in, local_codes_in, memSize*n_correlators,
|
||||||
|
// cudaMemcpyHostToDevice, stream2));
|
||||||
|
|
||||||
|
// new version: only one vector with extra samples to shift the local code for the correlator set
|
||||||
|
// Required: The last correlator tap in d_shifts_samples has the largest sample shift
|
||||||
|
|
||||||
|
// local code CPU -> GPU copy memory
|
||||||
|
checkCudaErrors(cudaMemcpyAsync(d_local_codes_in, local_codes_in, memSize+sizeof(std::complex<float>)*shifts_samples[n_correlators-1],
|
||||||
|
cudaMemcpyHostToDevice, stream2));
|
||||||
|
// Correlator shifts vector CPU -> GPU copy memory
|
||||||
|
checkCudaErrors(cudaMemcpyAsync(d_shifts_samples, shifts_samples, sizeof(int)*n_correlators,
|
||||||
|
cudaMemcpyHostToDevice, stream2));
|
||||||
|
|
||||||
|
|
||||||
|
//Launch carrier wipe-off kernel here, while local codes are being copied to GPU!
|
||||||
|
checkCudaErrors(cudaStreamSynchronize(stream1));
|
||||||
|
CUDA_32fc_Doppler_wipeoff<<<blocksPerGrid, threadsPerBlock,0, stream1>>>(d_sig_doppler_wiped, d_sig_in,rem_carrier_phase_in_rad,phase_step_rad, signal_length_samples);
|
||||||
|
|
||||||
|
|
||||||
|
//printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
|
||||||
|
|
||||||
|
//wait for Doppler wipeoff end...
|
||||||
|
checkCudaErrors(cudaDeviceSynchronize());
|
||||||
|
|
||||||
|
//old
|
||||||
|
// scalarProdGPUCPXxN<<<blocksPerGrid, threadsPerBlock,0 ,stream2>>>(
|
||||||
|
// d_corr_out,
|
||||||
|
// d_sig_doppler_wiped,
|
||||||
|
// d_local_codes_in,
|
||||||
|
// 3,
|
||||||
|
// signal_length_samples
|
||||||
|
// );
|
||||||
|
|
||||||
|
//new
|
||||||
|
//launch the multitap correlator
|
||||||
|
scalarProdGPUCPXxN_shifts<<<blocksPerGrid, threadsPerBlock,0 ,stream2>>>(
|
||||||
|
d_corr_out,
|
||||||
|
d_sig_doppler_wiped,
|
||||||
|
d_local_codes_in,
|
||||||
|
d_shifts_samples,
|
||||||
|
n_correlators,
|
||||||
|
signal_length_samples
|
||||||
|
);
|
||||||
|
checkCudaErrors(cudaGetLastError());
|
||||||
|
//wait for correlators end...
|
||||||
|
checkCudaErrors(cudaDeviceSynchronize());
|
||||||
|
// Copy the device result vector in device memory to the host result vector
|
||||||
|
// in host memory.
|
||||||
|
|
||||||
|
//scalar products (correlators outputs)
|
||||||
|
checkCudaErrors(cudaMemcpyAsync(corr_out, d_corr_out, sizeof(std::complex<float>)*n_correlators,
|
||||||
|
cudaMemcpyDeviceToHost, 0));
|
||||||
|
|
||||||
|
cudaStreamDestroy(stream1) ;
|
||||||
|
cudaStreamDestroy(stream2) ;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool cuda_multicorrelator::free_cuda()
|
||||||
|
{
|
||||||
|
// Free device global memory
|
||||||
|
cudaFree(d_sig_in);
|
||||||
|
//cudaFree(d_nco_in);
|
||||||
|
cudaFree(d_local_codes_in);
|
||||||
|
cudaFree(d_corr_out);
|
||||||
|
|
||||||
|
// Reset the device and exit
|
||||||
|
// cudaDeviceReset causes the driver to clean up all state. While
|
||||||
|
// not mandatory in normal operation, it is good practice. It is also
|
||||||
|
// needed to ensure correct operation when the application is being
|
||||||
|
// profiled. Calling cudaDeviceReset causes all profile data to be
|
||||||
|
// flushed before the application exits
|
||||||
|
checkCudaErrors(cudaDeviceReset());
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
|
@ -0,0 +1,138 @@
|
||||||
|
/*!
|
||||||
|
* \file cuda_multicorrelator.h
|
||||||
|
* \brief High optimized CUDA GPU vector multiTAP correlator class
|
||||||
|
* \authors <ul>
|
||||||
|
* <li> Javier Arribas, 2015. jarribas(at)cttc.es
|
||||||
|
* </ul>
|
||||||
|
*
|
||||||
|
* Class that implements a high optimized vector multiTAP correlator class for NVIDIA CUDA GPUs
|
||||||
|
*
|
||||||
|
* -------------------------------------------------------------------------
|
||||||
|
*
|
||||||
|
* Copyright (C) 2010-2015 (see AUTHORS file for a list of contributors)
|
||||||
|
*
|
||||||
|
* GNSS-SDR is a software defined Global Navigation
|
||||||
|
* Satellite Systems receiver
|
||||||
|
*
|
||||||
|
* This file is part of GNSS-SDR.
|
||||||
|
*
|
||||||
|
* GNSS-SDR is free software: you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation, either version 3 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* GNSS-SDR is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*
|
||||||
|
* -------------------------------------------------------------------------
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef CUDA_MULTICORRELATOR_H_
|
||||||
|
#define CUDA_MULTICORRELATOR_H_
|
||||||
|
|
||||||
|
|
||||||
|
#ifdef __CUDACC__
|
||||||
|
#define CUDA_CALLABLE_MEMBER_GLOBAL __global__
|
||||||
|
#define CUDA_CALLABLE_MEMBER_DEVICE __device__
|
||||||
|
#else
|
||||||
|
#define CUDA_CALLABLE_MEMBER_GLOBAL
|
||||||
|
#define CUDA_CALLABLE_MEMBER_DEVICE
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include <complex>
|
||||||
|
|
||||||
|
// GPU new internal data types for complex numbers
|
||||||
|
|
||||||
|
struct GPU_Complex {
|
||||||
|
float r;
|
||||||
|
float i;
|
||||||
|
CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex() {};
|
||||||
|
CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex( float a, float b ) : r(a), i(b) {}
|
||||||
|
CUDA_CALLABLE_MEMBER_DEVICE float magnitude2( void ) {
|
||||||
|
return r * r + i * i;
|
||||||
|
}
|
||||||
|
CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex operator*(const GPU_Complex& a) {
|
||||||
|
#ifdef __CUDACC__
|
||||||
|
return GPU_Complex(__fmul_rn(r,a.r) - __fmul_rn(i,a.i), __fmul_rn(i,a.r) + __fmul_rn(r,a.i));
|
||||||
|
#else
|
||||||
|
return GPU_Complex(r*a.r - i*a.i, i*a.r + r*a.i);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex operator+(const GPU_Complex& a) {
|
||||||
|
return GPU_Complex(r+a.r, i+a.i);
|
||||||
|
}
|
||||||
|
CUDA_CALLABLE_MEMBER_DEVICE void operator+=(const GPU_Complex& a) {
|
||||||
|
r+=a.r;
|
||||||
|
i+=a.i;
|
||||||
|
}
|
||||||
|
CUDA_CALLABLE_MEMBER_DEVICE void multiply_acc(const GPU_Complex& a, const GPU_Complex& b)
|
||||||
|
{
|
||||||
|
//c=a*b+c
|
||||||
|
//real part
|
||||||
|
//c.r=(a.r*b.r - a.i*b.i)+c.r
|
||||||
|
#ifdef __CUDACC__
|
||||||
|
r=__fmaf_rn(a.r,b.r,r);
|
||||||
|
r=__fmaf_rn(-a.i,b.i,r);
|
||||||
|
//imag part
|
||||||
|
i=__fmaf_rn(a.i,b.r,i);
|
||||||
|
i=__fmaf_rn(a.r,b.i,i);
|
||||||
|
#else
|
||||||
|
r=(a.r*b.r - a.i*b.i)+r;
|
||||||
|
i=(a.i*b.r - a.r*b.i)+i;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
struct GPU_Complex_Short {
|
||||||
|
float r;
|
||||||
|
float i;
|
||||||
|
CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex_Short( short int a, short int b ) : r(a), i(b) {}
|
||||||
|
CUDA_CALLABLE_MEMBER_DEVICE float magnitude2( void ) {
|
||||||
|
return r * r + i * i;
|
||||||
|
}
|
||||||
|
CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex_Short operator*(const GPU_Complex_Short& a) {
|
||||||
|
return GPU_Complex_Short(r*a.r - i*a.i, i*a.r + r*a.i);
|
||||||
|
}
|
||||||
|
CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex_Short operator+(const GPU_Complex_Short& a) {
|
||||||
|
return GPU_Complex_Short(r+a.r, i+a.i);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
/*!
|
||||||
|
* \brief Class that implements carrier wipe-off and correlators using NVIDIA CUDA GPU accelerators.
|
||||||
|
*/
|
||||||
|
class cuda_multicorrelator
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
bool init_cuda(const int argc, const char **argv, int signal_length_samples, int *shifts_samples, int n_correlators);
|
||||||
|
|
||||||
|
bool free_cuda();
|
||||||
|
bool Carrier_wipeoff_multicorrelator_cuda(
|
||||||
|
std::complex<float>* corr_out,
|
||||||
|
const std::complex<float>* sig_in,
|
||||||
|
const std::complex<float>* local_codes_in,
|
||||||
|
float rem_carrier_phase_in_rad,
|
||||||
|
float phase_step_rad,
|
||||||
|
const int *shifts_samples,
|
||||||
|
int signal_length_samples,
|
||||||
|
int n_correlators);
|
||||||
|
private:
|
||||||
|
// Allocate the device input vectors
|
||||||
|
GPU_Complex *d_sig_in;
|
||||||
|
GPU_Complex *d_nco_in;
|
||||||
|
GPU_Complex *d_sig_doppler_wiped;
|
||||||
|
GPU_Complex *d_local_codes_in;
|
||||||
|
GPU_Complex *d_corr_out;
|
||||||
|
int *d_shifts_samples;
|
||||||
|
int threadsPerBlock;
|
||||||
|
int blocksPerGrid;
|
||||||
|
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
#endif /* CUDA_MULTICORRELATOR_H_ */
|
|
@ -0,0 +1,151 @@
|
||||||
|
/*
|
||||||
|
* Copyright 1993-2013 NVIDIA Corporation. All rights reserved.
|
||||||
|
*
|
||||||
|
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||||
|
* with this source code for terms and conditions that govern your use of
|
||||||
|
* this software. Any use, reproduction, disclosure, or distribution of
|
||||||
|
* this software and related documentation outside the terms of the EULA
|
||||||
|
* is strictly prohibited.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* CUda UTility Library */
|
||||||
|
#ifndef _EXCEPTION_H_
|
||||||
|
#define _EXCEPTION_H_
|
||||||
|
|
||||||
|
// includes, system
|
||||||
|
#include <exception>
|
||||||
|
#include <stdexcept>
|
||||||
|
#include <iostream>
|
||||||
|
#include <stdlib.h>
|
||||||
|
|
||||||
|
//! Exception wrapper.
|
||||||
|
//! @param Std_Exception Exception out of namespace std for easy typing.
|
||||||
|
template<class Std_Exception>
|
||||||
|
class Exception : public Std_Exception
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
|
||||||
|
//! @brief Static construction interface
|
||||||
|
//! @return Alwayss throws ( Located_Exception<Exception>)
|
||||||
|
//! @param file file in which the Exception occurs
|
||||||
|
//! @param line line in which the Exception occurs
|
||||||
|
//! @param detailed details on the code fragment causing the Exception
|
||||||
|
static void throw_it(const char *file,
|
||||||
|
const int line,
|
||||||
|
const char *detailed = "-");
|
||||||
|
|
||||||
|
//! Static construction interface
|
||||||
|
//! @return Alwayss throws ( Located_Exception<Exception>)
|
||||||
|
//! @param file file in which the Exception occurs
|
||||||
|
//! @param line line in which the Exception occurs
|
||||||
|
//! @param detailed details on the code fragment causing the Exception
|
||||||
|
static void throw_it(const char *file,
|
||||||
|
const int line,
|
||||||
|
const std::string &detailed);
|
||||||
|
|
||||||
|
//! Destructor
|
||||||
|
virtual ~Exception() throw();
|
||||||
|
|
||||||
|
private:
|
||||||
|
|
||||||
|
//! Constructor, default (private)
|
||||||
|
Exception();
|
||||||
|
|
||||||
|
//! Constructor, standard
|
||||||
|
//! @param str string returned by what()
|
||||||
|
Exception(const std::string &str);
|
||||||
|
|
||||||
|
};
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Exception handler function for arbitrary exceptions
|
||||||
|
//! @param ex exception to handle
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
template<class Exception_Typ>
|
||||||
|
inline void
|
||||||
|
handleException(const Exception_Typ &ex)
|
||||||
|
{
|
||||||
|
std::cerr << ex.what() << std::endl;
|
||||||
|
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
|
||||||
|
//! Convenience macros
|
||||||
|
|
||||||
|
//! Exception caused by dynamic program behavior, e.g. file does not exist
|
||||||
|
#define RUNTIME_EXCEPTION( msg) \
|
||||||
|
Exception<std::runtime_error>::throw_it( __FILE__, __LINE__, msg)
|
||||||
|
|
||||||
|
//! Logic exception in program, e.g. an assert failed
|
||||||
|
#define LOGIC_EXCEPTION( msg) \
|
||||||
|
Exception<std::logic_error>::throw_it( __FILE__, __LINE__, msg)
|
||||||
|
|
||||||
|
//! Out of range exception
|
||||||
|
#define RANGE_EXCEPTION( msg) \
|
||||||
|
Exception<std::range_error>::throw_it( __FILE__, __LINE__, msg)
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Implementation
|
||||||
|
|
||||||
|
// includes, system
|
||||||
|
#include <sstream>
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Static construction interface.
|
||||||
|
//! @param Exception causing code fragment (file and line) and detailed infos.
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
/*static*/ template<class Std_Exception>
|
||||||
|
void
|
||||||
|
Exception<Std_Exception>::
|
||||||
|
throw_it(const char *file, const int line, const char *detailed)
|
||||||
|
{
|
||||||
|
std::stringstream s;
|
||||||
|
|
||||||
|
// Quiet heavy-weight but exceptions are not for
|
||||||
|
// performance / release versions
|
||||||
|
s << "Exception in file '" << file << "' in line " << line << "\n"
|
||||||
|
<< "Detailed description: " << detailed << "\n";
|
||||||
|
|
||||||
|
throw Exception(s.str());
|
||||||
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Static construction interface.
|
||||||
|
//! @param Exception causing code fragment (file and line) and detailed infos.
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
/*static*/ template<class Std_Exception>
|
||||||
|
void
|
||||||
|
Exception<Std_Exception>::
|
||||||
|
throw_it(const char *file, const int line, const std::string &msg)
|
||||||
|
{
|
||||||
|
throw_it(file, line, msg.c_str());
|
||||||
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Constructor, default (private).
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
template<class Std_Exception>
|
||||||
|
Exception<Std_Exception>::Exception() :
|
||||||
|
Std_Exception("Unknown Exception.\n")
|
||||||
|
{ }
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Constructor, standard (private).
|
||||||
|
//! String returned by what().
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
template<class Std_Exception>
|
||||||
|
Exception<Std_Exception>::Exception(const std::string &s) :
|
||||||
|
Std_Exception(s)
|
||||||
|
{ }
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Destructor
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
template<class Std_Exception>
|
||||||
|
Exception<Std_Exception>::~Exception() throw() { }
|
||||||
|
|
||||||
|
// functions, exported
|
||||||
|
|
||||||
|
#endif // #ifndef _EXCEPTION_H_
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,517 @@
|
||||||
|
/**
|
||||||
|
* Copyright 1993-2013 NVIDIA Corporation. All rights reserved.
|
||||||
|
*
|
||||||
|
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||||
|
* with this source code for terms and conditions that govern your use of
|
||||||
|
* this software. Any use, reproduction, disclosure, or distribution of
|
||||||
|
* this software and related documentation outside the terms of the EULA
|
||||||
|
* is strictly prohibited.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
// Helper functions for CUDA Driver API error handling (make sure that CUDA_H is included in your projects)
|
||||||
|
#ifndef HELPER_CUDA_DRVAPI_H
|
||||||
|
#define HELPER_CUDA_DRVAPI_H
|
||||||
|
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <string.h>
|
||||||
|
|
||||||
|
#include <helper_string.h>
|
||||||
|
#include <drvapi_error_string.h>
|
||||||
|
|
||||||
|
#ifndef MAX
|
||||||
|
#define MAX(a,b) (a > b ? a : b)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef HELPER_CUDA_H
|
||||||
|
inline int ftoi(float value)
|
||||||
|
{
|
||||||
|
return (value >= 0 ? (int)(value + 0.5) : (int)(value - 0.5));
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef EXIT_WAIVED
|
||||||
|
#define EXIT_WAIVED 2
|
||||||
|
#endif
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// These are CUDA Helper functions
|
||||||
|
|
||||||
|
// add a level of protection to the CUDA SDK samples, let's force samples to explicitly include CUDA.H
|
||||||
|
#ifdef __cuda_cuda_h__
|
||||||
|
// This will output the proper CUDA error strings in the event that a CUDA host call returns an error
|
||||||
|
#ifndef checkCudaErrors
|
||||||
|
#define checkCudaErrors(err) __checkCudaErrors (err, __FILE__, __LINE__)
|
||||||
|
|
||||||
|
// These are the inline versions for all of the SDK helper functions
|
||||||
|
inline void __checkCudaErrors(CUresult err, const char *file, const int line)
|
||||||
|
{
|
||||||
|
if (CUDA_SUCCESS != err)
|
||||||
|
{
|
||||||
|
fprintf(stderr, "checkCudaErrors() Driver API error = %04d \"%s\" from file <%s>, line %i.\n",
|
||||||
|
err, getCudaDrvErrorString(err), file, line);
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef getLastCudaDrvErrorMsg
|
||||||
|
#undef getLastCudaDrvErrorMsg
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#define getLastCudaDrvErrorMsg(msg) __getLastCudaDrvErrorMsg (msg, __FILE__, __LINE__)
|
||||||
|
|
||||||
|
inline void __getLastCudaDrvErrorMsg(const char *msg, const char *file, const int line)
|
||||||
|
{
|
||||||
|
CUresult err = cuCtxSynchronize();
|
||||||
|
|
||||||
|
if (CUDA_SUCCESS != err)
|
||||||
|
{
|
||||||
|
fprintf(stderr, "getLastCudaDrvErrorMsg -> %s", msg);
|
||||||
|
fprintf(stderr, "getLastCudaDrvErrorMsg -> cuCtxSynchronize API error = %04d \"%s\" in file <%s>, line %i.\n",
|
||||||
|
err, getCudaDrvErrorString(err), file, line);
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// This function wraps the CUDA Driver API into a template function
|
||||||
|
template <class T>
|
||||||
|
inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute, int device)
|
||||||
|
{
|
||||||
|
CUresult error_result = cuDeviceGetAttribute(attribute, device_attribute, device);
|
||||||
|
|
||||||
|
if (error_result != CUDA_SUCCESS)
|
||||||
|
{
|
||||||
|
printf("cuDeviceGetAttribute returned %d\n-> %s\n", (int)error_result, getCudaDrvErrorString(error_result));
|
||||||
|
exit(EXIT_SUCCESS);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Beginning of GPU Architecture definitions
|
||||||
|
inline int _ConvertSMVer2CoresDRV(int major, int minor)
|
||||||
|
{
|
||||||
|
// Defines for GPU Architecture types (using the SM version to determine the # of cores per SM
|
||||||
|
typedef struct
|
||||||
|
{
|
||||||
|
int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM minor version
|
||||||
|
int Cores;
|
||||||
|
} sSMtoCores;
|
||||||
|
|
||||||
|
sSMtoCores nGpuArchCoresPerSM[] =
|
||||||
|
{
|
||||||
|
{ 0x20, 32 }, // Fermi Generation (SM 2.0) GF100 class
|
||||||
|
{ 0x21, 48 }, // Fermi Generation (SM 2.1) GF10x class
|
||||||
|
{ 0x30, 192}, // Kepler Generation (SM 3.0) GK10x class
|
||||||
|
{ 0x32, 192}, // Kepler Generation (SM 3.2) GK10x class
|
||||||
|
{ 0x35, 192}, // Kepler Generation (SM 3.5) GK11x class
|
||||||
|
{ 0x37, 192}, // Kepler Generation (SM 3.7) GK21x class
|
||||||
|
{ 0x50, 128}, // Maxwell Generation (SM 5.0) GM10x class
|
||||||
|
{ 0x52, 128}, // Maxwell Generation (SM 5.2) GM20x class
|
||||||
|
{ -1, -1 }
|
||||||
|
};
|
||||||
|
|
||||||
|
int index = 0;
|
||||||
|
|
||||||
|
while (nGpuArchCoresPerSM[index].SM != -1)
|
||||||
|
{
|
||||||
|
if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor))
|
||||||
|
{
|
||||||
|
return nGpuArchCoresPerSM[index].Cores;
|
||||||
|
}
|
||||||
|
|
||||||
|
index++;
|
||||||
|
}
|
||||||
|
|
||||||
|
// If we don't find the values, we default use the previous one to run properly
|
||||||
|
printf("MapSMtoCores for SM %d.%d is undefined. Default to use %d Cores/SM\n", major, minor, nGpuArchCoresPerSM[index-1].Cores);
|
||||||
|
return nGpuArchCoresPerSM[index-1].Cores;
|
||||||
|
}
|
||||||
|
// end of GPU Architecture definitions
|
||||||
|
|
||||||
|
#ifdef __cuda_cuda_h__
|
||||||
|
// General GPU Device CUDA Initialization
|
||||||
|
inline int gpuDeviceInitDRV(int ARGC, const char **ARGV)
|
||||||
|
{
|
||||||
|
int cuDevice = 0;
|
||||||
|
int deviceCount = 0;
|
||||||
|
CUresult err = cuInit(0);
|
||||||
|
|
||||||
|
if (CUDA_SUCCESS == err)
|
||||||
|
{
|
||||||
|
checkCudaErrors(cuDeviceGetCount(&deviceCount));
|
||||||
|
}
|
||||||
|
|
||||||
|
if (deviceCount == 0)
|
||||||
|
{
|
||||||
|
fprintf(stderr, "cudaDeviceInit error: no devices supporting CUDA\n");
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
|
||||||
|
int dev = 0;
|
||||||
|
dev = getCmdLineArgumentInt(ARGC, (const char **) ARGV, "device=");
|
||||||
|
|
||||||
|
if (dev < 0)
|
||||||
|
{
|
||||||
|
dev = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (dev > deviceCount-1)
|
||||||
|
{
|
||||||
|
fprintf(stderr, "\n");
|
||||||
|
fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n", deviceCount);
|
||||||
|
fprintf(stderr, ">> cudaDeviceInit (-device=%d) is not a valid GPU device. <<\n", dev);
|
||||||
|
fprintf(stderr, "\n");
|
||||||
|
return -dev;
|
||||||
|
}
|
||||||
|
|
||||||
|
checkCudaErrors(cuDeviceGet(&cuDevice, dev));
|
||||||
|
char name[100];
|
||||||
|
cuDeviceGetName(name, 100, cuDevice);
|
||||||
|
|
||||||
|
int computeMode;
|
||||||
|
getCudaAttribute<int>(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
|
||||||
|
|
||||||
|
if (computeMode == CU_COMPUTEMODE_PROHIBITED)
|
||||||
|
{
|
||||||
|
fprintf(stderr, "Error: device is running in <CU_COMPUTEMODE_PROHIBITED>, no threads can use this CUDA Device.\n");
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (checkCmdLineFlag(ARGC, (const char **) ARGV, "quiet") == false)
|
||||||
|
{
|
||||||
|
printf("gpuDeviceInitDRV() Using CUDA Device [%d]: %s\n", dev, name);
|
||||||
|
}
|
||||||
|
|
||||||
|
return dev;
|
||||||
|
}
|
||||||
|
|
||||||
|
// This function returns the best GPU based on performance
|
||||||
|
inline int gpuGetMaxGflopsDeviceIdDRV()
|
||||||
|
{
|
||||||
|
CUdevice current_device = 0;
|
||||||
|
CUdevice max_perf_device = 0;
|
||||||
|
int device_count = 0;
|
||||||
|
int sm_per_multiproc = 0;
|
||||||
|
unsigned long long max_compute_perf = 0;
|
||||||
|
int best_SM_arch = 0;
|
||||||
|
int major = 0;
|
||||||
|
int minor = 0;
|
||||||
|
int multiProcessorCount;
|
||||||
|
int clockRate;
|
||||||
|
int devices_prohibited = 0;
|
||||||
|
|
||||||
|
cuInit(0);
|
||||||
|
checkCudaErrors(cuDeviceGetCount(&device_count));
|
||||||
|
|
||||||
|
if (device_count == 0)
|
||||||
|
{
|
||||||
|
fprintf(stderr, "gpuGetMaxGflopsDeviceIdDRV error: no devices supporting CUDA\n");
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Find the best major SM Architecture GPU device
|
||||||
|
while (current_device < device_count)
|
||||||
|
{
|
||||||
|
checkCudaErrors(cuDeviceComputeCapability(&major, &minor, current_device));
|
||||||
|
|
||||||
|
if (major > 0 && major < 9999)
|
||||||
|
{
|
||||||
|
best_SM_arch = MAX(best_SM_arch, major);
|
||||||
|
}
|
||||||
|
|
||||||
|
current_device++;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Find the best CUDA capable GPU device
|
||||||
|
current_device = 0;
|
||||||
|
|
||||||
|
while (current_device < device_count)
|
||||||
|
{
|
||||||
|
checkCudaErrors(cuDeviceGetAttribute(&multiProcessorCount,
|
||||||
|
CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
|
||||||
|
current_device));
|
||||||
|
checkCudaErrors(cuDeviceGetAttribute(&clockRate,
|
||||||
|
CU_DEVICE_ATTRIBUTE_CLOCK_RATE,
|
||||||
|
current_device));
|
||||||
|
checkCudaErrors(cuDeviceComputeCapability(&major, &minor, current_device));
|
||||||
|
|
||||||
|
int computeMode;
|
||||||
|
getCudaAttribute<int>(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, current_device);
|
||||||
|
|
||||||
|
if (computeMode != CU_COMPUTEMODE_PROHIBITED)
|
||||||
|
{
|
||||||
|
if (major == 9999 && minor == 9999)
|
||||||
|
{
|
||||||
|
sm_per_multiproc = 1;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
sm_per_multiproc = _ConvertSMVer2CoresDRV(major, minor);
|
||||||
|
}
|
||||||
|
|
||||||
|
unsigned long long compute_perf = (unsigned long long) (multiProcessorCount * sm_per_multiproc * clockRate);
|
||||||
|
|
||||||
|
if (compute_perf > max_compute_perf)
|
||||||
|
{
|
||||||
|
// If we find GPU with SM major > 2, search only these
|
||||||
|
if (best_SM_arch > 2)
|
||||||
|
{
|
||||||
|
// If our device==dest_SM_arch, choose this, or else pass
|
||||||
|
if (major == best_SM_arch)
|
||||||
|
{
|
||||||
|
max_compute_perf = compute_perf;
|
||||||
|
max_perf_device = current_device;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
max_compute_perf = compute_perf;
|
||||||
|
max_perf_device = current_device;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
devices_prohibited++;
|
||||||
|
}
|
||||||
|
|
||||||
|
++current_device;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (devices_prohibited == device_count)
|
||||||
|
{
|
||||||
|
fprintf(stderr, "gpuGetMaxGflopsDeviceIdDRV error: all devices have compute mode prohibited.\n");
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
|
||||||
|
return max_perf_device;
|
||||||
|
}
|
||||||
|
|
||||||
|
// This function returns the best Graphics GPU based on performance
|
||||||
|
inline int gpuGetMaxGflopsGLDeviceIdDRV()
|
||||||
|
{
|
||||||
|
CUdevice current_device = 0, max_perf_device = 0;
|
||||||
|
int device_count = 0, sm_per_multiproc = 0;
|
||||||
|
int max_compute_perf = 0, best_SM_arch = 0;
|
||||||
|
int major = 0, minor = 0, multiProcessorCount, clockRate;
|
||||||
|
int bTCC = 0;
|
||||||
|
int devices_prohibited = 0;
|
||||||
|
char deviceName[256];
|
||||||
|
|
||||||
|
cuInit(0);
|
||||||
|
checkCudaErrors(cuDeviceGetCount(&device_count));
|
||||||
|
|
||||||
|
if (device_count == 0)
|
||||||
|
{
|
||||||
|
fprintf(stderr, "gpuGetMaxGflopsGLDeviceIdDRV error: no devices supporting CUDA\n");
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Find the best major SM Architecture GPU device that are graphics devices
|
||||||
|
while (current_device < device_count)
|
||||||
|
{
|
||||||
|
checkCudaErrors(cuDeviceGetName(deviceName, 256, current_device));
|
||||||
|
checkCudaErrors(cuDeviceComputeCapability(&major, &minor, current_device));
|
||||||
|
|
||||||
|
#if CUDA_VERSION >= 3020
|
||||||
|
checkCudaErrors(cuDeviceGetAttribute(&bTCC, CU_DEVICE_ATTRIBUTE_TCC_DRIVER, current_device));
|
||||||
|
#else
|
||||||
|
|
||||||
|
// Assume a Tesla GPU is running in TCC if we are running CUDA 3.1
|
||||||
|
if (deviceName[0] == 'T')
|
||||||
|
{
|
||||||
|
bTCC = 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
int computeMode;
|
||||||
|
getCudaAttribute<int>(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, current_device);
|
||||||
|
|
||||||
|
if (computeMode != CU_COMPUTEMODE_PROHIBITED)
|
||||||
|
{
|
||||||
|
if (!bTCC)
|
||||||
|
{
|
||||||
|
if (major > 0 && major < 9999)
|
||||||
|
{
|
||||||
|
best_SM_arch = MAX(best_SM_arch, major);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
devices_prohibited++;
|
||||||
|
}
|
||||||
|
|
||||||
|
current_device++;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (devices_prohibited == device_count)
|
||||||
|
{
|
||||||
|
fprintf(stderr, "gpuGetMaxGflopsGLDeviceIdDRV error: all devices have compute mode prohibited.\n");
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Find the best CUDA capable GPU device
|
||||||
|
current_device = 0;
|
||||||
|
|
||||||
|
while (current_device < device_count)
|
||||||
|
{
|
||||||
|
checkCudaErrors(cuDeviceGetAttribute(&multiProcessorCount,
|
||||||
|
CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
|
||||||
|
current_device));
|
||||||
|
checkCudaErrors(cuDeviceGetAttribute(&clockRate,
|
||||||
|
CU_DEVICE_ATTRIBUTE_CLOCK_RATE,
|
||||||
|
current_device));
|
||||||
|
checkCudaErrors(cuDeviceComputeCapability(&major, &minor, current_device));
|
||||||
|
|
||||||
|
#if CUDA_VERSION >= 3020
|
||||||
|
checkCudaErrors(cuDeviceGetAttribute(&bTCC, CU_DEVICE_ATTRIBUTE_TCC_DRIVER, current_device));
|
||||||
|
#else
|
||||||
|
|
||||||
|
// Assume a Tesla GPU is running in TCC if we are running CUDA 3.1
|
||||||
|
if (deviceName[0] == 'T')
|
||||||
|
{
|
||||||
|
bTCC = 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
int computeMode;
|
||||||
|
getCudaAttribute<int>(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, current_device);
|
||||||
|
|
||||||
|
if (computeMode != CU_COMPUTEMODE_PROHIBITED)
|
||||||
|
{
|
||||||
|
if (major == 9999 && minor == 9999)
|
||||||
|
{
|
||||||
|
sm_per_multiproc = 1;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
sm_per_multiproc = _ConvertSMVer2CoresDRV(major, minor);
|
||||||
|
}
|
||||||
|
|
||||||
|
// If this is a Tesla based GPU and SM 2.0, and TCC is disabled, this is a contendor
|
||||||
|
if (!bTCC) // Is this GPU running the TCC driver? If so we pass on this
|
||||||
|
{
|
||||||
|
int compute_perf = multiProcessorCount * sm_per_multiproc * clockRate;
|
||||||
|
|
||||||
|
if (compute_perf > max_compute_perf)
|
||||||
|
{
|
||||||
|
// If we find GPU with SM major > 2, search only these
|
||||||
|
if (best_SM_arch > 2)
|
||||||
|
{
|
||||||
|
// If our device = dest_SM_arch, then we pick this one
|
||||||
|
if (major == best_SM_arch)
|
||||||
|
{
|
||||||
|
max_compute_perf = compute_perf;
|
||||||
|
max_perf_device = current_device;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
max_compute_perf = compute_perf;
|
||||||
|
max_perf_device = current_device;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
++current_device;
|
||||||
|
}
|
||||||
|
|
||||||
|
return max_perf_device;
|
||||||
|
}
|
||||||
|
|
||||||
|
// General initialization call to pick the best CUDA Device
|
||||||
|
inline CUdevice findCudaDeviceDRV(int argc, const char **argv)
|
||||||
|
{
|
||||||
|
CUdevice cuDevice;
|
||||||
|
int devID = 0;
|
||||||
|
|
||||||
|
// If the command-line has a device number specified, use it
|
||||||
|
if (checkCmdLineFlag(argc, (const char **)argv, "device"))
|
||||||
|
{
|
||||||
|
devID = gpuDeviceInitDRV(argc, argv);
|
||||||
|
|
||||||
|
if (devID < 0)
|
||||||
|
{
|
||||||
|
printf("exiting...\n");
|
||||||
|
exit(EXIT_SUCCESS);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// Otherwise pick the device with highest Gflops/s
|
||||||
|
char name[100];
|
||||||
|
devID = gpuGetMaxGflopsDeviceIdDRV();
|
||||||
|
checkCudaErrors(cuDeviceGet(&cuDevice, devID));
|
||||||
|
cuDeviceGetName(name, 100, cuDevice);
|
||||||
|
printf("> Using CUDA Device [%d]: %s\n", devID, name);
|
||||||
|
}
|
||||||
|
|
||||||
|
cuDeviceGet(&cuDevice, devID);
|
||||||
|
|
||||||
|
return cuDevice;
|
||||||
|
}
|
||||||
|
|
||||||
|
// This function will pick the best CUDA device available with OpenGL interop
|
||||||
|
inline CUdevice findCudaGLDeviceDRV(int argc, const char **argv)
|
||||||
|
{
|
||||||
|
CUdevice cuDevice;
|
||||||
|
int devID = 0;
|
||||||
|
|
||||||
|
// If the command-line has a device number specified, use it
|
||||||
|
if (checkCmdLineFlag(argc, (const char **)argv, "device"))
|
||||||
|
{
|
||||||
|
devID = gpuDeviceInitDRV(argc, (const char **)argv);
|
||||||
|
|
||||||
|
if (devID < 0)
|
||||||
|
{
|
||||||
|
printf("no CUDA capable devices found, exiting...\n");
|
||||||
|
exit(EXIT_SUCCESS);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
char name[100];
|
||||||
|
// Otherwise pick the device with highest Gflops/s
|
||||||
|
devID = gpuGetMaxGflopsGLDeviceIdDRV();
|
||||||
|
checkCudaErrors(cuDeviceGet(&cuDevice, devID));
|
||||||
|
cuDeviceGetName(name, 100, cuDevice);
|
||||||
|
printf("> Using CUDA/GL Device [%d]: %s\n", devID, name);
|
||||||
|
}
|
||||||
|
|
||||||
|
return devID;
|
||||||
|
}
|
||||||
|
|
||||||
|
// General check for CUDA GPU SM Capabilities
|
||||||
|
inline bool checkCudaCapabilitiesDRV(int major_version, int minor_version, int devID)
|
||||||
|
{
|
||||||
|
CUdevice cuDevice;
|
||||||
|
char name[256];
|
||||||
|
int major = 0, minor = 0;
|
||||||
|
|
||||||
|
checkCudaErrors(cuDeviceGet(&cuDevice, devID));
|
||||||
|
checkCudaErrors(cuDeviceGetName(name, 100, cuDevice));
|
||||||
|
checkCudaErrors(cuDeviceComputeCapability(&major, &minor, devID));
|
||||||
|
|
||||||
|
if ((major > major_version) ||
|
||||||
|
(major == major_version && minor >= minor_version))
|
||||||
|
{
|
||||||
|
printf("> Device %d: <%16s >, Compute SM %d.%d detected\n", devID, name, major, minor);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
printf("No GPU device was found that can support CUDA compute capability %d.%d.\n", major_version, minor_version);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// end of CUDA Helper Functions
|
||||||
|
|
||||||
|
#endif
|
|
@ -0,0 +1,165 @@
|
||||||
|
/**
|
||||||
|
* Copyright 1993-2013 NVIDIA Corporation. All rights reserved.
|
||||||
|
*
|
||||||
|
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||||
|
* with this source code for terms and conditions that govern your use of
|
||||||
|
* this software. Any use, reproduction, disclosure, or distribution of
|
||||||
|
* this software and related documentation outside the terms of the EULA
|
||||||
|
* is strictly prohibited.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef HELPER_CUDA_GL_H
|
||||||
|
#define HELPER_CUDA_GL_H
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
|
||||||
|
// includes, graphics
|
||||||
|
#if defined (__APPLE__) || defined(MACOSX)
|
||||||
|
#include <OpenGL/gl.h>
|
||||||
|
#include <OpenGL/glu.h>
|
||||||
|
#else
|
||||||
|
#include <GL/gl.h>
|
||||||
|
#include <GL/glu.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef EXIT_WAIVED
|
||||||
|
#define EXIT_WAIVED 2
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef __DRIVER_TYPES_H__
|
||||||
|
#ifndef DEVICE_RESET
|
||||||
|
#define DEVICE_RESET cudaDeviceReset()
|
||||||
|
#endif
|
||||||
|
#else
|
||||||
|
#ifndef DEVICE_RESET
|
||||||
|
#define DEVICE_RESET
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef __CUDA_GL_INTEROP_H__
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// These are CUDA OpenGL Helper functions
|
||||||
|
|
||||||
|
inline int gpuGLDeviceInit(int ARGC, const char **ARGV)
|
||||||
|
{
|
||||||
|
int deviceCount;
|
||||||
|
checkCudaErrors(cudaGetDeviceCount(&deviceCount));
|
||||||
|
|
||||||
|
if (deviceCount == 0)
|
||||||
|
{
|
||||||
|
fprintf(stderr, "CUDA error: no devices supporting CUDA.\n");
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
|
||||||
|
int dev = 0;
|
||||||
|
dev = getCmdLineArgumentInt(ARGC, ARGV, "device=");
|
||||||
|
|
||||||
|
if (dev < 0)
|
||||||
|
{
|
||||||
|
dev = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (dev > deviceCount-1)
|
||||||
|
{
|
||||||
|
fprintf(stderr, "\n");
|
||||||
|
fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n", deviceCount);
|
||||||
|
fprintf(stderr, ">> gpuGLDeviceInit (-device=%d) is not a valid GPU device. <<\n", dev);
|
||||||
|
fprintf(stderr, "\n");
|
||||||
|
return -dev;
|
||||||
|
}
|
||||||
|
|
||||||
|
cudaDeviceProp deviceProp;
|
||||||
|
checkCudaErrors(cudaGetDeviceProperties(&deviceProp, dev));
|
||||||
|
|
||||||
|
if (deviceProp.computeMode == cudaComputeModeProhibited)
|
||||||
|
{
|
||||||
|
fprintf(stderr, "Error: device is running in <Compute Mode Prohibited>, no threads can use ::cudaSetDevice().\n");
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (deviceProp.major < 1)
|
||||||
|
{
|
||||||
|
fprintf(stderr, "Error: device does not support CUDA.\n");
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (checkCmdLineFlag(ARGC, ARGV, "quiet") == false)
|
||||||
|
{
|
||||||
|
fprintf(stderr, "Using device %d: %s\n", dev, deviceProp.name);
|
||||||
|
}
|
||||||
|
|
||||||
|
checkCudaErrors(cudaGLSetGLDevice(dev));
|
||||||
|
return dev;
|
||||||
|
}
|
||||||
|
|
||||||
|
// This function will pick the best CUDA device available with OpenGL interop
|
||||||
|
inline int findCudaGLDevice(int argc, const char **argv)
|
||||||
|
{
|
||||||
|
int devID = 0;
|
||||||
|
|
||||||
|
// If the command-line has a device number specified, use it
|
||||||
|
if (checkCmdLineFlag(argc, (const char **)argv, "device"))
|
||||||
|
{
|
||||||
|
devID = gpuGLDeviceInit(argc, (const char **)argv);
|
||||||
|
|
||||||
|
if (devID < 0)
|
||||||
|
{
|
||||||
|
printf("no CUDA capable devices found, exiting...\n");
|
||||||
|
DEVICE_RESET
|
||||||
|
exit(EXIT_SUCCESS);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// Otherwise pick the device with highest Gflops/s
|
||||||
|
devID = gpuGetMaxGflopsDeviceId();
|
||||||
|
cudaGLSetGLDevice(devID);
|
||||||
|
}
|
||||||
|
|
||||||
|
return devID;
|
||||||
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Check for OpenGL error
|
||||||
|
//! @return bool if no GL error has been encountered, otherwise 0
|
||||||
|
//! @param file __FILE__ macro
|
||||||
|
//! @param line __LINE__ macro
|
||||||
|
//! @note The GL error is listed on stderr
|
||||||
|
//! @note This function should be used via the CHECK_ERROR_GL() macro
|
||||||
|
////////////////////////////////////////////////////////////////////////////
|
||||||
|
inline bool
|
||||||
|
sdkCheckErrorGL(const char *file, const int line)
|
||||||
|
{
|
||||||
|
bool ret_val = true;
|
||||||
|
|
||||||
|
// check for error
|
||||||
|
GLenum gl_error = glGetError();
|
||||||
|
|
||||||
|
if (gl_error != GL_NO_ERROR)
|
||||||
|
{
|
||||||
|
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
|
||||||
|
char tmpStr[512];
|
||||||
|
// NOTE: "%s(%i) : " allows Visual Studio to directly jump to the file at the right line
|
||||||
|
// when the user double clicks on the error line in the Output pane. Like any compile error.
|
||||||
|
sprintf_s(tmpStr, 255, "\n%s(%i) : GL Error : %s\n\n", file, line, gluErrorString(gl_error));
|
||||||
|
fprintf(stderr, "%s", tmpStr);
|
||||||
|
#endif
|
||||||
|
fprintf(stderr, "GL Error in file '%s' in line %d :\n", file, line);
|
||||||
|
fprintf(stderr, "%s\n", gluErrorString(gl_error));
|
||||||
|
ret_val = false;
|
||||||
|
}
|
||||||
|
|
||||||
|
return ret_val;
|
||||||
|
}
|
||||||
|
|
||||||
|
#define SDK_CHECK_ERROR_GL() \
|
||||||
|
if( false == sdkCheckErrorGL( __FILE__, __LINE__)) { \
|
||||||
|
DEVICE_RESET \
|
||||||
|
exit(EXIT_FAILURE); \
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#endif
|
|
@ -0,0 +1,42 @@
|
||||||
|
/**
|
||||||
|
* Copyright 1993-2013 NVIDIA Corporation. All rights reserved.
|
||||||
|
*
|
||||||
|
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||||
|
* with this source code for terms and conditions that govern your use of
|
||||||
|
* this software. Any use, reproduction, disclosure, or distribution of
|
||||||
|
* this software and related documentation outside the terms of the EULA
|
||||||
|
* is strictly prohibited.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
// These are helper functions for the SDK samples (string parsing, timers, image helpers, etc)
|
||||||
|
#ifndef HELPER_FUNCTIONS_H
|
||||||
|
#define HELPER_FUNCTIONS_H
|
||||||
|
|
||||||
|
#ifdef WIN32
|
||||||
|
#pragma warning(disable:4996)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// includes, project
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <string>
|
||||||
|
#include <assert.h>
|
||||||
|
#include <exception.h>
|
||||||
|
#include <math.h>
|
||||||
|
|
||||||
|
#include <fstream>
|
||||||
|
#include <vector>
|
||||||
|
#include <iostream>
|
||||||
|
#include <algorithm>
|
||||||
|
|
||||||
|
// includes, timer, string parsing, image helpers
|
||||||
|
#include <helper_timer.h> // helper functions for timers
|
||||||
|
#include <helper_string.h> // helper functions for string parsing
|
||||||
|
#include <helper_image.h> // helper functions for image compare, dump, data comparisons
|
||||||
|
|
||||||
|
#ifndef EXIT_WAIVED
|
||||||
|
#define EXIT_WAIVED 2
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#endif // HELPER_FUNCTIONS_H
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,516 @@
|
||||||
|
/**
|
||||||
|
* Copyright 1993-2013 NVIDIA Corporation. All rights reserved.
|
||||||
|
*
|
||||||
|
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||||
|
* with this source code for terms and conditions that govern your use of
|
||||||
|
* this software. Any use, reproduction, disclosure, or distribution of
|
||||||
|
* this software and related documentation outside the terms of the EULA
|
||||||
|
* is strictly prohibited.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
// These are helper functions for the SDK samples (string parsing, timers, etc)
|
||||||
|
#ifndef STRING_HELPER_H
|
||||||
|
#define STRING_HELPER_H
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <fstream>
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
|
||||||
|
#ifndef _CRT_SECURE_NO_DEPRECATE
|
||||||
|
#define _CRT_SECURE_NO_DEPRECATE
|
||||||
|
#endif
|
||||||
|
#ifndef STRCASECMP
|
||||||
|
#define STRCASECMP _stricmp
|
||||||
|
#endif
|
||||||
|
#ifndef STRNCASECMP
|
||||||
|
#define STRNCASECMP _strnicmp
|
||||||
|
#endif
|
||||||
|
#ifndef STRCPY
|
||||||
|
#define STRCPY(sFilePath, nLength, sPath) strcpy_s(sFilePath, nLength, sPath)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef FOPEN
|
||||||
|
#define FOPEN(fHandle,filename,mode) fopen_s(&fHandle, filename, mode)
|
||||||
|
#endif
|
||||||
|
#ifndef FOPEN_FAIL
|
||||||
|
#define FOPEN_FAIL(result) (result != 0)
|
||||||
|
#endif
|
||||||
|
#ifndef SSCANF
|
||||||
|
#define SSCANF sscanf_s
|
||||||
|
#endif
|
||||||
|
#ifndef SPRINTF
|
||||||
|
#define SPRINTF sprintf_s
|
||||||
|
#endif
|
||||||
|
#else // Linux Includes
|
||||||
|
#include <string.h>
|
||||||
|
#include <strings.h>
|
||||||
|
|
||||||
|
#ifndef STRCASECMP
|
||||||
|
#define STRCASECMP strcasecmp
|
||||||
|
#endif
|
||||||
|
#ifndef STRNCASECMP
|
||||||
|
#define STRNCASECMP strncasecmp
|
||||||
|
#endif
|
||||||
|
#ifndef STRCPY
|
||||||
|
#define STRCPY(sFilePath, nLength, sPath) strcpy(sFilePath, sPath)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef FOPEN
|
||||||
|
#define FOPEN(fHandle,filename,mode) (fHandle = fopen(filename, mode))
|
||||||
|
#endif
|
||||||
|
#ifndef FOPEN_FAIL
|
||||||
|
#define FOPEN_FAIL(result) (result == NULL)
|
||||||
|
#endif
|
||||||
|
#ifndef SSCANF
|
||||||
|
#define SSCANF sscanf
|
||||||
|
#endif
|
||||||
|
#ifndef SPRINTF
|
||||||
|
#define SPRINTF sprintf
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef EXIT_WAIVED
|
||||||
|
#define EXIT_WAIVED 2
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// CUDA Utility Helper Functions
|
||||||
|
inline int stringRemoveDelimiter(char delimiter, const char *string)
|
||||||
|
{
|
||||||
|
int string_start = 0;
|
||||||
|
|
||||||
|
while (string[string_start] == delimiter)
|
||||||
|
{
|
||||||
|
string_start++;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (string_start >= (int)strlen(string)-1)
|
||||||
|
{
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
return string_start;
|
||||||
|
}
|
||||||
|
|
||||||
|
inline int getFileExtension(char *filename, char **extension)
|
||||||
|
{
|
||||||
|
int string_length = (int)strlen(filename);
|
||||||
|
|
||||||
|
while (filename[string_length--] != '.')
|
||||||
|
{
|
||||||
|
if (string_length == 0)
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (string_length > 0) string_length += 2;
|
||||||
|
|
||||||
|
if (string_length == 0)
|
||||||
|
*extension = NULL;
|
||||||
|
else
|
||||||
|
*extension = &filename[string_length];
|
||||||
|
|
||||||
|
return string_length;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
inline bool checkCmdLineFlag(const int argc, const char **argv, const char *string_ref)
|
||||||
|
{
|
||||||
|
bool bFound = false;
|
||||||
|
|
||||||
|
if (argc >= 1)
|
||||||
|
{
|
||||||
|
for (int i=1; i < argc; i++)
|
||||||
|
{
|
||||||
|
int string_start = stringRemoveDelimiter('-', argv[i]);
|
||||||
|
const char *string_argv = &argv[i][string_start];
|
||||||
|
|
||||||
|
const char *equal_pos = strchr(string_argv, '=');
|
||||||
|
int argv_length = (int)(equal_pos == 0 ? strlen(string_argv) : equal_pos - string_argv);
|
||||||
|
|
||||||
|
int length = (int)strlen(string_ref);
|
||||||
|
|
||||||
|
if (length == argv_length && !STRNCASECMP(string_argv, string_ref, length))
|
||||||
|
{
|
||||||
|
bFound = true;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return bFound;
|
||||||
|
}
|
||||||
|
|
||||||
|
// This function wraps the CUDA Driver API into a template function
|
||||||
|
template <class T>
|
||||||
|
inline bool getCmdLineArgumentValue(const int argc, const char **argv, const char *string_ref, T *value)
|
||||||
|
{
|
||||||
|
bool bFound = false;
|
||||||
|
|
||||||
|
if (argc >= 1)
|
||||||
|
{
|
||||||
|
for (int i=1; i < argc; i++)
|
||||||
|
{
|
||||||
|
int string_start = stringRemoveDelimiter('-', argv[i]);
|
||||||
|
const char *string_argv = &argv[i][string_start];
|
||||||
|
int length = (int)strlen(string_ref);
|
||||||
|
|
||||||
|
if (!STRNCASECMP(string_argv, string_ref, length))
|
||||||
|
{
|
||||||
|
if (length+1 <= (int)strlen(string_argv))
|
||||||
|
{
|
||||||
|
int auto_inc = (string_argv[length] == '=') ? 1 : 0;
|
||||||
|
*value = (T)atoi(&string_argv[length + auto_inc]);
|
||||||
|
}
|
||||||
|
|
||||||
|
bFound = true;
|
||||||
|
i=argc;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return bFound;
|
||||||
|
}
|
||||||
|
|
||||||
|
inline int getCmdLineArgumentInt(const int argc, const char **argv, const char *string_ref)
|
||||||
|
{
|
||||||
|
bool bFound = false;
|
||||||
|
int value = -1;
|
||||||
|
|
||||||
|
if (argc >= 1)
|
||||||
|
{
|
||||||
|
for (int i=1; i < argc; i++)
|
||||||
|
{
|
||||||
|
int string_start = stringRemoveDelimiter('-', argv[i]);
|
||||||
|
const char *string_argv = &argv[i][string_start];
|
||||||
|
int length = (int)strlen(string_ref);
|
||||||
|
|
||||||
|
if (!STRNCASECMP(string_argv, string_ref, length))
|
||||||
|
{
|
||||||
|
if (length+1 <= (int)strlen(string_argv))
|
||||||
|
{
|
||||||
|
int auto_inc = (string_argv[length] == '=') ? 1 : 0;
|
||||||
|
value = atoi(&string_argv[length + auto_inc]);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
value = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
bFound = true;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (bFound)
|
||||||
|
{
|
||||||
|
return value;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
inline float getCmdLineArgumentFloat(const int argc, const char **argv, const char *string_ref)
|
||||||
|
{
|
||||||
|
bool bFound = false;
|
||||||
|
float value = -1;
|
||||||
|
|
||||||
|
if (argc >= 1)
|
||||||
|
{
|
||||||
|
for (int i=1; i < argc; i++)
|
||||||
|
{
|
||||||
|
int string_start = stringRemoveDelimiter('-', argv[i]);
|
||||||
|
const char *string_argv = &argv[i][string_start];
|
||||||
|
int length = (int)strlen(string_ref);
|
||||||
|
|
||||||
|
if (!STRNCASECMP(string_argv, string_ref, length))
|
||||||
|
{
|
||||||
|
if (length+1 <= (int)strlen(string_argv))
|
||||||
|
{
|
||||||
|
int auto_inc = (string_argv[length] == '=') ? 1 : 0;
|
||||||
|
value = (float)atof(&string_argv[length + auto_inc]);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
value = 0.f;
|
||||||
|
}
|
||||||
|
|
||||||
|
bFound = true;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (bFound)
|
||||||
|
{
|
||||||
|
return value;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
inline bool getCmdLineArgumentString(const int argc, const char **argv,
|
||||||
|
const char *string_ref, char **string_retval)
|
||||||
|
{
|
||||||
|
bool bFound = false;
|
||||||
|
|
||||||
|
if (argc >= 1)
|
||||||
|
{
|
||||||
|
for (int i=1; i < argc; i++)
|
||||||
|
{
|
||||||
|
int string_start = stringRemoveDelimiter('-', argv[i]);
|
||||||
|
char *string_argv = (char *)&argv[i][string_start];
|
||||||
|
int length = (int)strlen(string_ref);
|
||||||
|
|
||||||
|
if (!STRNCASECMP(string_argv, string_ref, length))
|
||||||
|
{
|
||||||
|
*string_retval = &string_argv[length+1];
|
||||||
|
bFound = true;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!bFound)
|
||||||
|
{
|
||||||
|
*string_retval = NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
return bFound;
|
||||||
|
}
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Find the path for a file assuming that
|
||||||
|
//! files are found in the searchPath.
|
||||||
|
//!
|
||||||
|
//! @return the path if succeeded, otherwise 0
|
||||||
|
//! @param filename name of the file
|
||||||
|
//! @param executable_path optional absolute path of the executable
|
||||||
|
//////////////////////////////////////////////////////////////////////////////
|
||||||
|
inline char *sdkFindFilePath(const char *filename, const char *executable_path)
|
||||||
|
{
|
||||||
|
// <executable_name> defines a variable that is replaced with the name of the executable
|
||||||
|
|
||||||
|
// Typical relative search paths to locate needed companion files (e.g. sample input data, or JIT source files)
|
||||||
|
// The origin for the relative search may be the .exe file, a .bat file launching an .exe, a browser .exe launching the .exe or .bat, etc
|
||||||
|
const char *searchPath[] =
|
||||||
|
{
|
||||||
|
"./", // same dir
|
||||||
|
"./common/", // "/common/" subdir
|
||||||
|
"./common/data/", // "/common/data/" subdir
|
||||||
|
"./data/", // "/data/" subdir
|
||||||
|
"./src/", // "/src/" subdir
|
||||||
|
"./src/<executable_name>/data/", // "/src/<executable_name>/data/" subdir
|
||||||
|
"./inc/", // "/inc/" subdir
|
||||||
|
"./0_Simple/", // "/0_Simple/" subdir
|
||||||
|
"./1_Utilities/", // "/1_Utilities/" subdir
|
||||||
|
"./2_Graphics/", // "/2_Graphics/" subdir
|
||||||
|
"./3_Imaging/", // "/3_Imaging/" subdir
|
||||||
|
"./4_Finance/", // "/4_Finance/" subdir
|
||||||
|
"./5_Simulations/", // "/5_Simulations/" subdir
|
||||||
|
"./6_Advanced/", // "/6_Advanced/" subdir
|
||||||
|
"./7_CUDALibraries/", // "/7_CUDALibraries/" subdir
|
||||||
|
"./8_Android/", // "/8_Android/" subdir
|
||||||
|
"./samples/", // "/samples/" subdir
|
||||||
|
|
||||||
|
"../", // up 1 in tree
|
||||||
|
"../common/", // up 1 in tree, "/common/" subdir
|
||||||
|
"../common/data/", // up 1 in tree, "/common/data/" subdir
|
||||||
|
"../data/", // up 1 in tree, "/data/" subdir
|
||||||
|
"../src/", // up 1 in tree, "/src/" subdir
|
||||||
|
"../inc/", // up 1 in tree, "/inc/" subdir
|
||||||
|
|
||||||
|
"../0_Simple/<executable_name>/data/", // up 1 in tree, "/0_Simple/<executable_name>/" subdir
|
||||||
|
"../1_Utilities/<executable_name>/data/", // up 1 in tree, "/1_Utilities/<executable_name>/" subdir
|
||||||
|
"../2_Graphics/<executable_name>/data/", // up 1 in tree, "/2_Graphics/<executable_name>/" subdir
|
||||||
|
"../3_Imaging/<executable_name>/data/", // up 1 in tree, "/3_Imaging/<executable_name>/" subdir
|
||||||
|
"../4_Finance/<executable_name>/data/", // up 1 in tree, "/4_Finance/<executable_name>/" subdir
|
||||||
|
"../5_Simulations/<executable_name>/data/", // up 1 in tree, "/5_Simulations/<executable_name>/" subdir
|
||||||
|
"../6_Advanced/<executable_name>/data/", // up 1 in tree, "/6_Advanced/<executable_name>/" subdir
|
||||||
|
"../7_CUDALibraries/<executable_name>/data/",// up 1 in tree, "/7_CUDALibraries/<executable_name>/" subdir
|
||||||
|
"../8_Android/<executable_name>/data/", // up 1 in tree, "/8_Android/<executable_name>/" subdir
|
||||||
|
"../samples/<executable_name>/data/", // up 1 in tree, "/samples/<executable_name>/" subdir
|
||||||
|
"../../", // up 2 in tree
|
||||||
|
"../../common/", // up 2 in tree, "/common/" subdir
|
||||||
|
"../../common/data/", // up 2 in tree, "/common/data/" subdir
|
||||||
|
"../../data/", // up 2 in tree, "/data/" subdir
|
||||||
|
"../../src/", // up 2 in tree, "/src/" subdir
|
||||||
|
"../../inc/", // up 2 in tree, "/inc/" subdir
|
||||||
|
"../../sandbox/<executable_name>/data/", // up 2 in tree, "/sandbox/<executable_name>/" subdir
|
||||||
|
"../../0_Simple/<executable_name>/data/", // up 2 in tree, "/0_Simple/<executable_name>/" subdir
|
||||||
|
"../../1_Utilities/<executable_name>/data/", // up 2 in tree, "/1_Utilities/<executable_name>/" subdir
|
||||||
|
"../../2_Graphics/<executable_name>/data/", // up 2 in tree, "/2_Graphics/<executable_name>/" subdir
|
||||||
|
"../../3_Imaging/<executable_name>/data/", // up 2 in tree, "/3_Imaging/<executable_name>/" subdir
|
||||||
|
"../../4_Finance/<executable_name>/data/", // up 2 in tree, "/4_Finance/<executable_name>/" subdir
|
||||||
|
"../../5_Simulations/<executable_name>/data/", // up 2 in tree, "/5_Simulations/<executable_name>/" subdir
|
||||||
|
"../../6_Advanced/<executable_name>/data/", // up 2 in tree, "/6_Advanced/<executable_name>/" subdir
|
||||||
|
"../../7_CUDALibraries/<executable_name>/data/", // up 2 in tree, "/7_CUDALibraries/<executable_name>/" subdir
|
||||||
|
"../../8_Android/<executable_name>/data/", // up 2 in tree, "/8_Android/<executable_name>/" subdir
|
||||||
|
"../../samples/<executable_name>/data/", // up 2 in tree, "/samples/<executable_name>/" subdir
|
||||||
|
"../../../", // up 3 in tree
|
||||||
|
"../../../src/<executable_name>/", // up 3 in tree, "/src/<executable_name>/" subdir
|
||||||
|
"../../../src/<executable_name>/data/", // up 3 in tree, "/src/<executable_name>/data/" subdir
|
||||||
|
"../../../src/<executable_name>/src/", // up 3 in tree, "/src/<executable_name>/src/" subdir
|
||||||
|
"../../../src/<executable_name>/inc/", // up 3 in tree, "/src/<executable_name>/inc/" subdir
|
||||||
|
"../../../sandbox/<executable_name>/", // up 3 in tree, "/sandbox/<executable_name>/" subdir
|
||||||
|
"../../../sandbox/<executable_name>/data/", // up 3 in tree, "/sandbox/<executable_name>/data/" subdir
|
||||||
|
"../../../sandbox/<executable_name>/src/", // up 3 in tree, "/sandbox/<executable_name>/src/" subdir
|
||||||
|
"../../../sandbox/<executable_name>/inc/", // up 3 in tree, "/sandbox/<executable_name>/inc/" subdir
|
||||||
|
"../../../0_Simple/<executable_name>/data/", // up 3 in tree, "/0_Simple/<executable_name>/" subdir
|
||||||
|
"../../../1_Utilities/<executable_name>/data/", // up 3 in tree, "/1_Utilities/<executable_name>/" subdir
|
||||||
|
"../../../2_Graphics/<executable_name>/data/", // up 3 in tree, "/2_Graphics/<executable_name>/" subdir
|
||||||
|
"../../../3_Imaging/<executable_name>/data/", // up 3 in tree, "/3_Imaging/<executable_name>/" subdir
|
||||||
|
"../../../4_Finance/<executable_name>/data/", // up 3 in tree, "/4_Finance/<executable_name>/" subdir
|
||||||
|
"../../../5_Simulations/<executable_name>/data/", // up 3 in tree, "/5_Simulations/<executable_name>/" subdir
|
||||||
|
"../../../6_Advanced/<executable_name>/data/", // up 3 in tree, "/6_Advanced/<executable_name>/" subdir
|
||||||
|
"../../../7_CUDALibraries/<executable_name>/data/", // up 3 in tree, "/7_CUDALibraries/<executable_name>/" subdir
|
||||||
|
"../../../8_Android/<executable_name>/data/", // up 3 in tree, "/8_Android/<executable_name>/" subdir
|
||||||
|
"../../../0_Simple/<executable_name>/", // up 3 in tree, "/0_Simple/<executable_name>/" subdir
|
||||||
|
"../../../1_Utilities/<executable_name>/", // up 3 in tree, "/1_Utilities/<executable_name>/" subdir
|
||||||
|
"../../../2_Graphics/<executable_name>/", // up 3 in tree, "/2_Graphics/<executable_name>/" subdir
|
||||||
|
"../../../3_Imaging/<executable_name>/", // up 3 in tree, "/3_Imaging/<executable_name>/" subdir
|
||||||
|
"../../../4_Finance/<executable_name>/", // up 3 in tree, "/4_Finance/<executable_name>/" subdir
|
||||||
|
"../../../5_Simulations/<executable_name>/", // up 3 in tree, "/5_Simulations/<executable_name>/" subdir
|
||||||
|
"../../../6_Advanced/<executable_name>/", // up 3 in tree, "/6_Advanced/<executable_name>/" subdir
|
||||||
|
"../../../7_CUDALibraries/<executable_name>/", // up 3 in tree, "/7_CUDALibraries/<executable_name>/" subdir
|
||||||
|
"../../../8_Android/<executable_name>/", // up 3 in tree, "/8_Android/<executable_name>/" subdir
|
||||||
|
"../../../samples/<executable_name>/data/", // up 3 in tree, "/samples/<executable_name>/" subdir
|
||||||
|
"../../../common/", // up 3 in tree, "../../../common/" subdir
|
||||||
|
"../../../common/data/", // up 3 in tree, "../../../common/data/" subdir
|
||||||
|
"../../../data/", // up 3 in tree, "../../../data/" subdir
|
||||||
|
"../../../../", // up 4 in tree
|
||||||
|
"../../../../src/<executable_name>/", // up 4 in tree, "/src/<executable_name>/" subdir
|
||||||
|
"../../../../src/<executable_name>/data/", // up 4 in tree, "/src/<executable_name>/data/" subdir
|
||||||
|
"../../../../src/<executable_name>/src/", // up 4 in tree, "/src/<executable_name>/src/" subdir
|
||||||
|
"../../../../src/<executable_name>/inc/", // up 4 in tree, "/src/<executable_name>/inc/" subdir
|
||||||
|
"../../../../sandbox/<executable_name>/", // up 4 in tree, "/sandbox/<executable_name>/" subdir
|
||||||
|
"../../../../sandbox/<executable_name>/data/", // up 4 in tree, "/sandbox/<executable_name>/data/" subdir
|
||||||
|
"../../../../sandbox/<executable_name>/src/", // up 4 in tree, "/sandbox/<executable_name>/src/" subdir
|
||||||
|
"../../../../sandbox/<executable_name>/inc/", // up 4 in tree, "/sandbox/<executable_name>/inc/" subdir
|
||||||
|
"../../../../0_Simple/<executable_name>/data/", // up 4 in tree, "/0_Simple/<executable_name>/" subdir
|
||||||
|
"../../../../1_Utilities/<executable_name>/data/", // up 4 in tree, "/1_Utilities/<executable_name>/" subdir
|
||||||
|
"../../../../2_Graphics/<executable_name>/data/", // up 4 in tree, "/2_Graphics/<executable_name>/" subdir
|
||||||
|
"../../../../3_Imaging/<executable_name>/data/", // up 4 in tree, "/3_Imaging/<executable_name>/" subdir
|
||||||
|
"../../../../4_Finance/<executable_name>/data/", // up 4 in tree, "/4_Finance/<executable_name>/" subdir
|
||||||
|
"../../../../5_Simulations/<executable_name>/data/",// up 4 in tree, "/5_Simulations/<executable_name>/" subdir
|
||||||
|
"../../../../6_Advanced/<executable_name>/data/", // up 4 in tree, "/6_Advanced/<executable_name>/" subdir
|
||||||
|
"../../../../7_CUDALibraries/<executable_name>/data/", // up 4 in tree, "/7_CUDALibraries/<executable_name>/" subdir
|
||||||
|
"../../../../8_Android/<executable_name>/data/", // up 4 in tree, "/8_Android/<executable_name>/" subdir
|
||||||
|
"../../../../0_Simple/<executable_name>/", // up 4 in tree, "/0_Simple/<executable_name>/" subdir
|
||||||
|
"../../../../1_Utilities/<executable_name>/", // up 4 in tree, "/1_Utilities/<executable_name>/" subdir
|
||||||
|
"../../../../2_Graphics/<executable_name>/", // up 4 in tree, "/2_Graphics/<executable_name>/" subdir
|
||||||
|
"../../../../3_Imaging/<executable_name>/", // up 4 in tree, "/3_Imaging/<executable_name>/" subdir
|
||||||
|
"../../../../4_Finance/<executable_name>/", // up 4 in tree, "/4_Finance/<executable_name>/" subdir
|
||||||
|
"../../../../5_Simulations/<executable_name>/",// up 4 in tree, "/5_Simulations/<executable_name>/" subdir
|
||||||
|
"../../../../6_Advanced/<executable_name>/", // up 4 in tree, "/6_Advanced/<executable_name>/" subdir
|
||||||
|
"../../../../7_CUDALibraries/<executable_name>/", // up 4 in tree, "/7_CUDALibraries/<executable_name>/" subdir
|
||||||
|
"../../../../8_Android/<executable_name>/", // up 4 in tree, "/8_Android/<executable_name>/" subdir
|
||||||
|
"../../../../samples/<executable_name>/data/", // up 4 in tree, "/samples/<executable_name>/" subdir
|
||||||
|
"../../../../common/", // up 4 in tree, "../../../common/" subdir
|
||||||
|
"../../../../common/data/", // up 4 in tree, "../../../common/data/" subdir
|
||||||
|
"../../../../data/", // up 4 in tree, "../../../data/" subdir
|
||||||
|
"../../../../../", // up 5 in tree
|
||||||
|
"../../../../../src/<executable_name>/", // up 5 in tree, "/src/<executable_name>/" subdir
|
||||||
|
"../../../../../src/<executable_name>/data/", // up 5 in tree, "/src/<executable_name>/data/" subdir
|
||||||
|
"../../../../../src/<executable_name>/src/", // up 5 in tree, "/src/<executable_name>/src/" subdir
|
||||||
|
"../../../../../src/<executable_name>/inc/", // up 5 in tree, "/src/<executable_name>/inc/" subdir
|
||||||
|
"../../../../../sandbox/<executable_name>/", // up 5 in tree, "/sandbox/<executable_name>/" subdir
|
||||||
|
"../../../../../sandbox/<executable_name>/data/", // up 5 in tree, "/sandbox/<executable_name>/data/" subdir
|
||||||
|
"../../../../../sandbox/<executable_name>/src/", // up 5 in tree, "/sandbox/<executable_name>/src/" subdir
|
||||||
|
"../../../../../sandbox/<executable_name>/inc/", // up 5 in tree, "/sandbox/<executable_name>/inc/" subdir
|
||||||
|
"../../../../../0_Simple/<executable_name>/data/", // up 5 in tree, "/0_Simple/<executable_name>/" subdir
|
||||||
|
"../../../../../1_Utilities/<executable_name>/data/", // up 5 in tree, "/1_Utilities/<executable_name>/" subdir
|
||||||
|
"../../../../../2_Graphics/<executable_name>/data/", // up 5 in tree, "/2_Graphics/<executable_name>/" subdir
|
||||||
|
"../../../../../3_Imaging/<executable_name>/data/", // up 5 in tree, "/3_Imaging/<executable_name>/" subdir
|
||||||
|
"../../../../../4_Finance/<executable_name>/data/", // up 5 in tree, "/4_Finance/<executable_name>/" subdir
|
||||||
|
"../../../../../5_Simulations/<executable_name>/data/",// up 5 in tree, "/5_Simulations/<executable_name>/" subdir
|
||||||
|
"../../../../../6_Advanced/<executable_name>/data/", // up 5 in tree, "/6_Advanced/<executable_name>/" subdir
|
||||||
|
"../../../../../7_CUDALibraries/<executable_name>/data/", // up 5 in tree, "/7_CUDALibraries/<executable_name>/" subdir
|
||||||
|
"../../../../../8_Android/<executable_name>/data/", // up 5 in tree, "/8_Android/<executable_name>/" subdir
|
||||||
|
"../../../../../samples/<executable_name>/data/", // up 5 in tree, "/samples/<executable_name>/" subdir
|
||||||
|
"../../../../../common/", // up 5 in tree, "../../../common/" subdir
|
||||||
|
"../../../../../common/data/", // up 5 in tree, "../../../common/data/" subdir
|
||||||
|
};
|
||||||
|
|
||||||
|
// Extract the executable name
|
||||||
|
std::string executable_name;
|
||||||
|
|
||||||
|
if (executable_path != 0)
|
||||||
|
{
|
||||||
|
executable_name = std::string(executable_path);
|
||||||
|
|
||||||
|
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
|
||||||
|
// Windows path delimiter
|
||||||
|
size_t delimiter_pos = executable_name.find_last_of('\\');
|
||||||
|
executable_name.erase(0, delimiter_pos + 1);
|
||||||
|
|
||||||
|
if (executable_name.rfind(".exe") != std::string::npos)
|
||||||
|
{
|
||||||
|
// we strip .exe, only if the .exe is found
|
||||||
|
executable_name.resize(executable_name.size() - 4);
|
||||||
|
}
|
||||||
|
|
||||||
|
#else
|
||||||
|
// Linux & OSX path delimiter
|
||||||
|
size_t delimiter_pos = executable_name.find_last_of('/');
|
||||||
|
executable_name.erase(0,delimiter_pos+1);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
// Loop over all search paths and return the first hit
|
||||||
|
for (unsigned int i = 0; i < sizeof(searchPath)/sizeof(char *); ++i)
|
||||||
|
{
|
||||||
|
std::string path(searchPath[i]);
|
||||||
|
size_t executable_name_pos = path.find("<executable_name>");
|
||||||
|
|
||||||
|
// If there is executable_name variable in the searchPath
|
||||||
|
// replace it with the value
|
||||||
|
if (executable_name_pos != std::string::npos)
|
||||||
|
{
|
||||||
|
if (executable_path != 0)
|
||||||
|
{
|
||||||
|
path.replace(executable_name_pos, strlen("<executable_name>"), executable_name);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// Skip this path entry if no executable argument is given
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifdef _DEBUG
|
||||||
|
printf("sdkFindFilePath <%s> in %s\n", filename, path.c_str());
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Test if the file exists
|
||||||
|
path.append(filename);
|
||||||
|
FILE *fp;
|
||||||
|
FOPEN(fp, path.c_str(), "rb");
|
||||||
|
|
||||||
|
if (fp != NULL)
|
||||||
|
{
|
||||||
|
fclose(fp);
|
||||||
|
// File found
|
||||||
|
// returning an allocated array here for backwards compatibility reasons
|
||||||
|
char *file_path = (char *) malloc(path.length() + 1);
|
||||||
|
STRCPY(file_path, path.length() + 1, path.c_str());
|
||||||
|
return file_path;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (fp)
|
||||||
|
{
|
||||||
|
fclose(fp);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// File not found
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
|
@ -0,0 +1,499 @@
|
||||||
|
/**
|
||||||
|
* Copyright 1993-2013 NVIDIA Corporation. All rights reserved.
|
||||||
|
*
|
||||||
|
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||||
|
* with this source code for terms and conditions that govern your use of
|
||||||
|
* this software. Any use, reproduction, disclosure, or distribution of
|
||||||
|
* this software and related documentation outside the terms of the EULA
|
||||||
|
* is strictly prohibited.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
// Helper Timing Functions
|
||||||
|
#ifndef HELPER_TIMER_H
|
||||||
|
#define HELPER_TIMER_H
|
||||||
|
|
||||||
|
#ifndef EXIT_WAIVED
|
||||||
|
#define EXIT_WAIVED 2
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// includes, system
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
// includes, project
|
||||||
|
#include <exception.h>
|
||||||
|
|
||||||
|
// Definition of the StopWatch Interface, this is used if we don't want to use the CUT functions
|
||||||
|
// But rather in a self contained class interface
|
||||||
|
class StopWatchInterface
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
StopWatchInterface() {};
|
||||||
|
virtual ~StopWatchInterface() {};
|
||||||
|
|
||||||
|
public:
|
||||||
|
//! Start time measurement
|
||||||
|
virtual void start() = 0;
|
||||||
|
|
||||||
|
//! Stop time measurement
|
||||||
|
virtual void stop() = 0;
|
||||||
|
|
||||||
|
//! Reset time counters to zero
|
||||||
|
virtual void reset() = 0;
|
||||||
|
|
||||||
|
//! Time in msec. after start. If the stop watch is still running (i.e. there
|
||||||
|
//! was no call to stop()) then the elapsed time is returned, otherwise the
|
||||||
|
//! time between the last start() and stop call is returned
|
||||||
|
virtual float getTime() = 0;
|
||||||
|
|
||||||
|
//! Mean time to date based on the number of times the stopwatch has been
|
||||||
|
//! _stopped_ (ie finished sessions) and the current total time
|
||||||
|
virtual float getAverageTime() = 0;
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
//////////////////////////////////////////////////////////////////
|
||||||
|
// Begin Stopwatch timer class definitions for all OS platforms //
|
||||||
|
//////////////////////////////////////////////////////////////////
|
||||||
|
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
|
||||||
|
// includes, system
|
||||||
|
#define WINDOWS_LEAN_AND_MEAN
|
||||||
|
#include <windows.h>
|
||||||
|
#undef min
|
||||||
|
#undef max
|
||||||
|
|
||||||
|
//! Windows specific implementation of StopWatch
|
||||||
|
class StopWatchWin : public StopWatchInterface
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
//! Constructor, default
|
||||||
|
StopWatchWin() :
|
||||||
|
start_time(), end_time(),
|
||||||
|
diff_time(0.0f), total_time(0.0f),
|
||||||
|
running(false), clock_sessions(0), freq(0), freq_set(false)
|
||||||
|
{
|
||||||
|
if (! freq_set)
|
||||||
|
{
|
||||||
|
// helper variable
|
||||||
|
LARGE_INTEGER temp;
|
||||||
|
|
||||||
|
// get the tick frequency from the OS
|
||||||
|
QueryPerformanceFrequency((LARGE_INTEGER *) &temp);
|
||||||
|
|
||||||
|
// convert to type in which it is needed
|
||||||
|
freq = ((double) temp.QuadPart) / 1000.0;
|
||||||
|
|
||||||
|
// rememeber query
|
||||||
|
freq_set = true;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// Destructor
|
||||||
|
~StopWatchWin() { };
|
||||||
|
|
||||||
|
public:
|
||||||
|
//! Start time measurement
|
||||||
|
inline void start();
|
||||||
|
|
||||||
|
//! Stop time measurement
|
||||||
|
inline void stop();
|
||||||
|
|
||||||
|
//! Reset time counters to zero
|
||||||
|
inline void reset();
|
||||||
|
|
||||||
|
//! Time in msec. after start. If the stop watch is still running (i.e. there
|
||||||
|
//! was no call to stop()) then the elapsed time is returned, otherwise the
|
||||||
|
//! time between the last start() and stop call is returned
|
||||||
|
inline float getTime();
|
||||||
|
|
||||||
|
//! Mean time to date based on the number of times the stopwatch has been
|
||||||
|
//! _stopped_ (ie finished sessions) and the current total time
|
||||||
|
inline float getAverageTime();
|
||||||
|
|
||||||
|
private:
|
||||||
|
// member variables
|
||||||
|
|
||||||
|
//! Start of measurement
|
||||||
|
LARGE_INTEGER start_time;
|
||||||
|
//! End of measurement
|
||||||
|
LARGE_INTEGER end_time;
|
||||||
|
|
||||||
|
//! Time difference between the last start and stop
|
||||||
|
float diff_time;
|
||||||
|
|
||||||
|
//! TOTAL time difference between starts and stops
|
||||||
|
float total_time;
|
||||||
|
|
||||||
|
//! flag if the stop watch is running
|
||||||
|
bool running;
|
||||||
|
|
||||||
|
//! Number of times clock has been started
|
||||||
|
//! and stopped to allow averaging
|
||||||
|
int clock_sessions;
|
||||||
|
|
||||||
|
//! tick frequency
|
||||||
|
double freq;
|
||||||
|
|
||||||
|
//! flag if the frequency has been set
|
||||||
|
bool freq_set;
|
||||||
|
};
|
||||||
|
|
||||||
|
// functions, inlined
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Start time measurement
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
inline void
|
||||||
|
StopWatchWin::start()
|
||||||
|
{
|
||||||
|
QueryPerformanceCounter((LARGE_INTEGER *) &start_time);
|
||||||
|
running = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Stop time measurement and increment add to the current diff_time summation
|
||||||
|
//! variable. Also increment the number of times this clock has been run.
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
inline void
|
||||||
|
StopWatchWin::stop()
|
||||||
|
{
|
||||||
|
QueryPerformanceCounter((LARGE_INTEGER *) &end_time);
|
||||||
|
diff_time = (float)
|
||||||
|
(((double) end_time.QuadPart - (double) start_time.QuadPart) / freq);
|
||||||
|
|
||||||
|
total_time += diff_time;
|
||||||
|
clock_sessions++;
|
||||||
|
running = false;
|
||||||
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Reset the timer to 0. Does not change the timer running state but does
|
||||||
|
//! recapture this point in time as the current start time if it is running.
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
inline void
|
||||||
|
StopWatchWin::reset()
|
||||||
|
{
|
||||||
|
diff_time = 0;
|
||||||
|
total_time = 0;
|
||||||
|
clock_sessions = 0;
|
||||||
|
|
||||||
|
if (running)
|
||||||
|
{
|
||||||
|
QueryPerformanceCounter((LARGE_INTEGER *) &start_time);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Time in msec. after start. If the stop watch is still running (i.e. there
|
||||||
|
//! was no call to stop()) then the elapsed time is returned added to the
|
||||||
|
//! current diff_time sum, otherwise the current summed time difference alone
|
||||||
|
//! is returned.
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
inline float
|
||||||
|
StopWatchWin::getTime()
|
||||||
|
{
|
||||||
|
// Return the TOTAL time to date
|
||||||
|
float retval = total_time;
|
||||||
|
|
||||||
|
if (running)
|
||||||
|
{
|
||||||
|
LARGE_INTEGER temp;
|
||||||
|
QueryPerformanceCounter((LARGE_INTEGER *) &temp);
|
||||||
|
retval += (float)
|
||||||
|
(((double)(temp.QuadPart - start_time.QuadPart)) / freq);
|
||||||
|
}
|
||||||
|
|
||||||
|
return retval;
|
||||||
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Time in msec. for a single run based on the total number of COMPLETED runs
|
||||||
|
//! and the total time.
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
inline float
|
||||||
|
StopWatchWin::getAverageTime()
|
||||||
|
{
|
||||||
|
return (clock_sessions > 0) ? (total_time/clock_sessions) : 0.0f;
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
// Declarations for Stopwatch on Linux and Mac OSX
|
||||||
|
// includes, system
|
||||||
|
#include <ctime>
|
||||||
|
#include <sys/time.h>
|
||||||
|
|
||||||
|
//! Windows specific implementation of StopWatch
|
||||||
|
class StopWatchLinux : public StopWatchInterface
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
//! Constructor, default
|
||||||
|
StopWatchLinux() :
|
||||||
|
start_time(), diff_time(0.0), total_time(0.0),
|
||||||
|
running(false), clock_sessions(0)
|
||||||
|
{ };
|
||||||
|
|
||||||
|
// Destructor
|
||||||
|
virtual ~StopWatchLinux()
|
||||||
|
{ };
|
||||||
|
|
||||||
|
public:
|
||||||
|
//! Start time measurement
|
||||||
|
inline void start();
|
||||||
|
|
||||||
|
//! Stop time measurement
|
||||||
|
inline void stop();
|
||||||
|
|
||||||
|
//! Reset time counters to zero
|
||||||
|
inline void reset();
|
||||||
|
|
||||||
|
//! Time in msec. after start. If the stop watch is still running (i.e. there
|
||||||
|
//! was no call to stop()) then the elapsed time is returned, otherwise the
|
||||||
|
//! time between the last start() and stop call is returned
|
||||||
|
inline float getTime();
|
||||||
|
|
||||||
|
//! Mean time to date based on the number of times the stopwatch has been
|
||||||
|
//! _stopped_ (ie finished sessions) and the current total time
|
||||||
|
inline float getAverageTime();
|
||||||
|
|
||||||
|
private:
|
||||||
|
|
||||||
|
// helper functions
|
||||||
|
|
||||||
|
//! Get difference between start time and current time
|
||||||
|
inline float getDiffTime();
|
||||||
|
|
||||||
|
private:
|
||||||
|
|
||||||
|
// member variables
|
||||||
|
|
||||||
|
//! Start of measurement
|
||||||
|
struct timeval start_time;
|
||||||
|
|
||||||
|
//! Time difference between the last start and stop
|
||||||
|
float diff_time;
|
||||||
|
|
||||||
|
//! TOTAL time difference between starts and stops
|
||||||
|
float total_time;
|
||||||
|
|
||||||
|
//! flag if the stop watch is running
|
||||||
|
bool running;
|
||||||
|
|
||||||
|
//! Number of times clock has been started
|
||||||
|
//! and stopped to allow averaging
|
||||||
|
int clock_sessions;
|
||||||
|
};
|
||||||
|
|
||||||
|
// functions, inlined
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Start time measurement
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
inline void
|
||||||
|
StopWatchLinux::start()
|
||||||
|
{
|
||||||
|
gettimeofday(&start_time, 0);
|
||||||
|
running = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Stop time measurement and increment add to the current diff_time summation
|
||||||
|
//! variable. Also increment the number of times this clock has been run.
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
inline void
|
||||||
|
StopWatchLinux::stop()
|
||||||
|
{
|
||||||
|
diff_time = getDiffTime();
|
||||||
|
total_time += diff_time;
|
||||||
|
running = false;
|
||||||
|
clock_sessions++;
|
||||||
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Reset the timer to 0. Does not change the timer running state but does
|
||||||
|
//! recapture this point in time as the current start time if it is running.
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
inline void
|
||||||
|
StopWatchLinux::reset()
|
||||||
|
{
|
||||||
|
diff_time = 0;
|
||||||
|
total_time = 0;
|
||||||
|
clock_sessions = 0;
|
||||||
|
|
||||||
|
if (running)
|
||||||
|
{
|
||||||
|
gettimeofday(&start_time, 0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Time in msec. after start. If the stop watch is still running (i.e. there
|
||||||
|
//! was no call to stop()) then the elapsed time is returned added to the
|
||||||
|
//! current diff_time sum, otherwise the current summed time difference alone
|
||||||
|
//! is returned.
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
inline float
|
||||||
|
StopWatchLinux::getTime()
|
||||||
|
{
|
||||||
|
// Return the TOTAL time to date
|
||||||
|
float retval = total_time;
|
||||||
|
|
||||||
|
if (running)
|
||||||
|
{
|
||||||
|
retval += getDiffTime();
|
||||||
|
}
|
||||||
|
|
||||||
|
return retval;
|
||||||
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Time in msec. for a single run based on the total number of COMPLETED runs
|
||||||
|
//! and the total time.
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
inline float
|
||||||
|
StopWatchLinux::getAverageTime()
|
||||||
|
{
|
||||||
|
return (clock_sessions > 0) ? (total_time/clock_sessions) : 0.0f;
|
||||||
|
}
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
inline float
|
||||||
|
StopWatchLinux::getDiffTime()
|
||||||
|
{
|
||||||
|
struct timeval t_time;
|
||||||
|
gettimeofday(&t_time, 0);
|
||||||
|
|
||||||
|
// time difference in milli-seconds
|
||||||
|
return (float)(1000.0 * (t_time.tv_sec - start_time.tv_sec)
|
||||||
|
+ (0.001 * (t_time.tv_usec - start_time.tv_usec)));
|
||||||
|
}
|
||||||
|
#endif // WIN32
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Timer functionality exported
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Create a new timer
|
||||||
|
//! @return true if a time has been created, otherwise false
|
||||||
|
//! @param name of the new timer, 0 if the creation failed
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
inline bool
|
||||||
|
sdkCreateTimer(StopWatchInterface **timer_interface)
|
||||||
|
{
|
||||||
|
//printf("sdkCreateTimer called object %08x\n", (void *)*timer_interface);
|
||||||
|
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
|
||||||
|
*timer_interface = (StopWatchInterface *)new StopWatchWin();
|
||||||
|
#else
|
||||||
|
*timer_interface = (StopWatchInterface *)new StopWatchLinux();
|
||||||
|
#endif
|
||||||
|
return (*timer_interface != NULL) ? true : false;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Delete a timer
|
||||||
|
//! @return true if a time has been deleted, otherwise false
|
||||||
|
//! @param name of the timer to delete
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
inline bool
|
||||||
|
sdkDeleteTimer(StopWatchInterface **timer_interface)
|
||||||
|
{
|
||||||
|
//printf("sdkDeleteTimer called object %08x\n", (void *)*timer_interface);
|
||||||
|
if (*timer_interface)
|
||||||
|
{
|
||||||
|
delete *timer_interface;
|
||||||
|
*timer_interface = NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Start the time with name \a name
|
||||||
|
//! @param name name of the timer to start
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
inline bool
|
||||||
|
sdkStartTimer(StopWatchInterface **timer_interface)
|
||||||
|
{
|
||||||
|
//printf("sdkStartTimer called object %08x\n", (void *)*timer_interface);
|
||||||
|
if (*timer_interface)
|
||||||
|
{
|
||||||
|
(*timer_interface)->start();
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Stop the time with name \a name. Does not reset.
|
||||||
|
//! @param name name of the timer to stop
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
inline bool
|
||||||
|
sdkStopTimer(StopWatchInterface **timer_interface)
|
||||||
|
{
|
||||||
|
// printf("sdkStopTimer called object %08x\n", (void *)*timer_interface);
|
||||||
|
if (*timer_interface)
|
||||||
|
{
|
||||||
|
(*timer_interface)->stop();
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Resets the timer's counter.
|
||||||
|
//! @param name name of the timer to reset.
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
inline bool
|
||||||
|
sdkResetTimer(StopWatchInterface **timer_interface)
|
||||||
|
{
|
||||||
|
// printf("sdkResetTimer called object %08x\n", (void *)*timer_interface);
|
||||||
|
if (*timer_interface)
|
||||||
|
{
|
||||||
|
(*timer_interface)->reset();
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Return the average time for timer execution as the total time
|
||||||
|
//! for the timer dividied by the number of completed (stopped) runs the timer
|
||||||
|
//! has made.
|
||||||
|
//! Excludes the current running time if the timer is currently running.
|
||||||
|
//! @param name name of the timer to return the time of
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
inline float
|
||||||
|
sdkGetAverageTimerValue(StopWatchInterface **timer_interface)
|
||||||
|
{
|
||||||
|
// printf("sdkGetAverageTimerValue called object %08x\n", (void *)*timer_interface);
|
||||||
|
if (*timer_interface)
|
||||||
|
{
|
||||||
|
return (*timer_interface)->getAverageTime();
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
return 0.0f;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
//! Total execution time for the timer over all runs since the last reset
|
||||||
|
//! or timer creation.
|
||||||
|
//! @param name name of the timer to obtain the value of.
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
inline float
|
||||||
|
sdkGetTimerValue(StopWatchInterface **timer_interface)
|
||||||
|
{
|
||||||
|
// printf("sdkGetTimerValue called object %08x\n", (void *)*timer_interface);
|
||||||
|
if (*timer_interface)
|
||||||
|
{
|
||||||
|
return (*timer_interface)->getTime();
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
return 0.0f;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif // HELPER_TIMER_H
|
Loading…
Reference in New Issue