Merge branch 'next' of https://github.com/Arribas/gnss-sdr into next

Working with GPUs
2025-10-18 17:17:42 +00:00 · 2015-08-25 15:54:02 +02:00
parent f65c87c1c4 7b57bd28f8
commit e38cb40d4f
30 changed files with 8089 additions and 25 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -955,6 +955,20 @@ else(ENABLE_OSMOSDR)
    message(STATUS "Enable it with 'cmake -DENABLE_OSMOSDR=ON ../' to add support for OsmoSDR and other front-ends (HackRF, bladeRF, Realtek's RTL2832U-based USB dongles, etc.)" )
 endif(ENABLE_OSMOSDR)

+if($ENV{CUDA_GPU_ACCEL})
+    message(STATUS "CUDA_GPU_ACCEL environment variable found." )
+    set(ENABLE_CUDA ON)
+endif($ENV{CUDA_GPU_ACCEL})
+
+if(ENABLE_CUDA)
+    message(STATUS "NVIDIA CUDA GPU Acceleration will be enabled." )
+    message(STATUS "You can disable it with 'cmake -DENABLE_CUDA=OFF ../'" )
+else(ENABLE_CUDA)
+    message(STATUS "NVIDIA CUDA GPU Acceleration will is not enabled." )
+    message(STATUS "Enable it with 'cmake -DENABLE_CUDA=ON ../' to add support for the Teleorbit Flexiband front-end." )
+endif(ENABLE_CUDA)
+
+
 if($ENV{FLEXIBAND_DRIVER})
    message(STATUS "FLEXIBAND_DRIVER environment variable found." )
    set(ENABLE_FLEXIBAND ON)
--- a/conf/gnss-sdr_GPS_L1_gr_complex_gpu.conf
+++ b/conf/gnss-sdr_GPS_L1_gr_complex_gpu.conf
@@ -0,0 +1,305 @@
+; Default configuration file
+; You can define your own receiver and invoke it by doing
+; gnss-sdr --config_file=my_GNSS_SDR_configuration.conf
+;
+
+[GNSS-SDR]
+
+;######### GLOBAL OPTIONS ##################
+;internal_fs_hz: Internal signal sampling frequency after the signal conditioning stage [Hz].
+GNSS-SDR.internal_fs_hz=4000000
+
+;######### CONTROL_THREAD CONFIG ############
+ControlThread.wait_for_flowgraph=false
+
+;######### SIGNAL_SOURCE CONFIG ############
+;#implementation: Use [File_Signal_Source] or [UHD_Signal_Source] or [GN3S_Signal_Source] (experimental)
+SignalSource.implementation=File_Signal_Source
+
+;#filename: path to file with the captured GNSS signal samples to be processed
+SignalSource.filename=/home/javier/signals/4msps.dat
+
+;#item_type: Type and resolution for each of the signal samples. Use only gr_complex in this version.
+SignalSource.item_type=gr_complex
+
+;#sampling_frequency: Original Signal sampling frequency in [Hz] 
+SignalSource.sampling_frequency=4000000
+
+;#freq: RF front-end center frequency in [Hz] 
+SignalSource.freq=1575420000
+
+;#gain: Front-end Gain in [dB] 
+SignalSource.gain=60
+
+;#subdevice: UHD subdevice specification (for USRP1 use A:0 or B:0)
+SignalSource.subdevice=B:0
+
+;#samples: Number of samples to be processed. Notice that 0 indicates the entire file.
+SignalSource.samples=0
+
+;#repeat: Repeat the processing file. Disable this option in this version
+SignalSource.repeat=false
+
+;#dump: Dump the Signal source data to a file. Disable this option in this version
+SignalSource.dump=false
+
+SignalSource.dump_filename=../data/signal_source.dat
+
+
+;#enable_throttle_control: Enabling this option tells the signal source to keep the delay between samples in post processing.
+; it helps to not overload the CPU, but the processing time will be longer. 
+SignalSource.enable_throttle_control=false
+
+
+;######### SIGNAL_CONDITIONER CONFIG ############
+;## It holds blocks to change data type, filter and resample input data. 
+
+;#implementation: Use [Pass_Through] or [Signal_Conditioner]
+;#[Pass_Through] disables this block and the [DataTypeAdapter], [InputFilter] and [Resampler] blocks
+;#[Signal_Conditioner] enables this block. Then you have to configure [DataTypeAdapter], [InputFilter] and [Resampler] blocks
+;SignalConditioner.implementation=Signal_Conditioner
+SignalConditioner.implementation=Pass_Through
+
+;######### DATA_TYPE_ADAPTER CONFIG ############
+;## Changes the type of input data. Please disable it in this version.
+;#implementation: [Pass_Through] disables this block
+DataTypeAdapter.implementation=Pass_Through
+
+;######### INPUT_FILTER CONFIG ############
+;## Filter the input data. Can be combined with frequency translation for IF signals
+
+;#implementation: Use [Pass_Through] or [Fir_Filter] or [Freq_Xlating_Fir_Filter]
+;#[Pass_Through] disables this block
+;#[Fir_Filter] enables a FIR Filter
+;#[Freq_Xlating_Fir_Filter] enables FIR filter and a composite frequency translation that shifts IF down to zero Hz.
+
+;InputFilter.implementation=Fir_Filter
+;InputFilter.implementation=Freq_Xlating_Fir_Filter
+InputFilter.implementation=Pass_Through
+
+;#dump: Dump the filtered data to a file.
+InputFilter.dump=false
+
+;#dump_filename: Log path and filename.
+InputFilter.dump_filename=../data/input_filter.dat
+
+;#The following options are used in the filter design of Fir_Filter and Freq_Xlating_Fir_Filter implementation. 
+;#These options are based on parameters of gnuradio's function: gr_remez.
+;#These function calculates the optimal (in the Chebyshev/minimax sense) FIR filter inpulse reponse given a set of band edges, the desired reponse on those bands, and the weight given to the error in those bands.
+
+;#input_item_type: Type and resolution for input signal samples. Use only gr_complex in this version.
+InputFilter.input_item_type=gr_complex
+
+;#outut_item_type: Type and resolution for output filtered signal samples. Use only gr_complex in this version.
+InputFilter.output_item_type=gr_complex
+
+;#taps_item_type: Type and resolution for the taps of the filter. Use only float in this version.
+InputFilter.taps_item_type=float
+
+;#number_of_taps: Number of taps in the filter. Increasing this parameter increases the processing time
+InputFilter.number_of_taps=5
+
+;#number_of _bands: Number of frequency bands in the filter.
+InputFilter.number_of_bands=2
+
+;#bands: frequency at the band edges [ b1 e1 b2 e2 b3 e3 ...].
+;#Frequency is in the range [0, 1], with 1 being the Nyquist frequency (Fs/2)
+;#The number of band_begin and band_end elements must match the number of bands
+
+InputFilter.band1_begin=0.0
+InputFilter.band1_end=0.45
+InputFilter.band2_begin=0.55
+InputFilter.band2_end=1.0
+
+;#ampl: desired amplitude at the band edges [ a(b1) a(e1) a(b2) a(e2) ...].
+;#The number of ampl_begin and ampl_end elements must match the number of bands
+
+InputFilter.ampl1_begin=1.0
+InputFilter.ampl1_end=1.0
+InputFilter.ampl2_begin=0.0
+InputFilter.ampl2_end=0.0
+
+;#band_error: weighting applied to each band (usually 1).
+;#The number of band_error elements must match the number of bands
+InputFilter.band1_error=1.0
+InputFilter.band2_error=1.0
+
+;#filter_type: one of "bandpass", "hilbert" or "differentiator" 
+InputFilter.filter_type=bandpass
+
+;#grid_density: determines how accurately the filter will be constructed.
+;The minimum value is 16; higher values are slower to compute the filter.
+InputFilter.grid_density=16
+
+;#The following options are used only in Freq_Xlating_Fir_Filter implementation.
+;#InputFilter.IF is the intermediate frequency (in Hz) shifted down to zero Hz
+
+InputFilter.sampling_frequency=4000000
+InputFilter.IF=0
+
+
+
+;######### RESAMPLER CONFIG ############
+;## Resamples the input data. 
+
+;#implementation: Use [Pass_Through] or [Direct_Resampler]
+;#[Pass_Through] disables this block
+;#[Direct_Resampler] enables a resampler that implements a nearest neigbourhood interpolation
+;Resampler.implementation=Direct_Resampler
+Resampler.implementation=Pass_Through
+
+;#dump: Dump the resamplered data to a file.
+Resampler.dump=false
+;#dump_filename: Log path and filename.
+Resampler.dump_filename=../data/resampler.dat
+
+;#item_type: Type and resolution for each of the signal samples. Use only gr_complex in this version.
+Resampler.item_type=gr_complex
+
+;#sample_freq_in: the sample frequency of the input signal
+Resampler.sample_freq_in=8000000
+
+;#sample_freq_out: the desired sample frequency of the output signal
+Resampler.sample_freq_out=4000000
+
+
+;######### CHANNELS GLOBAL CONFIG ############
+;#count: Number of available GPS satellite channels.
+Channels_GPS.count=1
+;#count: Number of available Galileo satellite channels.
+Channels_Galileo.count=0
+;#in_acquisition: Number of channels simultaneously acquiring for the whole receiver
+Channels.in_acquisition=1
+;#system: GPS, GLONASS, GALILEO, SBAS or COMPASS
+;#if the option is disabled by default is assigned GPS
+Channel.system=GPS
+
+;#if the option is disabled by default is assigned "1C" GPS L1 C/A
+Channel.signal=1C
+
+
+;######### SPECIFIC CHANNELS CONFIG ######
+;#The following options are specific to each channel and overwrite the generic options
+
+;######### CHANNEL 0 CONFIG ############
+
+;Channel0.system=GPS
+;Channel0.signal=1C
+
+;#satellite: Satellite PRN ID for this channel. Disable this option to random search
+;Channel0.satellite=11
+
+;######### CHANNEL 1 CONFIG ############
+
+;Channel1.system=GPS
+;Channel1.signal=1C
+;Channel1.satellite=18
+
+;######### ACQUISITION GLOBAL CONFIG ############
+
+;#dump: Enable or disable the acquisition internal data file logging [true] or [false] 
+Acquisition_GPS.dump=false
+;#filename: Log path and filename
+Acquisition_GPS.dump_filename=./acq_dump.dat
+;#item_type: Type and resolution for each of the signal samples. Use only gr_complex in this version.
+Acquisition_GPS.item_type=gr_complex
+;#if: Signal intermediate frequency in [Hz] 
+Acquisition_GPS.if=0
+;#sampled_ms: Signal block duration for the acquisition signal detection [ms]
+Acquisition_GPS.sampled_ms=1
+;#implementation: Acquisition algorithm selection for this channel: [GPS_L1_CA_PCPS_Acquisition] or [Galileo_E1_PCPS_Ambiguous_Acquisition]
+Acquisition_GPS.implementation=GPS_L1_CA_PCPS_Acquisition
+;#threshold: Acquisition threshold
+Acquisition_GPS.threshold=0.005
+;#pfa: Acquisition false alarm probability. This option overrides the threshold option. Only use with implementations: [GPS_L1_CA_PCPS_Acquisition] or [Galileo_E1_PCPS_Ambiguous_Acquisition] 
+;Acquisition_GPS.pfa=0.01
+;#doppler_max: Maximum expected Doppler shift [Hz]
+Acquisition_GPS.doppler_max=10000
+;#doppler_max: Doppler step in the grid search [Hz]
+Acquisition_GPS.doppler_step=500
+
+;######### TRACKING GLOBAL CONFIG ############
+
+;#implementation: Selected tracking algorithm: [GPS_L1_CA_DLL_PLL_Tracking] or [GPS_L1_CA_DLL_FLL_PLL_Tracking] or [GPS_L1_CA_TCP_CONNECTOR_Tracking] or [Galileo_E1_DLL_PLL_VEML_Tracking]
+Tracking_GPS.implementation=GPS_L1_CA_DLL_PLL_Tracking_GPU
+;#item_type: Type and resolution for each of the signal samples. Use only [gr_complex] in this version.
+Tracking_GPS.item_type=gr_complex
+
+;#sampling_frequency: Signal Intermediate Frequency in [Hz] 
+Tracking_GPS.if=0
+
+;#dump: Enable or disable the Tracking internal binary data file logging [true] or [false] 
+Tracking_GPS.dump=true
+
+;#dump_filename: Log path and filename. Notice that the tracking channel will add "x.dat" where x is the channel number.
+Tracking_GPS.dump_filename=../data/epl_tracking_ch_
+
+;#pll_bw_hz: PLL loop filter bandwidth [Hz]
+Tracking_GPS.pll_bw_hz=55.0;
+
+;#dll_bw_hz: DLL loop filter bandwidth [Hz]
+Tracking_GPS.dll_bw_hz=1.5
+
+;#fll_bw_hz: FLL loop filter bandwidth [Hz]
+Tracking_GPS.fll_bw_hz=10.0;
+
+;#order: PLL/DLL loop filter order [2] or [3]
+Tracking_GPS.order=3;
+
+;######### TELEMETRY DECODER GPS CONFIG ############
+;#implementation: Use [GPS_L1_CA_Telemetry_Decoder] for GPS L1 C/A
+TelemetryDecoder_GPS.implementation=GPS_L1_CA_Telemetry_Decoder
+TelemetryDecoder_GPS.dump=false
+;#decimation factor
+TelemetryDecoder_GPS.decimation_factor=1;
+
+;######### OBSERVABLES CONFIG ############
+;#implementation: Use [GPS_L1_CA_Observables] for GPS L1 C/A.
+Observables.implementation=GPS_L1_CA_Observables
+
+;#dump: Enable or disable the Observables internal binary data file logging [true] or [false]
+Observables.dump=false
+
+;#dump_filename: Log path and filename.
+Observables.dump_filename=./observables.dat
+
+
+;######### PVT CONFIG ############
+;#implementation: Position Velocity and Time (PVT) implementation algorithm: Use [GPS_L1_CA_PVT] in this version.
+PVT.implementation=GPS_L1_CA_PVT
+
+;#averaging_depth: Number of PVT observations in the moving average algorithm
+PVT.averaging_depth=100
+
+;#flag_average: Enables the PVT averaging between output intervals (arithmetic mean) [true] or [false]
+PVT.flag_averaging=false
+
+;#output_rate_ms: Period between two PVT outputs. Notice that the minimum period is equal to the tracking integration time (for GPS CA L1 is 1ms) [ms]
+PVT.output_rate_ms=10
+
+;#display_rate_ms: Position console print (std::out) interval [ms]. Notice that output_rate_ms<=display_rate_ms.
+PVT.display_rate_ms=500
+
+;# RINEX, KML, and NMEA output configuration
+
+;#dump_filename: Log path and filename without extension. Notice that PVT will add ".dat" to the binary dump and ".kml" to GoogleEarth dump.
+PVT.dump_filename=./PVT
+
+;#nmea_dump_filename: NMEA log path and filename
+PVT.nmea_dump_filename=./gnss_sdr_pvt.nmea;
+
+;#flag_nmea_tty_port: Enable or disable the NMEA log to a serial TTY port (Can be used with real hardware or virtual one)
+PVT.flag_nmea_tty_port=false;
+
+;#nmea_dump_devname: serial device descriptor for NMEA logging
+PVT.nmea_dump_devname=/dev/pts/4
+
+
+;#dump: Enable or disable the PVT internal binary data file logging [true] or [false]
+PVT.dump=false
+
+;######### OUTPUT_FILTER CONFIG ############
+;# Receiver output filter: Leave this block disabled in this version
+OutputFilter.implementation=Null_Sink_Output_Filter
+OutputFilter.filename=data/gnss-sdr.dat
+OutputFilter.item_type=gr_complex
--- a/conf/gnss-sdr_multichannel_GPS_L1_Flexiband_bin_file_III_1a.conf
+++ b/conf/gnss-sdr_multichannel_GPS_L1_Flexiband_bin_file_III_1a.conf
@@ -29,13 +29,13 @@ GNSS-SDR.SUPL_CI=0x31b0
 SignalSource.implementation=Flexiband_Signal_Source

 SignalSource.flag_read_file=true
-SignalSource.signal_file=/datalogger/captures/eclipse/eclipse_IIIa_2.bin
+SignalSource.signal_file=/datalogger/L125_III1b_210s.usb

 ;#item_type: Type and resolution for each of the signal samples. Use only gr_complex in this version.
 SignalSource.item_type=gr_complex

 ;# FPGA firmware file
-SignalSource.firmware_file=flexiband_III-1a.bit
+SignalSource.firmware_file=flexiband_III-1b.bit

 ;#RF_channels: Number of RF channels present in the frontend device, must agree the FPGA firmware file
 SignalSource.RF_channels=1
--- a/conf/gnss-sdr_multichannel_GPS_L1_L2_Galileo_E1B_Flexiband_realtime_III_1b.conf
+++ b/conf/gnss-sdr_multichannel_GPS_L1_L2_Galileo_E1B_Flexiband_realtime_III_1b.conf
@@ -28,9 +28,9 @@ GNSS-SDR.SUPL_CI=0x31b0
 ;#implementation: Use [File_Signal_Source] or [UHD_Signal_Source] or [GN3S_Signal_Source] (experimental)
 SignalSource.implementation=Flexiband_Signal_Source

-SignalSource.flag_read_file=false
-#SignalSource.signal_file=/datalogger/signals/Fraunhofer/L125_III1b_210s.usb
-SignalSource.signal_file=/datalogger/captures/flexiband_III_1b_cap1.usb
+SignalSource.flag_read_file=true
+SignalSource.signal_file=/datalogger/L125_III1b_210s.usb
+#SignalSource.signal_file=/datalogger/captures/flexiband_III_1b_cap1.usb

 ;#item_type: Type and resolution for each of the signal samples. Use only gr_complex in this version.
 SignalSource.item_type=gr_complex
@@ -136,8 +136,8 @@ InputFilter0.grid_density=16
 InputFilter0.sampling_frequency=20000000
 ;# IF deviation due to front-end LO inaccuracies [HZ]
 ;# WARNING: Fraunhofer front-end hardwareconfigurations can difer. Signals available on http://www.iis.fraunhofer.de/de/ff/lok/leist/test/flexiband.html are centered on 0 Hz, ALL BANDS.
-InputFilter0.IF=-205000
-;#InputFilter0.IF=0
+;#InputFilter0.IF=-205000
+InputFilter0.IF=0

 ;# Decimation factor after the frequency tranaslating block
 InputFilter0.decimation_factor=8
@@ -230,8 +230,8 @@ InputFilter1.grid_density=16
 InputFilter1.sampling_frequency=20000000
 ;# IF deviation due to front-end LO inaccuracies [HZ]
 ;# WARNING: Fraunhofer front-end hardwareconfigurations can difer. Signals available on http://www.iis.fraunhofer.de/de/ff/lok/leist/test/flexiband.html are centered on 0 Hz, ALL BANDS.
-InputFilter1.IF=100000
-;#InputFilter1.IF=0
+;#InputFilter1.IF=100000
+InputFilter1.IF=0

 ;# Decimation factor after the frequency tranaslating block
 InputFilter1.decimation_factor=8
@@ -272,7 +272,7 @@ Resampler2.implementation=Pass_Through
 ;#count: Number of available GPS satellite channels.
 Channels_1C.count=8
 Channels_1B.count=1
-Channels_2S.count=8
+Channels_2S.count=1
 ;#count: Number of available Galileo satellite channels.
 ;Channels_Galileo.count=0
 ;#in_acquisition: Number of channels simultaneously acquiring for the whole receiver
@@ -378,13 +378,13 @@ Acquisition_1C.max_dwells=1

 ;#implementation: Selected tracking algorithm: [GPS_L1_CA_DLL_PLL_Tracking] or [GPS_L1_CA_DLL_FLL_PLL_Tracking]

-Tracking_1C.implementation=GPS_L1_CA_DLL_PLL_Tracking
+Tracking_1C.implementation=GPS_L1_CA_DLL_PLL_Tracking_GPU
 Tracking_1C.item_type=gr_complex
 Tracking_1C.if=0
-Tracking_1C.dump=true
-Tracking_1C.dump_filename=./tracking_ch_
+Tracking_1C.dump=false
+Tracking_1C.dump_filename=../data/epl_tracking_ch_
 Tracking_1C.pll_bw_hz=40.0;
-Tracking_1C.dll_bw_hz=3.0;
+Tracking_1C.dll_bw_hz=1.5;
 Tracking_1C.fll_bw_hz=10.0;
 Tracking_1C.order=3;
 Tracking_1C.early_late_space_chips=0.5;
@@ -405,7 +405,7 @@ Acquisition_2S.max_dwells=1
 Tracking_2S.implementation=GPS_L2_M_DLL_PLL_Tracking
 Tracking_2S.item_type=gr_complex
 Tracking_2S.if=0
-Tracking_2S.dump=true
+Tracking_2S.dump=false
 Tracking_2S.dump_filename=./tracking_ch_
 Tracking_2S.pll_bw_hz=1.5;
 Tracking_2S.dll_bw_hz=0.3;
@@ -447,7 +447,7 @@ Tracking_1B.item_type=gr_complex
 Tracking_1B.if=0

 ;#dump: Enable or disable the Tracking internal binary data file logging [true] or [false] 
-Tracking_1B.dump=true
+Tracking_1B.dump=false

 ;#dump_filename: Log path and filename. Notice that the tracking channel will add "x.dat" where x is the channel number.
 Tracking_1B.dump_filename=./veml_tracking_ch_
@@ -497,7 +497,7 @@ TelemetryDecoder_1B.decimation_factor=5;
 Observables.implementation=Mixed_Observables

 ;#dump: Enable or disable the Observables internal binary data file logging [true] or [false] 
-Observables.dump=true
+Observables.dump=false

 ;#dump_filename: Log path and filename.
 Observables.dump_filename=./observables.dat
--- a/conf/gnss-sdr_multichannel_GPS_L2_M_Flexiband_bin_file_III_1a.conf
+++ b/conf/gnss-sdr_multichannel_GPS_L2_M_Flexiband_bin_file_III_1a.conf
@@ -135,7 +135,8 @@ InputFilter0.grid_density=16
 ; i.e. using front-end-cal as reported here:http://www.cttc.es/publication/turning-a-television-into-a-gnss-receiver/
 InputFilter0.sampling_frequency=20000000
 ;# IF deviation due to front-end LO inaccuracies [HZ]
-InputFilter0.IF=-205000
+;#InputFilter0.IF=-205000
+InputFilter0.IF=0

 ;# Decimation factor after the frequency tranaslating block
 InputFilter0.decimation_factor=4
--- a/src/algorithms/signal_source/adapters/CMakeLists.txt
+++ b/src/algorithms/signal_source/adapters/CMakeLists.txt
@@ -58,7 +58,7 @@ if(ENABLE_FLEXIBAND)
     if(OS_IS_MACOSX)
          set(MACOSX_ARGS "-DCMAKE_CXX_COMPILER=/usr/bin/clang++")
     endif(OS_IS_MACOSX)
-    find_package(teleorbit REQUIRED)
+    find_package(Teleorbit REQUIRED)
    if(NOT TELEORBIT_FOUND)
        message(FATAL_ERROR "Teleorbit Flexiband GNURadio driver required to build gnss-sdr with the optional FLEXIBAND adapter")
    endif(NOT TELEORBIT_FOUND)
--- a/src/algorithms/tracking/adapters/CMakeLists.txt
+++ b/src/algorithms/tracking/adapters/CMakeLists.txt
@@ -16,6 +16,10 @@
 # along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
 #

+if(ENABLE_CUDA)
+	FIND_PACKAGE(CUDA REQUIRED) 
+	set(OPT_TRACKING_ADAPTERS ${OPT_TRACKING_ADAPTERS} gps_l1_ca_dll_pll_tracking_gpu.cc)
+endif(ENABLE_CUDA)

 set(TRACKING_ADAPTER_SOURCES 
     galileo_e1_dll_pll_veml_tracking.cc
@@ -27,6 +31,7 @@ set(TRACKING_ADAPTER_SOURCES
     gps_l1_ca_tcp_connector_tracking.cc
     galileo_e5a_dll_pll_tracking.cc
     gps_l2_m_dll_pll_tracking.cc
+     ${OPT_TRACKING_ADAPTERS}
 )

 include_directories(
@@ -40,6 +45,7 @@ include_directories(
     ${GLOG_INCLUDE_DIRS}
     ${GFlags_INCLUDE_DIRS}
     ${GNURADIO_RUNTIME_INCLUDE_DIRS}
+     ${CUDA_INCLUDE_DIRS}
 )

 file(GLOB TRACKING_ADAPTER_HEADERS "*.h")
--- a/src/algorithms/tracking/adapters/gps_l1_ca_dll_pll_tracking_gpu.cc
+++ b/src/algorithms/tracking/adapters/gps_l1_ca_dll_pll_tracking_gpu.cc
@@ -0,0 +1,158 @@
+/*!
+ * \file gps_l1_ca_dll_pll_tracking_gpu.cc
+ * \brief Implementation of an adapter of a DLL+PLL tracking loop block using GPU accelerated functions
+ * for GPS L1 C/A to a TrackingInterface
+ * \author Javier Arribas, 2015. jarribas(at)cttc.es
+ *
+ * Code DLL + carrier PLL according to the algorithms described in:
+ * K.Borre, D.M.Akos, N.Bertelsen, P.Rinder, and S.H.Jensen,
+ * A Software-Defined GPS and Galileo Receiver. A Single-Frequency
+ * Approach, Birkhauser, 2007
+ *
+ * -------------------------------------------------------------------------
+ *
+ * Copyright (C) 2010-2015  (see AUTHORS file for a list of contributors)
+ *
+ * GNSS-SDR is a software defined Global Navigation
+ *          Satellite Systems receiver
+ *
+ * This file is part of GNSS-SDR.
+ *
+ * GNSS-SDR is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * GNSS-SDR is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * -------------------------------------------------------------------------
+ */
+
+
+#include "gps_l1_ca_dll_pll_tracking_gpu.h"
+#include <glog/logging.h>
+#include "GPS_L1_CA.h"
+#include "configuration_interface.h"
+
+
+using google::LogMessage;
+
+GpsL1CaDllPllTrackingGPU::GpsL1CaDllPllTrackingGPU(
+        ConfigurationInterface* configuration, std::string role,
+        unsigned int in_streams, unsigned int out_streams,
+        boost::shared_ptr<gr::msg_queue> queue) :
+                role_(role), in_streams_(in_streams), out_streams_(out_streams),
+                queue_(queue)
+{
+    DLOG(INFO) << "role " << role;
+    //################# CONFIGURATION PARAMETERS ########################
+    int fs_in;
+    int vector_length;
+    int f_if;
+    bool dump;
+    std::string dump_filename;
+    std::string item_type;
+    std::string default_item_type = "gr_complex";
+    float pll_bw_hz;
+    float dll_bw_hz;
+    float early_late_space_chips;
+    item_type = configuration->property(role + ".item_type", default_item_type);
+    //vector_length = configuration->property(role + ".vector_length", 2048);
+    fs_in = configuration->property("GNSS-SDR.internal_fs_hz", 2048000);
+    f_if = configuration->property(role + ".if", 0);
+    dump = configuration->property(role + ".dump", false);
+    pll_bw_hz = configuration->property(role + ".pll_bw_hz", 50.0);
+    dll_bw_hz = configuration->property(role + ".dll_bw_hz", 2.0);
+    early_late_space_chips = configuration->property(role + ".early_late_space_chips", 0.5);
+    std::string default_dump_filename = "./track_ch";
+    dump_filename = configuration->property(role + ".dump_filename",
+            default_dump_filename); //unused!
+    vector_length = std::round(fs_in / (GPS_L1_CA_CODE_RATE_HZ / GPS_L1_CA_CODE_LENGTH_CHIPS));
+
+    //################# MAKE TRACKING GNURadio object ###################
+    if (item_type.compare("gr_complex") == 0)
+        {
+            item_size_ = sizeof(gr_complex);
+            tracking_ = gps_l1_ca_dll_pll_make_tracking_gpu_cc(
+                    f_if,
+                    fs_in,
+                    vector_length,
+                    queue_,
+                    dump,
+                    dump_filename,
+                    pll_bw_hz,
+                    dll_bw_hz,
+                    early_late_space_chips);
+        }
+    else
+        {
+            item_size_ = sizeof(gr_complex);
+            LOG(WARNING) << item_type << " unknown tracking item type.";
+        }
+    channel_ = 0;
+    channel_internal_queue_ = 0;
+    DLOG(INFO) << "tracking(" << tracking_->unique_id() << ")";
+}
+
+
+GpsL1CaDllPllTrackingGPU::~GpsL1CaDllPllTrackingGPU()
+{}
+
+
+void GpsL1CaDllPllTrackingGPU::start_tracking()
+{
+    tracking_->start_tracking();
+}
+
+/*
+ * Set tracking channel unique ID
+ */
+void GpsL1CaDllPllTrackingGPU::set_channel(unsigned int channel)
+{
+    channel_ = channel;
+    tracking_->set_channel(channel);
+}
+
+/*
+ * Set tracking channel internal queue
+ */
+void GpsL1CaDllPllTrackingGPU::set_channel_queue(
+        concurrent_queue<int> *channel_internal_queue)
+{
+    channel_internal_queue_ = channel_internal_queue;
+    tracking_->set_channel_queue(channel_internal_queue_);
+}
+
+void GpsL1CaDllPllTrackingGPU::set_gnss_synchro(Gnss_Synchro* p_gnss_synchro)
+{
+    tracking_->set_gnss_synchro(p_gnss_synchro);
+}
+
+void GpsL1CaDllPllTrackingGPU::connect(gr::top_block_sptr top_block)
+{
+	if(top_block) { /* top_block is not null */};
+	//nothing to connect, now the tracking uses gr_sync_decimator
+}
+
+void GpsL1CaDllPllTrackingGPU::disconnect(gr::top_block_sptr top_block)
+{
+	if(top_block) { /* top_block is not null */};
+	//nothing to disconnect, now the tracking uses gr_sync_decimator
+}
+
+gr::basic_block_sptr GpsL1CaDllPllTrackingGPU::get_left_block()
+{
+    return tracking_;
+}
+
+gr::basic_block_sptr GpsL1CaDllPllTrackingGPU::get_right_block()
+{
+    return tracking_;
+}
+
--- a/src/algorithms/tracking/adapters/gps_l1_ca_dll_pll_tracking_gpu.h
+++ b/src/algorithms/tracking/adapters/gps_l1_ca_dll_pll_tracking_gpu.h
@@ -0,0 +1,113 @@
+/*!
+ * \file gps_l1_ca_dll_pll_tracking_gpu.h
+ * \brief Implementation of an adapter of a DLL+PLL tracking loop block using GPU accelerated functions
+ * for GPS L1 C/A to a TrackingInterface
+ * \author Javier Arribas, 2015. jarribas(at)cttc.es
+ *
+ * Code DLL + carrier PLL according to the algorithms described in:
+ * K.Borre, D.M.Akos, N.Bertelsen, P.Rinder, and S.H.Jensen,
+ * A Software-Defined GPS and Galileo Receiver. A Single-Frequency
+ * Approach, Birkha user, 2007
+ *
+ * -------------------------------------------------------------------------
+ *
+ * Copyright (C) 2010-2015  (see AUTHORS file for a list of contributors)
+ *
+ * GNSS-SDR is a software defined Global Navigation
+ *          Satellite Systems receiver
+ *
+ * This file is part of GNSS-SDR.
+ *
+ * GNSS-SDR is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * GNSS-SDR is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * -------------------------------------------------------------------------
+ */
+
+#ifndef GNSS_SDR_GPS_L1_CA_DLL_PLL_TRACKING_GPU_H_
+#define GNSS_SDR_GPS_L1_CA_DLL_PLL_TRACKING_GPU_H_
+
+#include <string>
+#include <gnuradio/msg_queue.h>
+#include "tracking_interface.h"
+#include "gps_l1_ca_dll_pll_tracking_gpu_cc.h"
+
+
+class ConfigurationInterface;
+
+/*!
+ * \brief This class implements a code DLL + carrier PLL tracking loop using GPU accelerated functions
+ */
+class GpsL1CaDllPllTrackingGPU : public TrackingInterface
+{
+public:
+
+  GpsL1CaDllPllTrackingGPU(ConfigurationInterface* configuration,
+            std::string role,
+            unsigned int in_streams,
+            unsigned int out_streams,
+            boost::shared_ptr<gr::msg_queue> queue);
+
+    virtual ~GpsL1CaDllPllTrackingGPU();
+
+    std::string role()
+    {
+        return role_;
+    }
+
+    //! Returns "GPS_L1_CA_DLL_PLL_Tracking"
+    std::string implementation()
+    {
+        return "GPS_L1_CA_DLL_PLL_Tracking_GPU";
+    }
+    size_t item_size()
+    {
+        return item_size_;
+    }
+
+    void connect(gr::top_block_sptr top_block);
+    void disconnect(gr::top_block_sptr top_block);
+    gr::basic_block_sptr get_left_block();
+    gr::basic_block_sptr get_right_block();
+
+
+    /*!
+     * \brief Set tracking channel unique ID
+     */
+    void set_channel(unsigned int channel);
+
+    /*!
+     * \brief Set acquisition/tracking common Gnss_Synchro object pointer
+     * to efficiently exchange synchronization data between acquisition and tracking blocks
+     */
+    void set_gnss_synchro(Gnss_Synchro* p_gnss_synchro);
+
+    /*!
+     * \brief Set tracking channel internal queue
+     */
+    void set_channel_queue(concurrent_queue<int> *channel_internal_queue);
+
+    void start_tracking();
+
+private:
+    gps_l1_ca_dll_pll_tracking_gpu_cc_sptr tracking_;
+    size_t item_size_;
+    unsigned int channel_;
+    std::string role_;
+    unsigned int in_streams_;
+    unsigned int out_streams_;
+    boost::shared_ptr<gr::msg_queue> queue_;
+    concurrent_queue<int> *channel_internal_queue_;
+};
+
+#endif // GNSS_SDR_GPS_L1_CA_DLL_PLL_TRACKING_GPU_H_
--- a/src/algorithms/tracking/gnuradio_blocks/CMakeLists.txt
+++ b/src/algorithms/tracking/gnuradio_blocks/CMakeLists.txt
@@ -16,6 +16,12 @@
 # along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
 #

+
+if(ENABLE_CUDA)
+	FIND_PACKAGE(CUDA REQUIRED)
+	set(OPT_TRACKING_BLOCKS ${OPT_TRACKING_BLOCKS} gps_l1_ca_dll_pll_tracking_gpu_cc.cc)
+endif(ENABLE_CUDA)
+
 set(TRACKING_GR_BLOCKS_SOURCES
     galileo_e1_dll_pll_veml_tracking_cc.cc
     galileo_volk_e1_dll_pll_veml_tracking_cc.cc
@@ -26,6 +32,7 @@ set(TRACKING_GR_BLOCKS_SOURCES
     gps_l1_ca_tcp_connector_tracking_cc.cc
     galileo_e5a_dll_pll_tracking_cc.cc
     gps_l2_m_dll_pll_tracking_cc.cc
+	 ${OPT_TRACKING_BLOCKS}   
 )

 include_directories(
@@ -40,6 +47,8 @@ include_directories(
     ${Boost_INCLUDE_DIRS}
     ${GNURADIO_RUNTIME_INCLUDE_DIRS}
     ${VOLK_GNSSSDR_INCLUDE_DIRS}
+     ${CUDA_INCLUDE_DIRS}
+     ${CMAKE_SOURCE_DIR}/src/algorithms/tracking/libs/cudahelpers
 )

 if(ENABLE_GENERIC_ARCH)
@@ -49,7 +58,8 @@ endif(ENABLE_GENERIC_ARCH)
 file(GLOB TRACKING_GR_BLOCKS_HEADERS "*.h")
 add_library(tracking_gr_blocks ${TRACKING_GR_BLOCKS_SOURCES} ${TRACKING_GR_BLOCKS_HEADERS})
 source_group(Headers FILES ${TRACKING_GR_BLOCKS_HEADERS})
-target_link_libraries(tracking_gr_blocks tracking_lib ${GNURADIO_RUNTIME_LIBRARIES} gnss_sp_libs ${Boost_LIBRARIES} ${VOLK_GNSSSDR_LIBRARIES} ${ORC_LIBRARIES} )
+
+target_link_libraries(tracking_gr_blocks tracking_lib ${GNURADIO_RUNTIME_LIBRARIES} gnss_sp_libs ${Boost_LIBRARIES} ${VOLK_GNSSSDR_LIBRARIES} ${ORC_LIBRARIES} ${CUDA_LIBRARIES})
 if(NOT VOLK_GNSSSDR_FOUND)
    add_dependencies(tracking_gr_blocks volk_gnsssdr_module)
 endif(NOT VOLK_GNSSSDR_FOUND)
--- a/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_cc.cc
+++ b/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_cc.cc
@@ -594,7 +594,8 @@ int Gps_L1_Ca_Dll_Pll_Tracking_cc::general_work (int noutput_items, gr_vector_in

                    // carrier and code frequency
                    d_dump_file.write(reinterpret_cast<char*>(&d_carrier_doppler_hz), sizeof(float));
-                    d_dump_file.write(reinterpret_cast<char*>(&d_code_freq_chips), sizeof(float));
+                    tmp_float=d_code_freq_chips;
+                    d_dump_file.write(reinterpret_cast<char*>(&tmp_float), sizeof(float));

                    //PLL commands
                    d_dump_file.write(reinterpret_cast<char*>(&carr_error_hz), sizeof(float));
--- a/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_gpu_cc.cc
+++ b/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_gpu_cc.cc
@@ -0,0 +1,610 @@
+/*!
+ * \file gps_l1_ca_dll_pll_tracking_gpu_cc.cc
+ * \brief Implementation of a code DLL + carrier PLL tracking block, GPU ACCELERATED
+ * \author Javier Arribas, 2015. jarribas(at)cttc.es
+ *
+ * Code DLL + carrier PLL according to the algorithms described in:
+ * [1] K.Borre, D.M.Akos, N.Bertelsen, P.Rinder, and S.H.Jensen,
+ * A Software-Defined GPS and Galileo Receiver. A Single-Frequency
+ * Approach, Birkhauser, 2007
+ *
+ * -------------------------------------------------------------------------
+ *
+ * Copyright (C) 2010-2015  (see AUTHORS file for a list of contributors)
+ *
+ * GNSS-SDR is a software defined Global Navigation
+ *          Satellite Systems receiver
+ *
+ * This file is part of GNSS-SDR.
+ *
+ * GNSS-SDR is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * GNSS-SDR is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * -------------------------------------------------------------------------
+ */
+
+#include "gps_l1_ca_dll_pll_tracking_gpu_cc.h"
+#include <cmath>
+#include <iostream>
+#include <memory>
+#include <sstream>
+#include <boost/lexical_cast.hpp>
+#include <gnuradio/io_signature.h>
+#include <glog/logging.h>
+#include "gnss_synchro.h"
+#include "gps_sdr_signal_processing.h"
+#include "tracking_discriminators.h"
+#include "lock_detectors.h"
+#include "GPS_L1_CA.h"
+#include "control_message_factory.h"
+#include <volk/volk.h> //volk_alignement
+// includes
+#include <cuda_profiler_api.h>
+#include <helper_functions.h>  // helper for shared functions common to CUDA Samples
+#include <helper_cuda.h>       // helper functions for CUDA error checking and initialization
+
+/*!
+ * \todo Include in definition header file
+ */
+#define CN0_ESTIMATION_SAMPLES 20
+#define MINIMUM_VALID_CN0 25
+#define MAXIMUM_LOCK_FAIL_COUNTER 50
+#define CARRIER_LOCK_THRESHOLD 0.85
+
+
+using google::LogMessage;
+
+gps_l1_ca_dll_pll_tracking_gpu_cc_sptr
+gps_l1_ca_dll_pll_make_tracking_gpu_cc(
+        long if_freq,
+        long fs_in,
+        unsigned int vector_length,
+        boost::shared_ptr<gr::msg_queue> queue,
+        bool dump,
+        std::string dump_filename,
+        float pll_bw_hz,
+        float dll_bw_hz,
+        float early_late_space_chips)
+{
+    return gps_l1_ca_dll_pll_tracking_gpu_cc_sptr(new Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc(if_freq,
+            fs_in, vector_length, queue, dump, dump_filename, pll_bw_hz, dll_bw_hz, early_late_space_chips));
+}
+
+
+void Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::forecast (int noutput_items,
+        gr_vector_int &ninput_items_required)
+{
+    ninput_items_required[0] = static_cast<int>(d_vector_length) * 2; //set the required available samples in each call
+}
+
+
+
+Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc(
+        long if_freq,
+        long fs_in,
+        unsigned int vector_length,
+        boost::shared_ptr<gr::msg_queue> queue,
+        bool dump,
+        std::string dump_filename,
+        float pll_bw_hz,
+        float dll_bw_hz,
+        float early_late_space_chips) :
+        gr::block("Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc", gr::io_signature::make(1, 1, sizeof(gr_complex)),
+                gr::io_signature::make(1, 1, sizeof(Gnss_Synchro)))
+{
+    // initialize internal vars
+    d_queue = queue;
+    d_dump = dump;
+    d_if_freq = if_freq;
+    d_fs_in = fs_in;
+    d_vector_length = vector_length;
+    d_dump_filename = dump_filename;
+
+    // Initialize tracking  ==========================================
+    d_code_loop_filter.set_DLL_BW(dll_bw_hz);
+    d_carrier_loop_filter.set_PLL_BW(pll_bw_hz);
+
+    //--- DLL variables --------------------------------------------------------
+    d_early_late_spc_chips = early_late_space_chips; // Define early-late offset (in chips)
+
+    // Initialization of local code replica
+    // Get space for a vector with the C/A code replica sampled 1x/chip
+    //d_ca_code = static_cast<gr_complex*>(volk_malloc((GPS_L1_CA_CODE_LENGTH_CHIPS + 2) * sizeof(gr_complex), volk_get_alignment()));
+    d_ca_code = static_cast<gr_complex*>(volk_malloc((GPS_L1_CA_CODE_LENGTH_CHIPS) * sizeof(gr_complex), volk_get_alignment()));
+
+    multicorrelator_gpu = new cuda_multicorrelator();
+    int N_CORRELATORS=3;
+    //local code resampler on CPU (old)
+    //multicorrelator_gpu->init_cuda(0, NULL, 2 * d_vector_length , 2 * d_vector_length , N_CORRELATORS);
+
+    //local code resampler on GPU (new)
+    multicorrelator_gpu->init_cuda_integrated_resampler(0, NULL, 2 * d_vector_length , GPS_L1_CA_CODE_LENGTH_CHIPS , N_CORRELATORS);
+
+    // Get space for the resampled early / prompt / late local replicas
+	checkCudaErrors(cudaHostAlloc((void**)&d_local_code_shift_chips, N_CORRELATORS * sizeof(float),  cudaHostAllocMapped ));
+
+
+    //allocate host memory
+    //pinned memory mode - use special function to get OS-pinned memory
+	checkCudaErrors(cudaHostAlloc((void**)&in_gpu, 2 * d_vector_length  * sizeof(gr_complex),  cudaHostAllocMapped ));
+
+	//old local codes vector
+	//checkCudaErrors(cudaHostAlloc((void**)&d_local_codes_gpu, (V_LEN * sizeof(gr_complex))*N_CORRELATORS, cudaHostAllocWriteCombined ));
+
+	//new integrated shifts
+	//checkCudaErrors(cudaHostAlloc((void**)&d_local_codes_gpu, (2 * d_vector_length * sizeof(gr_complex)), cudaHostAllocWriteCombined ));
+
+	// correlator outputs (scalar)
+	checkCudaErrors(cudaHostAlloc((void**)&d_corr_outs_gpu ,sizeof(gr_complex)*N_CORRELATORS,  cudaHostAllocWriteCombined ));
+	//map to EPL pointers
+    d_Early = &d_corr_outs_gpu[0];
+    d_Prompt =  &d_corr_outs_gpu[1];
+    d_Late = &d_corr_outs_gpu[2];
+
+    //--- Perform initializations ------------------------------
+    // define initial code frequency basis of NCO
+    d_code_freq_chips = GPS_L1_CA_CODE_RATE_HZ;
+    // define residual code phase (in chips)
+    d_rem_code_phase_samples = 0.0;
+    // define residual carrier phase
+    d_rem_carr_phase_rad = 0.0;
+
+    // sample synchronization
+    d_sample_counter = 0;
+    //d_sample_counter_seconds = 0;
+    d_acq_sample_stamp = 0;
+
+    d_enable_tracking = false;
+    d_pull_in = false;
+    d_last_seg = 0;
+
+    d_current_prn_length_samples = static_cast<int>(d_vector_length);
+
+    // CN0 estimation and lock detector buffers
+    d_cn0_estimation_counter = 0;
+    d_Prompt_buffer = new gr_complex[CN0_ESTIMATION_SAMPLES];
+    d_carrier_lock_test = 1;
+    d_CN0_SNV_dB_Hz = 0;
+    d_carrier_lock_fail_counter = 0;
+    d_carrier_lock_threshold = CARRIER_LOCK_THRESHOLD;
+
+    systemName["G"] = std::string("GPS");
+    systemName["S"] = std::string("SBAS");
+
+
+    set_relative_rate(1.0/((double)d_vector_length*2));
+
+    d_channel_internal_queue = 0;
+    d_acquisition_gnss_synchro = 0;
+    d_channel = 0;
+    d_acq_code_phase_samples = 0.0;
+    d_acq_carrier_doppler_hz = 0.0;
+    d_carrier_doppler_hz = 0.0;
+    d_acc_carrier_phase_rad = 0.0;
+    d_code_phase_samples = 0.0;
+    d_acc_code_phase_secs = 0.0;
+    //set_min_output_buffer((long int)300);
+}
+
+
+void Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::start_tracking()
+{
+    /*
+     *  correct the code phase according to the delay between acq and trk
+     */
+    d_acq_code_phase_samples = d_acquisition_gnss_synchro->Acq_delay_samples;
+    d_acq_carrier_doppler_hz = d_acquisition_gnss_synchro->Acq_doppler_hz;
+    d_acq_sample_stamp =  d_acquisition_gnss_synchro->Acq_samplestamp_samples;
+
+    long int acq_trk_diff_samples;
+    float acq_trk_diff_seconds;
+    acq_trk_diff_samples = static_cast<long int>(d_sample_counter) - static_cast<long int>(d_acq_sample_stamp);//-d_vector_length;
+    DLOG(INFO) << "Number of samples between Acquisition and Tracking =" << acq_trk_diff_samples;
+    acq_trk_diff_seconds = static_cast<float>(acq_trk_diff_samples) / static_cast<float>(d_fs_in);
+    //doppler effect
+    // Fd=(C/(C+Vr))*F
+    float radial_velocity = (GPS_L1_FREQ_HZ + d_acq_carrier_doppler_hz) / GPS_L1_FREQ_HZ;
+    // new chip and prn sequence periods based on acq Doppler
+    float T_chip_mod_seconds;
+    float T_prn_mod_seconds;
+    float T_prn_mod_samples;
+    d_code_freq_chips = radial_velocity * GPS_L1_CA_CODE_RATE_HZ;
+    T_chip_mod_seconds = 1/d_code_freq_chips;
+    T_prn_mod_seconds = T_chip_mod_seconds * GPS_L1_CA_CODE_LENGTH_CHIPS;
+    T_prn_mod_samples = T_prn_mod_seconds * static_cast<float>(d_fs_in);
+
+    d_current_prn_length_samples = round(T_prn_mod_samples);
+
+    float T_prn_true_seconds = GPS_L1_CA_CODE_LENGTH_CHIPS / GPS_L1_CA_CODE_RATE_HZ;
+    float T_prn_true_samples = T_prn_true_seconds * static_cast<float>(d_fs_in);
+    float T_prn_diff_seconds=  T_prn_true_seconds - T_prn_mod_seconds;
+    float N_prn_diff = acq_trk_diff_seconds / T_prn_true_seconds;
+    float corrected_acq_phase_samples, delay_correction_samples;
+    corrected_acq_phase_samples = fmod((d_acq_code_phase_samples + T_prn_diff_seconds * N_prn_diff * static_cast<float>(d_fs_in)), T_prn_true_samples);
+    if (corrected_acq_phase_samples < 0)
+        {
+            corrected_acq_phase_samples = T_prn_mod_samples + corrected_acq_phase_samples;
+        }
+    delay_correction_samples = d_acq_code_phase_samples - corrected_acq_phase_samples;
+
+    d_acq_code_phase_samples = corrected_acq_phase_samples;
+
+    d_carrier_doppler_hz = d_acq_carrier_doppler_hz;
+
+    // DLL/PLL filter initialization
+    d_carrier_loop_filter.initialize(); // initialize the carrier filter
+    d_code_loop_filter.initialize();    // initialize the code filter
+
+    // generate local reference ALWAYS starting at chip 1 (1 sample per chip)
+    gps_l1_ca_code_gen_complex(d_ca_code, d_acquisition_gnss_synchro->PRN, 0);
+
+    d_local_code_shift_chips[0]=-d_early_late_spc_chips;
+    d_local_code_shift_chips[1]=0.0;
+    d_local_code_shift_chips[2]=d_early_late_spc_chips;
+
+    multicorrelator_gpu->set_local_code_and_taps(GPS_L1_CA_CODE_LENGTH_CHIPS,d_ca_code, d_local_code_shift_chips,3);
+
+    d_carrier_lock_fail_counter = 0;
+    d_rem_code_phase_samples = 0;
+    d_rem_carr_phase_rad = 0;
+    d_acc_carrier_phase_rad = 0;
+    d_acc_code_phase_secs = 0;
+
+    d_code_phase_samples = d_acq_code_phase_samples;
+
+    std::string sys_ = &d_acquisition_gnss_synchro->System;
+    sys = sys_.substr(0,1);
+
+    // DEBUG OUTPUT
+    std::cout << "Tracking start on channel " << d_channel << " for satellite " << Gnss_Satellite(systemName[sys], d_acquisition_gnss_synchro->PRN) << std::endl;
+    LOG(INFO) << "Starting tracking of satellite " << Gnss_Satellite(systemName[sys], d_acquisition_gnss_synchro->PRN) << " on channel " << d_channel;
+
+
+    // enable tracking
+    d_pull_in = true;
+    d_enable_tracking = true;
+
+    LOG(INFO) << "PULL-IN Doppler [Hz]=" << d_carrier_doppler_hz
+            << " Code Phase correction [samples]=" << delay_correction_samples
+            << " PULL-IN Code Phase [samples]=" << d_acq_code_phase_samples;
+}
+
+
+Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::~Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc()
+{
+    d_dump_file.close();
+
+	cudaFreeHost(in_gpu);
+	cudaFreeHost(d_carr_sign_gpu);
+	cudaFreeHost(d_corr_outs_gpu);
+	cudaFreeHost(d_local_code_shift_chips);
+
+	multicorrelator_gpu->free_cuda();
+	delete(multicorrelator_gpu);
+
+    volk_free(d_ca_code);
+
+    delete[] d_Prompt_buffer;
+}
+
+
+
+int Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::general_work (int noutput_items, gr_vector_int &ninput_items,
+        gr_vector_const_void_star &input_items, gr_vector_void_star &output_items)
+{
+    // process vars
+    float carr_error_hz=0.0;
+    float carr_error_filt_hz=0.0;
+    float code_error_chips=0.0;
+    float code_error_filt_chips=0.0;
+
+    // Block input data and block output stream pointers
+    const gr_complex* in = (gr_complex*) input_items[0];
+    Gnss_Synchro **out = (Gnss_Synchro **) &output_items[0];
+
+    // GNSS_SYNCHRO OBJECT to interchange data between tracking->telemetry_decoder
+    Gnss_Synchro current_synchro_data = Gnss_Synchro();
+
+    if (d_enable_tracking == true)
+        {
+            // Receiver signal alignment
+            if (d_pull_in == true)
+                {
+                    int samples_offset;
+                    int acq_to_trk_delay_samples;
+                    acq_to_trk_delay_samples = d_sample_counter - d_acq_sample_stamp;
+                    samples_offset = round(d_acq_code_phase_samples)+d_current_prn_length_samples - acq_to_trk_delay_samples%d_current_prn_length_samples;
+                    d_sample_counter = d_sample_counter + samples_offset; //count for the processed samples
+                    d_pull_in = false;
+                    // Fill the acquisition data
+                    current_synchro_data = *d_acquisition_gnss_synchro;
+                    *out[0] = current_synchro_data;
+                    consume_each(samples_offset); //shift input to perform alignment with local replica
+                    return 1;
+                }
+
+            // Fill the acquisition data
+            current_synchro_data = *d_acquisition_gnss_synchro;
+
+            // UPDATE NCO COMMAND
+            float phase_step_rad = static_cast<float>(GPS_TWO_PI) * d_carrier_doppler_hz / static_cast<float>(d_fs_in);
+
+        	//code resampler on GPU (new)
+            float code_phase_step_chips = static_cast<float>(d_code_freq_chips) / static_cast<float>(d_fs_in);
+            float rem_code_phase_chips = d_rem_code_phase_samples * (d_code_freq_chips / d_fs_in);
+
+            cudaProfilerStart();
+            multicorrelator_gpu->Carrier_wipeoff_multicorrelator_resampler_cuda(
+    				d_corr_outs_gpu,
+    				in,
+    				d_rem_carr_phase_rad,
+    				phase_step_rad,
+    				code_phase_step_chips,
+    				rem_code_phase_chips,
+    				d_current_prn_length_samples,
+    				3);
+        	cudaProfilerStop();
+
+            // ################## PLL ##########################################################
+            // PLL discriminator
+            carr_error_hz = pll_cloop_two_quadrant_atan(*d_Prompt) / static_cast<float>(GPS_TWO_PI);
+            // Carrier discriminator filter
+            carr_error_filt_hz = d_carrier_loop_filter.get_carrier_nco(carr_error_hz);
+            // New carrier Doppler frequency estimation
+            d_carrier_doppler_hz = d_acq_carrier_doppler_hz + carr_error_filt_hz;
+            // New code Doppler frequency estimation
+            d_code_freq_chips = GPS_L1_CA_CODE_RATE_HZ + ((d_carrier_doppler_hz * GPS_L1_CA_CODE_RATE_HZ) / GPS_L1_FREQ_HZ);
+            //carrier phase accumulator for (K) doppler estimation
+            d_acc_carrier_phase_rad = d_acc_carrier_phase_rad + GPS_TWO_PI * d_carrier_doppler_hz * GPS_L1_CA_CODE_PERIOD;
+            //remanent carrier phase to prevent overflow in the code NCO
+            d_rem_carr_phase_rad = d_rem_carr_phase_rad + GPS_TWO_PI * d_carrier_doppler_hz * GPS_L1_CA_CODE_PERIOD;
+            d_rem_carr_phase_rad = fmod(d_rem_carr_phase_rad, GPS_TWO_PI);
+
+            // ################## DLL ##########################################################
+            // DLL discriminator
+            code_error_chips = dll_nc_e_minus_l_normalized(*d_Early, *d_Late); //[chips/Ti]
+            // Code discriminator filter
+            code_error_filt_chips = d_code_loop_filter.get_code_nco(code_error_chips); //[chips/second]
+            //Code phase accumulator
+            float code_error_filt_secs;
+            code_error_filt_secs = (GPS_L1_CA_CODE_PERIOD * code_error_filt_chips) / GPS_L1_CA_CODE_RATE_HZ; //[seconds]
+            d_acc_code_phase_secs = d_acc_code_phase_secs + code_error_filt_secs;
+
+            // ################## CARRIER AND CODE NCO BUFFER ALIGNEMENT #######################
+            // keep alignment parameters for the next input buffer
+            double T_chip_seconds;
+            double T_prn_seconds;
+            double T_prn_samples;
+            double K_blk_samples;
+            // Compute the next buffer length based in the new period of the PRN sequence and the code phase error estimation
+            T_chip_seconds = 1 / static_cast<double>(d_code_freq_chips);
+            T_prn_seconds = T_chip_seconds * GPS_L1_CA_CODE_LENGTH_CHIPS;
+            T_prn_samples = T_prn_seconds * static_cast<double>(d_fs_in);
+            K_blk_samples = T_prn_samples + d_rem_code_phase_samples + static_cast<double>(code_error_filt_secs) * static_cast<double>(d_fs_in);
+            //d_rem_code_phase_samples = K_blk_samples - d_current_prn_length_samples; //rounding error < 1 sample
+
+            // ####### CN0 ESTIMATION AND LOCK DETECTORS ######
+            if (d_cn0_estimation_counter < CN0_ESTIMATION_SAMPLES)
+                {
+                    // fill buffer with prompt correlator output values
+                    d_Prompt_buffer[d_cn0_estimation_counter] = *d_Prompt;
+                    d_cn0_estimation_counter++;
+                }
+            else
+                {
+                    d_cn0_estimation_counter = 0;
+                    // Code lock indicator
+                    d_CN0_SNV_dB_Hz = cn0_svn_estimator(d_Prompt_buffer, CN0_ESTIMATION_SAMPLES, d_fs_in, GPS_L1_CA_CODE_LENGTH_CHIPS);
+                    // Carrier lock indicator
+                    d_carrier_lock_test = carrier_lock_detector(d_Prompt_buffer, CN0_ESTIMATION_SAMPLES);
+                    // Loss of lock detection
+                    if (d_carrier_lock_test < d_carrier_lock_threshold or d_CN0_SNV_dB_Hz < MINIMUM_VALID_CN0)
+                        {
+                            d_carrier_lock_fail_counter++;
+                        }
+                    else
+                        {
+                            if (d_carrier_lock_fail_counter > 0) d_carrier_lock_fail_counter--;
+                        }
+                    if (d_carrier_lock_fail_counter > MAXIMUM_LOCK_FAIL_COUNTER)
+                        {
+                            std::cout << "Loss of lock in channel " << d_channel << "!" << std::endl;
+                            LOG(INFO) << "Loss of lock in channel " << d_channel << "!";
+                            std::unique_ptr<ControlMessageFactory> cmf(new ControlMessageFactory());
+                            if (d_queue != gr::msg_queue::sptr())
+                                {
+                                    d_queue->handle(cmf->GetQueueMessage(d_channel, 2));
+                                }
+                            d_carrier_lock_fail_counter = 0;
+                            d_enable_tracking = false; // TODO: check if disabling tracking is consistent with the channel state machine
+                        }
+                }
+            // ########### Output the tracking data to navigation and PVT ##########
+            current_synchro_data.Prompt_I = static_cast<double>((*d_Prompt).real());
+            current_synchro_data.Prompt_Q = static_cast<double>((*d_Prompt).imag());
+
+            // Tracking_timestamp_secs is aligned with the NEXT PRN start sample (Hybridization problem!)
+            //compute remnant code phase samples BEFORE the Tracking timestamp
+            //d_rem_code_phase_samples = K_blk_samples - d_current_prn_length_samples; //rounding error < 1 sample
+            //current_synchro_data.Tracking_timestamp_secs = ((double)d_sample_counter + (double)d_current_prn_length_samples + (double)d_rem_code_phase_samples)/static_cast<double>(d_fs_in);
+
+            // Tracking_timestamp_secs is aligned with the CURRENT PRN start sample (Hybridization OK!, but some glitches??)
+            current_synchro_data.Tracking_timestamp_secs = (static_cast<double>(d_sample_counter) + static_cast<double>(d_rem_code_phase_samples)) / static_cast<double>(d_fs_in);
+            //compute remnant code phase samples AFTER the Tracking timestamp
+            d_rem_code_phase_samples = K_blk_samples - d_current_prn_length_samples; //rounding error < 1 sample
+
+            //current_synchro_data.Tracking_timestamp_secs = ((double)d_sample_counter)/static_cast<double>(d_fs_in);
+            // This tracking block aligns the Tracking_timestamp_secs with the start sample of the PRN, thus, Code_phase_secs=0
+            current_synchro_data.Code_phase_secs = 0;
+            current_synchro_data.Carrier_phase_rads = static_cast<double>(d_acc_carrier_phase_rad);
+            current_synchro_data.Carrier_Doppler_hz = static_cast<double>(d_carrier_doppler_hz);
+            current_synchro_data.CN0_dB_hz = static_cast<double>(d_CN0_SNV_dB_Hz);
+            current_synchro_data.Flag_valid_pseudorange = false;
+            *out[0] = current_synchro_data;
+
+            // ########## DEBUG OUTPUT
+            /*!
+             *  \todo The stop timer has to be moved to the signal source!
+             */
+            // debug: Second counter in channel 0
+            if (d_channel == 0)
+                {
+                    if (floor(d_sample_counter / d_fs_in) != d_last_seg)
+                        {
+                            d_last_seg = floor(d_sample_counter / d_fs_in);
+                            std::cout << "Current input signal time = " << d_last_seg << " [s]" << std::endl;
+                            DLOG(INFO) << "GPS L1 C/A Tracking CH " << d_channel <<  ": Satellite " << Gnss_Satellite(systemName[sys], d_acquisition_gnss_synchro->PRN)
+                                      << ", CN0 = " << d_CN0_SNV_dB_Hz << " [dB-Hz]" << std::endl;
+                            //if (d_last_seg==5) d_carrier_lock_fail_counter=500; //DEBUG: force unlock!
+                        }
+                }
+            else
+                {
+                    if (floor(d_sample_counter / d_fs_in) != d_last_seg)
+                        {
+                            d_last_seg = floor(d_sample_counter / d_fs_in);
+                            DLOG(INFO) << "Tracking CH " << d_channel <<  ": Satellite " << Gnss_Satellite(systemName[sys], d_acquisition_gnss_synchro->PRN)
+                                       << ", CN0 = " << d_CN0_SNV_dB_Hz << " [dB-Hz]";
+                        }
+                }
+        }
+    else
+        {
+            // ########## DEBUG OUTPUT (TIME ONLY for channel 0 when tracking is disabled)
+            /*!
+             *  \todo The stop timer has to be moved to the signal source!
+             */
+            // stream to collect cout calls to improve thread safety
+            std::stringstream tmp_str_stream;
+            if (floor(d_sample_counter / d_fs_in) != d_last_seg)
+                {
+                    d_last_seg = floor(d_sample_counter / d_fs_in);
+
+                    if (d_channel == 0)
+                        {
+                            // debug: Second counter in channel 0
+                            tmp_str_stream << "Current input signal time = " << d_last_seg << " [s]" << std::endl << std::flush;
+                            std::cout << tmp_str_stream.rdbuf() << std::flush;
+                        }
+                }
+            *d_Early = gr_complex(0,0);
+            *d_Prompt = gr_complex(0,0);
+            *d_Late = gr_complex(0,0);
+
+            current_synchro_data.System = {'G'};
+            current_synchro_data.Flag_valid_pseudorange = false;
+            *out[0] = current_synchro_data;
+        }
+
+    if(d_dump)
+        {
+            // MULTIPLEXED FILE RECORDING - Record results to file
+            float prompt_I;
+            float prompt_Q;
+            float tmp_E, tmp_P, tmp_L;
+            float tmp_float;
+            double tmp_double;
+            prompt_I = (*d_Prompt).real();
+            prompt_Q = (*d_Prompt).imag();
+            tmp_E = std::abs<float>(*d_Early);
+            tmp_P = std::abs<float>(*d_Prompt);
+            tmp_L = std::abs<float>(*d_Late);
+            try
+            {
+                    // EPR
+                    d_dump_file.write(reinterpret_cast<char*>(&tmp_E), sizeof(float));
+                    d_dump_file.write(reinterpret_cast<char*>(&tmp_P), sizeof(float));
+                    d_dump_file.write(reinterpret_cast<char*>(&tmp_L), sizeof(float));
+                    // PROMPT I and Q (to analyze navigation symbols)
+                    d_dump_file.write(reinterpret_cast<char*>(&prompt_I), sizeof(float));
+                    d_dump_file.write(reinterpret_cast<char*>(&prompt_Q), sizeof(float));
+                    // PRN start sample stamp
+                    //tmp_float=(float)d_sample_counter;
+                    d_dump_file.write(reinterpret_cast<char*>(&d_sample_counter), sizeof(unsigned long int));
+                    // accumulated carrier phase
+                    d_dump_file.write(reinterpret_cast<char*>(&d_acc_carrier_phase_rad), sizeof(float));
+
+                    // carrier and code frequency
+                    d_dump_file.write(reinterpret_cast<char*>(&d_carrier_doppler_hz), sizeof(float));
+                    tmp_float=d_code_freq_chips;
+                    d_dump_file.write(reinterpret_cast<char*>(&tmp_float), sizeof(float));
+
+                    //PLL commands
+                    d_dump_file.write(reinterpret_cast<char*>(&carr_error_hz), sizeof(float));
+                    d_dump_file.write(reinterpret_cast<char*>(&carr_error_filt_hz), sizeof(float));
+
+                    //DLL commands
+                    d_dump_file.write(reinterpret_cast<char*>(&code_error_chips), sizeof(float));
+                    d_dump_file.write(reinterpret_cast<char*>(&code_error_filt_chips), sizeof(float));
+
+                    // CN0 and carrier lock test
+                    d_dump_file.write(reinterpret_cast<char*>(&d_CN0_SNV_dB_Hz), sizeof(float));
+                    d_dump_file.write(reinterpret_cast<char*>(&d_carrier_lock_test), sizeof(float));
+
+                    // AUX vars (for debug purposes)
+                    tmp_float = d_rem_code_phase_samples;
+                    d_dump_file.write(reinterpret_cast<char*>(&tmp_float), sizeof(float));
+                    tmp_double = static_cast<double>(d_sample_counter + d_current_prn_length_samples);
+                    d_dump_file.write(reinterpret_cast<char*>(&tmp_double), sizeof(double));
+            }
+            catch (std::ifstream::failure e)
+            {
+                    LOG(WARNING) << "Exception writing trk dump file " << e.what();
+            }
+        }
+
+    consume_each(d_current_prn_length_samples); // this is necessary in gr::block derivates
+    d_sample_counter += d_current_prn_length_samples; //count for the processed samples
+    //LOG(INFO)<<"GPS tracking output end on CH="<<this->d_channel << " SAMPLE STAMP="<<d_sample_counter<<std::endl;
+    return 1; //output tracking result ALWAYS even in the case of d_enable_tracking==false
+}
+
+
+
+void Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::set_channel(unsigned int channel)
+{
+    d_channel = channel;
+    LOG(INFO) << "Tracking Channel set to " << d_channel;
+    // ############# ENABLE DATA FILE LOG #################
+    if (d_dump == true)
+        {
+            if (d_dump_file.is_open() == false)
+                {
+                    try
+                    {
+                            d_dump_filename.append(boost::lexical_cast<std::string>(d_channel));
+                            d_dump_filename.append(".dat");
+                            d_dump_file.exceptions (std::ifstream::failbit | std::ifstream::badbit);
+                            d_dump_file.open(d_dump_filename.c_str(), std::ios::out | std::ios::binary);
+                            LOG(INFO) << "Tracking dump enabled on channel " << d_channel << " Log file: " << d_dump_filename.c_str() << std::endl;
+                    }
+                    catch (std::ifstream::failure e)
+                    {
+                            LOG(WARNING) << "channel " << d_channel << " Exception opening trk dump file " << e.what() << std::endl;
+                    }
+                }
+        }
+}
+
+
+
+void Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::set_channel_queue(concurrent_queue<int> *channel_internal_queue)
+{
+    d_channel_internal_queue = channel_internal_queue;
+}
+
+
+void Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::set_gnss_synchro(Gnss_Synchro* p_gnss_synchro)
+{
+    d_acquisition_gnss_synchro = p_gnss_synchro;
+}
--- a/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_gpu_cc.h
+++ b/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_gpu_cc.h
@@ -0,0 +1,191 @@
+/*!
+ * \file gps_l1_ca_dll_pll_tracking_gpu_cc.h
+ * \brief Implementation of a code DLL + carrier PLL tracking block, GPU ACCELERATED
+ * \author Javier Arribas, 2015. jarribas(at)cttc.es
+ *
+ * Code DLL + carrier PLL according to the algorithms described in:
+ * K.Borre, D.M.Akos, N.Bertelsen, P.Rinder, and S.H.Jensen,
+ * A Software-Defined GPS and Galileo Receiver. A Single-Frequency Approach,
+ * Birkhauser, 2007
+ *
+ * -------------------------------------------------------------------------
+ *
+ * Copyright (C) 2010-2015  (see AUTHORS file for a list of contributors)
+ *
+ * GNSS-SDR is a software defined Global Navigation
+ *          Satellite Systems receiver
+ *
+ * This file is part of GNSS-SDR.
+ *
+ * GNSS-SDR is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * GNSS-SDR is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * -------------------------------------------------------------------------
+ */
+
+#ifndef GNSS_SDR_GPS_L1_CA_DLL_PLL_TRACKING_GPU_CC_H
+#define	GNSS_SDR_GPS_L1_CA_DLL_PLL_TRACKING_GPU_CC_H
+
+#include <fstream>
+#include <queue>
+#include <map>
+#include <string>
+#include <boost/thread/mutex.hpp>
+#include <boost/thread/thread.hpp>
+#include <gnuradio/block.h>
+#include <gnuradio/msg_queue.h>
+#include "concurrent_queue.h"
+#include "gps_sdr_signal_processing.h"
+#include "gnss_synchro.h"
+#include "tracking_2nd_DLL_filter.h"
+#include "tracking_2nd_PLL_filter.h"
+#include "cuda_multicorrelator.h"
+
+class Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc;
+
+typedef boost::shared_ptr<Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc>
+        gps_l1_ca_dll_pll_tracking_gpu_cc_sptr;
+
+gps_l1_ca_dll_pll_tracking_gpu_cc_sptr
+gps_l1_ca_dll_pll_make_tracking_gpu_cc(long if_freq,
+                                   long fs_in, unsigned
+                                   int vector_length,
+                                   boost::shared_ptr<gr::msg_queue> queue,
+                                   bool dump,
+                                   std::string dump_filename,
+                                   float pll_bw_hz,
+                                   float dll_bw_hz,
+                                   float early_late_space_chips);
+
+
+
+/*!
+ * \brief This class implements a DLL + PLL tracking loop block
+ */
+class Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc: public gr::block
+{
+public:
+    ~Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc();
+
+    void set_channel(unsigned int channel);
+    void set_gnss_synchro(Gnss_Synchro* p_gnss_synchro);
+    void start_tracking();
+    void set_channel_queue(concurrent_queue<int> *channel_internal_queue);
+
+    int general_work (int noutput_items, gr_vector_int &ninput_items,
+            gr_vector_const_void_star &input_items, gr_vector_void_star &output_items);
+
+    void forecast (int noutput_items, gr_vector_int &ninput_items_required);
+
+private:
+    friend gps_l1_ca_dll_pll_tracking_gpu_cc_sptr
+    gps_l1_ca_dll_pll_make_tracking_gpu_cc(long if_freq,
+            long fs_in, unsigned
+            int vector_length,
+            boost::shared_ptr<gr::msg_queue> queue,
+            bool dump,
+            std::string dump_filename,
+            float pll_bw_hz,
+            float dll_bw_hz,
+            float early_late_space_chips);
+
+    Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc(long if_freq,
+            long fs_in, unsigned
+            int vector_length,
+            boost::shared_ptr<gr::msg_queue> queue,
+            bool dump,
+            std::string dump_filename,
+            float pll_bw_hz,
+            float dll_bw_hz,
+            float early_late_space_chips);
+    void update_local_code();
+    void update_local_carrier();
+
+    // tracking configuration vars
+    boost::shared_ptr<gr::msg_queue> d_queue;
+    concurrent_queue<int> *d_channel_internal_queue;
+    unsigned int d_vector_length;
+    bool d_dump;
+
+    Gnss_Synchro* d_acquisition_gnss_synchro;
+    unsigned int d_channel;
+    int d_last_seg;
+    long d_if_freq;
+    long d_fs_in;
+
+    double d_early_late_spc_chips;
+
+
+    //GPU HOST PINNED MEMORY IN/OUT VECTORS
+    gr_complex* in_gpu;
+    gr_complex* d_carr_sign_gpu;
+    gr_complex* d_local_codes_gpu;
+	float* d_local_code_shift_chips;
+    gr_complex* d_corr_outs_gpu;
+    cuda_multicorrelator *multicorrelator_gpu;
+
+
+    gr_complex* d_ca_code;
+
+    gr_complex *d_Early;
+    gr_complex *d_Prompt;
+    gr_complex *d_Late;
+
+
+    // remaining code phase and carrier phase between tracking loops
+    double d_rem_code_phase_samples;
+    float d_rem_carr_phase_rad;
+
+    // PLL and DLL filter library
+    Tracking_2nd_DLL_filter d_code_loop_filter;
+    Tracking_2nd_PLL_filter d_carrier_loop_filter;
+
+    // acquisition
+    float d_acq_code_phase_samples;
+    float d_acq_carrier_doppler_hz;
+
+    // tracking vars
+    double d_code_freq_chips;
+    float d_carrier_doppler_hz;
+    float d_acc_carrier_phase_rad;
+    float d_code_phase_samples;
+    float d_acc_code_phase_secs;
+
+    //PRN period in samples
+    int d_current_prn_length_samples;
+
+    //processing samples counters
+    unsigned long int d_sample_counter;
+    unsigned long int d_acq_sample_stamp;
+
+    // CN0 estimation and lock detector
+    int d_cn0_estimation_counter;
+    gr_complex* d_Prompt_buffer;
+    float d_carrier_lock_test;
+    float d_CN0_SNV_dB_Hz;
+    float d_carrier_lock_threshold;
+    int d_carrier_lock_fail_counter;
+
+    // control vars
+    bool d_enable_tracking;
+    bool d_pull_in;
+
+    // file dump
+    std::string d_dump_filename;
+    std::ofstream d_dump_file;
+
+    std::map<std::string, std::string> systemName;
+    std::string sys;
+};
+
+#endif //GNSS_SDR_GPS_L1_CA_DLL_PLL_TRACKING_GPU_CC_H
--- a/src/algorithms/tracking/libs/CMakeLists.txt
+++ b/src/algorithms/tracking/libs/CMakeLists.txt
@@ -16,6 +16,26 @@
 # along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
 #

+
+if(ENABLE_CUDA)
+	FIND_PACKAGE(CUDA REQUIRED)
+	
+	# Append current NVCC flags by something, eg comput capability
+	# set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} --gpu-architecture sm_30)
+	
+	list(APPEND CUDA_NVCC_FLAGS "-gencode arch=compute_30,code=sm_30; -std=c++11;-O3; -use_fast_math -default-stream per-thread")
+	SET(CUDA_PROPAGATE_HOST_FLAGS OFF)
+	
+	CUDA_INCLUDE_DIRECTORIES(
+     ${CMAKE_CURRENT_SOURCE_DIR}
+     ${CMAKE_CURRENT_SOURCE_DIR}/cudahelpers
+	)
+
+	SET(LIB_TYPE STATIC) #set the lib type
+	CUDA_ADD_LIBRARY(CUDA_CORRELATOR_LIB ${LIB_TYPE} cuda_multicorrelator.h cuda_multicorrelator.cu)
+endif(ENABLE_CUDA)
+
+
 set(TRACKING_LIB_SOURCES   
     correlator.cc
     lock_detectors.cc
@@ -24,7 +44,7 @@ set(TRACKING_LIB_SOURCES
     tracking_2nd_DLL_filter.cc
     tracking_2nd_PLL_filter.cc
     tracking_discriminators.cc
-     tracking_FLL_PLL_filter.cc     
+     tracking_FLL_PLL_filter.cc
 )

 include_directories(
@@ -33,6 +53,7 @@ include_directories(
     ${CMAKE_SOURCE_DIR}/src/core/interfaces
     ${CMAKE_SOURCE_DIR}/src/core/receiver
     ${VOLK_INCLUDE_DIRS}
+     ${CUDA_INCLUDE_DIRS}
 )

 if(ENABLE_GENERIC_ARCH)
@@ -43,7 +64,8 @@ if (SSE3_AVAILABLE)
    add_definitions( -DHAVE_SSE3=1 )
 endif(SSE3_AVAILABLE)

+
 file(GLOB TRACKING_LIB_HEADERS "*.h")
 add_library(tracking_lib ${TRACKING_LIB_SOURCES} ${TRACKING_LIB_HEADERS})
 source_group(Headers FILES ${TRACKING_LIB_HEADERS})
-target_link_libraries(tracking_lib ${VOLK_LIBRARIES} ${GNURADIO_RUNTIME_LIBRARIES})
+target_link_libraries(tracking_lib CUDA_CORRELATOR_LIB ${VOLK_LIBRARIES} ${GNURADIO_RUNTIME_LIBRARIES})
--- a/src/algorithms/tracking/libs/cuda_multicorrelator.cu
+++ b/src/algorithms/tracking/libs/cuda_multicorrelator.cu
@@ -0,0 +1,714 @@
+/*!
+ * \file cuda_multicorrelator.cu
+ * \brief High optimized CUDA GPU vector multiTAP correlator class
+ * \authors <ul>
+ *          <li> Javier Arribas, 2015. jarribas(at)cttc.es
+ *          </ul>
+ *
+ * Class that implements a high optimized vector multiTAP correlator class for NVIDIA CUDA GPUs
+ *
+ * -------------------------------------------------------------------------
+ *
+ * Copyright (C) 2010-2015  (see AUTHORS file for a list of contributors)
+ *
+ * GNSS-SDR is a software defined Global Navigation
+ *          Satellite Systems receiver
+ *
+ * This file is part of GNSS-SDR.
+ *
+ * GNSS-SDR is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * GNSS-SDR is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * -------------------------------------------------------------------------
+ */
+
+///////////////////////////////////////////////////////////////////////////////
+// On G80-class hardware 24-bit multiplication takes 4 clocks per warp
+// (the same as for floating point  multiplication and addition),
+// whereas full 32-bit multiplication takes 16 clocks per warp.
+// So if integer multiplication operands are  guaranteed to fit into 24 bits
+// (always lie withtin [-8M, 8M - 1] range in signed case),
+// explicit 24-bit multiplication is preferred for performance.
+///////////////////////////////////////////////////////////////////////////////
+#define IMUL(a, b) __mul24(a, b)
+
+#include "cuda_multicorrelator.h"
+
+#include <stdio.h>
+
+// For the CUDA runtime routines (prefixed with "cuda_")
+#include <cuda_runtime.h>
+
+// helper functions and utilities to work with CUDA
+#include <helper_cuda.h>
+#include <helper_functions.h>
+
+#define ACCUM_N 256
+
+
+__global__ void scalarProdGPUCPXxN_shifts_chips(
+    GPU_Complex *d_corr_out,
+    GPU_Complex *d_sig_in,
+    GPU_Complex *d_local_code_in,
+    float *d_shifts_chips,
+    float code_length_chips,
+    float code_phase_step_chips,
+    float rem_code_phase_chips,
+    int vectorN,
+    int elementN
+)
+{
+    //Accumulators cache
+    __shared__ GPU_Complex accumResult[ACCUM_N];
+
+    ////////////////////////////////////////////////////////////////////////////
+    // Cycle through every pair of vectors,
+    // taking into account that vector counts can be different
+    // from total number of thread blocks
+    ////////////////////////////////////////////////////////////////////////////
+    for (int vec = blockIdx.x; vec < vectorN; vec += gridDim.x)
+    {
+        //int vectorBase = IMUL(elementN, vec);
+        //int vectorEnd  = elementN;
+
+        ////////////////////////////////////////////////////////////////////////
+        // Each accumulator cycles through vectors with
+        // stride equal to number of total number of accumulators ACCUM_N
+        // At this stage ACCUM_N is only preferred be a multiple of warp size
+        // to meet memory coalescing alignment constraints.
+        ////////////////////////////////////////////////////////////////////////
+        for (int iAccum = threadIdx.x; iAccum < ACCUM_N; iAccum += blockDim.x)
+        {
+        	GPU_Complex sum = GPU_Complex(0,0);
+
+            for (int pos = iAccum; pos < elementN; pos += ACCUM_N)
+            {
+                //sum = sum + d_sig_in[pos-vectorBase] * d_nco_in[pos-vectorBase] * d_local_codes_in[pos];
+            	//sum = sum + d_sig_in[pos-vectorBase] * d_local_codes_in[pos];
+            	//sum.multiply_acc(d_sig_in[pos],d_local_codes_in[pos+d_shifts_samples[vec]]);
+
+            	// 1.resample local code for the current shift
+            	float local_code_chip_index= fmod(code_phase_step_chips*(float)pos + d_shifts_chips[vec] - rem_code_phase_chips, code_length_chips);
+            	//TODO: Take into account that in multitap correlators, the shifts can be negative!
+            	if (local_code_chip_index<0.0) local_code_chip_index+=code_length_chips;
+
+            	// 2.correlate
+            	sum.multiply_acc(d_sig_in[pos],d_local_code_in[__float2int_rd(local_code_chip_index)]);
+
+            }
+            accumResult[iAccum] = sum;
+        }
+
+        ////////////////////////////////////////////////////////////////////////
+        // Perform tree-like reduction of accumulators' results.
+        // ACCUM_N has to be power of two at this stage
+        ////////////////////////////////////////////////////////////////////////
+        for (int stride = ACCUM_N / 2; stride > 0; stride >>= 1)
+        {
+            __syncthreads();
+
+            for (int iAccum = threadIdx.x; iAccum < stride; iAccum += blockDim.x)
+            {
+                accumResult[iAccum] += accumResult[stride + iAccum];
+            }
+        }
+
+        if (threadIdx.x == 0)
+        	{
+        		d_corr_out[vec] = accumResult[0];
+        	}
+    }
+}
+
+
+///////////////////////////////////////////////////////////////////////////////
+// Calculate scalar products of VectorN vectors of ElementN elements on GPU
+// Parameters restrictions:
+// 1) ElementN is strongly preferred to be a multiple of warp size to
+//    meet alignment constraints of memory coalescing.
+// 2) ACCUM_N must be a power of two.
+///////////////////////////////////////////////////////////////////////////////
+
+
+__global__ void scalarProdGPUCPXxN_shifts(
+    GPU_Complex *d_corr_out,
+    GPU_Complex *d_sig_in,
+    GPU_Complex *d_local_codes_in,
+    int *d_shifts_samples,
+    int vectorN,
+    int elementN
+)
+{
+    //Accumulators cache
+    __shared__ GPU_Complex accumResult[ACCUM_N];
+
+    ////////////////////////////////////////////////////////////////////////////
+    // Cycle through every pair of vectors,
+    // taking into account that vector counts can be different
+    // from total number of thread blocks
+    ////////////////////////////////////////////////////////////////////////////
+    for (int vec = blockIdx.x; vec < vectorN; vec += gridDim.x)
+    {
+        int vectorBase = IMUL(elementN, vec);
+        int vectorEnd  = vectorBase + elementN;
+
+        ////////////////////////////////////////////////////////////////////////
+        // Each accumulator cycles through vectors with
+        // stride equal to number of total number of accumulators ACCUM_N
+        // At this stage ACCUM_N is only preferred be a multiple of warp size
+        // to meet memory coalescing alignment constraints.
+        ////////////////////////////////////////////////////////////////////////
+        for (int iAccum = threadIdx.x; iAccum < ACCUM_N; iAccum += blockDim.x)
+        {
+        	GPU_Complex sum = GPU_Complex(0,0);
+
+            for (int pos = vectorBase + iAccum; pos < vectorEnd; pos += ACCUM_N)
+            {
+                //sum = sum + d_sig_in[pos-vectorBase] * d_nco_in[pos-vectorBase] * d_local_codes_in[pos];
+            	//sum = sum + d_sig_in[pos-vectorBase] * d_local_codes_in[pos];
+            	sum.multiply_acc(d_sig_in[pos-vectorBase],d_local_codes_in[pos-vectorBase+d_shifts_samples[vec]]);
+            }
+            accumResult[iAccum] = sum;
+        }
+
+        ////////////////////////////////////////////////////////////////////////
+        // Perform tree-like reduction of accumulators' results.
+        // ACCUM_N has to be power of two at this stage
+        ////////////////////////////////////////////////////////////////////////
+        for (int stride = ACCUM_N / 2; stride > 0; stride >>= 1)
+        {
+            __syncthreads();
+
+            for (int iAccum = threadIdx.x; iAccum < stride; iAccum += blockDim.x)
+            {
+                accumResult[iAccum] += accumResult[stride + iAccum];
+            }
+        }
+
+        if (threadIdx.x == 0)
+        	{
+        		d_corr_out[vec] = accumResult[0];
+        	}
+    }
+}
+
+
+__global__ void scalarProdGPUCPXxN(
+    GPU_Complex *d_corr_out,
+    GPU_Complex *d_sig_in,
+    GPU_Complex *d_local_codes_in,
+    int vectorN,
+    int elementN
+)
+{
+    //Accumulators cache
+    __shared__ GPU_Complex accumResult[ACCUM_N];
+
+    ////////////////////////////////////////////////////////////////////////////
+    // Cycle through every pair of vectors,
+    // taking into account that vector counts can be different
+    // from total number of thread blocks
+    ////////////////////////////////////////////////////////////////////////////
+    for (int vec = blockIdx.x; vec < vectorN; vec += gridDim.x)
+    {
+        //int vectorBase = IMUL(elementN, vec);
+        //int vectorEnd  = vectorBase + elementN;
+
+
+        ////////////////////////////////////////////////////////////////////////
+        // Each accumulator cycles through vectors with
+        // stride equal to number of total number of accumulators ACCUM_N
+        // At this stage ACCUM_N is only preferred be a multiple of warp size
+        // to meet memory coalescing alignment constraints.
+        ////////////////////////////////////////////////////////////////////////
+        for (int iAccum = threadIdx.x; iAccum < ACCUM_N; iAccum += blockDim.x)
+        {
+        	GPU_Complex sum = GPU_Complex(0,0);
+
+            //for (int pos = vectorBase + iAccum; pos < vectorEnd; pos += ACCUM_N)
+        	for (int pos = iAccum; pos < elementN; pos += ACCUM_N)
+            {
+                //sum = sum + d_sig_in[pos-vectorBase] * d_nco_in[pos-vectorBase] * d_local_codes_in[pos];
+            	//sum = sum + d_sig_in[pos-vectorBase] * d_local_codes_in[pos];
+            	//sum.multiply_acc(d_sig_in[pos-vectorBase],d_local_codes_in[pos]);
+        		sum.multiply_acc(d_sig_in[pos],d_local_codes_in[pos]);
+            }
+            accumResult[iAccum] = sum;
+        }
+
+        ////////////////////////////////////////////////////////////////////////
+        // Perform tree-like reduction of accumulators' results.
+        // ACCUM_N has to be power of two at this stage
+        ////////////////////////////////////////////////////////////////////////
+        for (int stride = ACCUM_N / 2; stride > 0; stride >>= 1)
+        {
+            __syncthreads();
+
+            for (int iAccum = threadIdx.x; iAccum < stride; iAccum += blockDim.x)
+            {
+                accumResult[iAccum] += accumResult[stride + iAccum];
+            }
+        }
+
+        if (threadIdx.x == 0)
+        	{
+        		d_corr_out[vec] = accumResult[0];
+        	}
+    }
+}
+
+
+//*********** CUDA processing **************
+// Treads: a minimal parallel execution code on GPU
+// Blocks: a set of N threads
+/**
+ * CUDA Kernel Device code
+ *
+ * Computes the vectorial product of A and B into C. The 3 vectors have the same
+ * number of elements numElements.
+ */
+__global__ void CUDA_32fc_x2_multiply_32fc(  GPU_Complex *A,   GPU_Complex  *B, GPU_Complex  *C, int numElements)
+{
+    for (int i = blockIdx.x * blockDim.x + threadIdx.x;
+         i < numElements;
+         i += blockDim.x * gridDim.x)
+    {
+        C[i] =  A[i] * B[i];
+    }
+}
+
+
+/**
+ * CUDA Kernel Device code
+ *
+ * Computes the carrier Doppler wipe-off by integrating the NCO in the CUDA kernel
+ */
+__global__ void
+CUDA_32fc_Doppler_wipeoff(  GPU_Complex *sig_out, GPU_Complex *sig_in, float rem_carrier_phase_in_rad, float phase_step_rad, int numElements)
+{
+	//*** NCO CPU code (GNURadio FXP NCO)
+	//float sin_f, cos_f;
+	//float phase_step_rad = static_cast<float>(2 * GALILEO_PI) * d_carrier_doppler_hz / static_cast<float>(d_fs_in);
+	//int phase_step_rad_i = gr::fxpt::float_to_fixed(phase_step_rad);
+	//int phase_rad_i = gr::fxpt::float_to_fixed(d_rem_carr_phase_rad);
+	//
+	//for(int i = 0; i < d_current_prn_length_samples; i++)
+	//    {
+	//        gr::fxpt::sincos(phase_rad_i, &sin_f, &cos_f);
+	//        d_carr_sign[i] = std::complex<float>(cos_f, -sin_f);
+	//        phase_rad_i += phase_step_rad_i;
+	//    }
+
+	// CUDA version of floating point NCO and vector dot product integrated
+
+    float sin;
+    float cos;
+    for (int i = blockIdx.x * blockDim.x + threadIdx.x;
+         i < numElements;
+         i += blockDim.x * gridDim.x)
+    {
+    	__sincosf(rem_carrier_phase_in_rad + i*phase_step_rad, &sin, &cos);
+    	sig_out[i] =  sig_in[i] * GPU_Complex(cos,-sin);
+    }
+}
+
+
+/**
+ * CUDA Kernel Device code
+ *
+ * Computes the vectorial product of A and B into C. The 3 vectors have the same
+ * number of elements numElements.
+ */
+__global__ void
+CUDA_32fc_x2_add_32fc(  GPU_Complex *A,   GPU_Complex  *B, GPU_Complex  *C, int numElements)
+{
+    for (int i = blockIdx.x * blockDim.x + threadIdx.x;
+         i < numElements;
+         i += blockDim.x * gridDim.x)
+    {
+        C[i] =  A[i] + B[i];
+    }
+}
+
+
+bool cuda_multicorrelator::init_cuda(const int argc, const char **argv, int signal_length_samples, int local_codes_length_samples, int n_correlators)
+{
+	// use command-line specified CUDA device, otherwise use device with highest Gflops/s
+//	findCudaDevice(argc, (const char **)argv);
+//      cudaDeviceProp  prop;
+//    int num_devices, device;
+//    cudaGetDeviceCount(&num_devices);
+//    if (num_devices > 1) {
+//          int max_multiprocessors = 0, max_device = 0;
+//          for (device = 0; device < num_devices; device++) {
+//                  cudaDeviceProp properties;
+//                  cudaGetDeviceProperties(&properties, device);
+//                  if (max_multiprocessors < properties.multiProcessorCount) {
+//                          max_multiprocessors = properties.multiProcessorCount;
+//                          max_device = device;
+//                  }
+//                  printf("Found GPU device # %i\n",device);
+//          }
+//          //cudaSetDevice(max_device);
+//
+//          //set random device!
+//          cudaSetDevice(rand() % num_devices); //generates a random number between 0 and num_devices to split the threads between GPUs
+//
+//          cudaGetDeviceProperties( &prop, max_device );
+//          //debug code
+//          if (prop.canMapHostMemory != 1) {
+//              printf( "Device can not map memory.\n" );
+//          }
+//          printf("L2 Cache size= %u \n",prop.l2CacheSize);
+//          printf("maxThreadsPerBlock= %u \n",prop.maxThreadsPerBlock);
+//          printf("maxGridSize= %i \n",prop.maxGridSize[0]);
+//          printf("sharedMemPerBlock= %lu \n",prop.sharedMemPerBlock);
+//          printf("deviceOverlap= %i \n",prop.deviceOverlap);
+//  	    printf("multiProcessorCount= %i \n",prop.multiProcessorCount);
+//    }else{
+//    	    int whichDevice;
+//    	    cudaGetDevice( &whichDevice );
+//    	    cudaGetDeviceProperties( &prop, whichDevice );
+//    	    //debug code
+//    	    if (prop.canMapHostMemory != 1) {
+//    	        printf( "Device can not map memory.\n" );
+//    	    }
+//
+//    	    printf("L2 Cache size= %u \n",prop.l2CacheSize);
+//    	    printf("maxThreadsPerBlock= %u \n",prop.maxThreadsPerBlock);
+//    	    printf("maxGridSize= %i \n",prop.maxGridSize[0]);
+//    	    printf("sharedMemPerBlock= %lu \n",prop.sharedMemPerBlock);
+//    	    printf("deviceOverlap= %i \n",prop.deviceOverlap);
+//    	    printf("multiProcessorCount= %i \n",prop.multiProcessorCount);
+//    }
+
+	//checkCudaErrors(cudaFuncSetCacheConfig(CUDA_32fc_x2_multiply_x2_dot_prod_32fc_, cudaFuncCachePreferShared));
+
+
+    // ALLOCATE GPU MEMORY FOR INPUT/OUTPUT and INTERNAL vectors
+
+    size_t size = signal_length_samples * sizeof(GPU_Complex);
+
+	checkCudaErrors(cudaMalloc((void **)&d_sig_in, size));
+	//checkCudaErrors(cudaMalloc((void **)&d_nco_in, size));
+	checkCudaErrors(cudaMalloc((void **)&d_sig_doppler_wiped, size));
+
+	// old version: all local codes are independent vectors
+	//checkCudaErrors(cudaMalloc((void **)&d_local_codes_in, size*n_correlators));
+
+	// new version: only one vector with extra samples to shift the local code for the correlator set
+	// Required: The last correlator tap in d_shifts_samples has the largest sample shift
+    size_t size_local_code_bytes = local_codes_length_samples * sizeof(GPU_Complex);
+	checkCudaErrors(cudaMalloc((void **)&d_local_codes_in, size_local_code_bytes));
+	checkCudaErrors(cudaMalloc((void **)&d_shifts_samples, sizeof(int)*n_correlators));
+
+	//scalars
+	checkCudaErrors(cudaMalloc((void **)&d_corr_out, sizeof(std::complex<float>)*n_correlators));
+
+    // Launch the Vector Add CUDA Kernel
+	threadsPerBlock = 256;
+    blocksPerGrid =(int)(signal_length_samples+threadsPerBlock-1)/threadsPerBlock;
+
+	cudaStreamCreate (&stream1) ;
+	cudaStreamCreate (&stream2) ;
+	return true;
+}
+
+
+bool cuda_multicorrelator::init_cuda_integrated_resampler(
+		const int argc, const char **argv,
+		int signal_length_samples,
+		int code_length_chips,
+		int n_correlators
+		)
+{
+	// use command-line specified CUDA device, otherwise use device with highest Gflops/s
+//	findCudaDevice(argc, (const char **)argv);
+//      cudaDeviceProp  prop;
+//    int num_devices, device;
+//    cudaGetDeviceCount(&num_devices);
+//    if (num_devices > 1) {
+//          int max_multiprocessors = 0, max_device = 0;
+//          for (device = 0; device < num_devices; device++) {
+//                  cudaDeviceProp properties;
+//                  cudaGetDeviceProperties(&properties, device);
+//                  if (max_multiprocessors < properties.multiProcessorCount) {
+//                          max_multiprocessors = properties.multiProcessorCount;
+//                          max_device = device;
+//                  }
+//                  printf("Found GPU device # %i\n",device);
+//          }
+//          //cudaSetDevice(max_device);
+//
+//          //set random device!
+//          cudaSetDevice(rand() % num_devices); //generates a random number between 0 and num_devices to split the threads between GPUs
+//
+//          cudaGetDeviceProperties( &prop, max_device );
+//          //debug code
+//          if (prop.canMapHostMemory != 1) {
+//              printf( "Device can not map memory.\n" );
+//          }
+//          printf("L2 Cache size= %u \n",prop.l2CacheSize);
+//          printf("maxThreadsPerBlock= %u \n",prop.maxThreadsPerBlock);
+//          printf("maxGridSize= %i \n",prop.maxGridSize[0]);
+//          printf("sharedMemPerBlock= %lu \n",prop.sharedMemPerBlock);
+//          printf("deviceOverlap= %i \n",prop.deviceOverlap);
+//  	    printf("multiProcessorCount= %i \n",prop.multiProcessorCount);
+//    }else{
+//    	    int whichDevice;
+//    	    cudaGetDevice( &whichDevice );
+//    	    cudaGetDeviceProperties( &prop, whichDevice );
+//    	    //debug code
+//    	    if (prop.canMapHostMemory != 1) {
+//    	        printf( "Device can not map memory.\n" );
+//    	    }
+//
+//    	    printf("L2 Cache size= %u \n",prop.l2CacheSize);
+//    	    printf("maxThreadsPerBlock= %u \n",prop.maxThreadsPerBlock);
+//    	    printf("maxGridSize= %i \n",prop.maxGridSize[0]);
+//    	    printf("sharedMemPerBlock= %lu \n",prop.sharedMemPerBlock);
+//    	    printf("deviceOverlap= %i \n",prop.deviceOverlap);
+//    	    printf("multiProcessorCount= %i \n",prop.multiProcessorCount);
+//    }
+
+	//checkCudaErrors(cudaFuncSetCacheConfig(CUDA_32fc_x2_multiply_x2_dot_prod_32fc_, cudaFuncCachePreferShared));
+
+    // ALLOCATE GPU MEMORY FOR INPUT/OUTPUT and INTERNAL vectors
+
+    size_t size = signal_length_samples * sizeof(GPU_Complex);
+
+	checkCudaErrors(cudaMalloc((void **)&d_sig_in, size));
+	checkCudaErrors(cudaMemset(d_sig_in,0,size));
+
+	//checkCudaErrors(cudaMalloc((void **)&d_nco_in, size));
+	checkCudaErrors(cudaMalloc((void **)&d_sig_doppler_wiped, size));
+	checkCudaErrors(cudaMemset(d_sig_doppler_wiped,0,size));
+
+	checkCudaErrors(cudaMalloc((void **)&d_local_codes_in, sizeof(std::complex<float>)*code_length_chips));
+	checkCudaErrors(cudaMemset(d_local_codes_in,0,sizeof(std::complex<float>)*code_length_chips));
+
+    d_code_length_chips=code_length_chips;
+
+	checkCudaErrors(cudaMalloc((void **)&d_shifts_chips, sizeof(float)*n_correlators));
+	checkCudaErrors(cudaMemset(d_shifts_chips,0,sizeof(float)*n_correlators));
+
+	//scalars
+	checkCudaErrors(cudaMalloc((void **)&d_corr_out, sizeof(std::complex<float>)*n_correlators));
+	checkCudaErrors(cudaMemset(d_corr_out,0,sizeof(std::complex<float>)*n_correlators));
+
+    // Launch the Vector Add CUDA Kernel
+	threadsPerBlock = 256;
+    blocksPerGrid =(int)(signal_length_samples+threadsPerBlock-1)/threadsPerBlock;
+
+	cudaStreamCreate (&stream1) ;
+	cudaStreamCreate (&stream2) ;
+	return true;
+}
+
+bool cuda_multicorrelator::set_local_code_and_taps(
+		int code_length_chips,
+		const std::complex<float>* local_codes_in,
+		float *shifts_chips,
+		int n_correlators
+		)
+{
+    // local code CPU -> GPU copy memory
+    checkCudaErrors(cudaMemcpyAsync(d_local_codes_in, local_codes_in, sizeof(GPU_Complex)*code_length_chips, cudaMemcpyHostToDevice,stream1));
+    d_code_length_chips=(float)code_length_chips;
+
+    // Correlator shifts vector CPU -> GPU copy memory (fractional chip shifts are allowed!)
+    checkCudaErrors(cudaMemcpyAsync(d_shifts_chips, shifts_chips, sizeof(float)*n_correlators,
+                                    cudaMemcpyHostToDevice,stream1));
+
+	return true;
+}
+
+
+
+bool cuda_multicorrelator::Carrier_wipeoff_multicorrelator_cuda(
+		std::complex<float>* corr_out,
+		const std::complex<float>* sig_in,
+		const std::complex<float>* local_codes_in,
+		float rem_carrier_phase_in_rad,
+		float phase_step_rad,
+		const int *shifts_samples,
+		int signal_length_samples,
+		int n_correlators)
+	{
+
+	size_t memSize = signal_length_samples * sizeof(std::complex<float>);
+
+	// input signal CPU -> GPU copy memory
+
+    checkCudaErrors(cudaMemcpyAsync(d_sig_in, sig_in, memSize,
+                                    cudaMemcpyHostToDevice, stream1));
+
+    //***** NOTICE: NCO is computed on-the-fly, not need to copy NCO into GPU! ****
+    //checkCudaErrors(cudaMemcpyAsync(d_nco_in, nco_in, memSize,
+    //                                cudaMemcpyHostToDevice, stream1));
+
+
+	// old version: all local codes are independent vectors
+    //checkCudaErrors(cudaMemcpyAsync(d_local_codes_in, local_codes_in, memSize*n_correlators,
+    //                                cudaMemcpyHostToDevice, stream2));
+
+	// new version: only one vector with extra samples to shift the local code for the correlator set
+	// Required: The last correlator tap in d_shifts_samples has the largest sample shift
+
+    // local code CPU -> GPU copy memory
+    checkCudaErrors(cudaMemcpyAsync(d_local_codes_in, local_codes_in, memSize+sizeof(std::complex<float>)*shifts_samples[n_correlators-1],
+                                    cudaMemcpyHostToDevice, stream2));
+    // Correlator shifts vector CPU -> GPU copy memory
+    checkCudaErrors(cudaMemcpyAsync(d_shifts_samples, shifts_samples, sizeof(int)*n_correlators,
+                                    cudaMemcpyHostToDevice, stream2));
+
+
+    //Launch carrier wipe-off kernel here, while local codes are being copied to GPU!
+    checkCudaErrors(cudaStreamSynchronize(stream1));
+    CUDA_32fc_Doppler_wipeoff<<<blocksPerGrid, threadsPerBlock,0, stream1>>>(d_sig_doppler_wiped, d_sig_in,rem_carrier_phase_in_rad,phase_step_rad, signal_length_samples);
+
+
+    //printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
+
+    //wait for Doppler wipeoff end...
+    checkCudaErrors(cudaStreamSynchronize(stream1));
+    checkCudaErrors(cudaStreamSynchronize(stream2));
+    //checkCudaErrors(cudaDeviceSynchronize());
+
+    //old
+//    scalarProdGPUCPXxN<<<blocksPerGrid, threadsPerBlock,0 ,stream2>>>(
+//    		d_corr_out,
+//    		d_sig_doppler_wiped,
+//    		d_local_codes_in,
+//            3,
+//            signal_length_samples
+//        );
+
+    //new
+    //launch the multitap correlator
+    scalarProdGPUCPXxN_shifts<<<blocksPerGrid, threadsPerBlock,0 ,stream2>>>(
+			d_corr_out,
+			d_sig_doppler_wiped,
+			d_local_codes_in,
+			d_shifts_samples,
+			n_correlators,
+			signal_length_samples
+		);
+    checkCudaErrors(cudaGetLastError());
+    //wait for correlators end...
+    checkCudaErrors(cudaStreamSynchronize(stream2));
+    // Copy the device result vector in device memory to the host result vector
+    // in host memory.
+
+    //scalar products (correlators outputs)
+    checkCudaErrors(cudaMemcpy(corr_out, d_corr_out, sizeof(std::complex<float>)*n_correlators,
+            cudaMemcpyDeviceToHost));
+    return true;
+}
+
+bool cuda_multicorrelator::Carrier_wipeoff_multicorrelator_resampler_cuda(
+		std::complex<float>* corr_out,
+		const std::complex<float>* sig_in,
+		float rem_carrier_phase_in_rad,
+		float phase_step_rad,
+        float code_phase_step_chips,
+        float rem_code_phase_chips,
+		int signal_length_samples,
+		int n_correlators)
+	{
+
+	size_t memSize = signal_length_samples * sizeof(std::complex<float>);
+	// input signal CPU -> GPU copy memory
+    checkCudaErrors(cudaMemcpyAsync(d_sig_in, sig_in, memSize,
+                                    cudaMemcpyHostToDevice, stream2));
+
+    //***** NOTICE: NCO is computed on-the-fly, not need to copy NCO into GPU! ****
+
+    //Launch carrier wipe-off kernel here, while local codes are being copied to GPU!
+    checkCudaErrors(cudaStreamSynchronize(stream2));
+
+    CUDA_32fc_Doppler_wipeoff<<<blocksPerGrid, threadsPerBlock,0, stream2>>>(d_sig_doppler_wiped, d_sig_in,rem_carrier_phase_in_rad,phase_step_rad, signal_length_samples);
+
+    //wait for Doppler wipeoff end...
+    checkCudaErrors(cudaStreamSynchronize(stream1));
+    checkCudaErrors(cudaStreamSynchronize(stream2));
+
+    //launch the multitap correlator with integrated local code resampler!
+
+    scalarProdGPUCPXxN_shifts_chips<<<blocksPerGrid, threadsPerBlock,0 ,stream1>>>(
+			d_corr_out,
+			d_sig_doppler_wiped,
+			d_local_codes_in,
+			d_shifts_chips,
+			d_code_length_chips,
+	        code_phase_step_chips,
+	        rem_code_phase_chips,
+			n_correlators,
+			signal_length_samples
+		);
+
+    checkCudaErrors(cudaGetLastError());
+    //wait for correlators end...
+    checkCudaErrors(cudaStreamSynchronize(stream1));
+    // Copy the device result vector in device memory to the host result vector
+    // in host memory.
+
+    //scalar products (correlators outputs)
+    checkCudaErrors(cudaMemcpyAsync(corr_out, d_corr_out, sizeof(std::complex<float>)*n_correlators,
+            cudaMemcpyDeviceToHost,stream1));
+    checkCudaErrors(cudaStreamSynchronize(stream1));
+    return true;
+}
+
+
+cuda_multicorrelator::cuda_multicorrelator()
+{
+	d_sig_in=NULL;
+	d_nco_in=NULL;
+	d_sig_doppler_wiped=NULL;
+	d_local_codes_in=NULL;
+	d_shifts_samples=NULL;
+	d_shifts_chips=NULL;
+	d_corr_out=NULL;
+	threadsPerBlock=0;
+	blocksPerGrid=0;
+	d_code_length_chips=0;
+}
+
+bool cuda_multicorrelator::free_cuda()
+{
+	// Free device global memory
+	if (d_sig_in!=NULL) cudaFree(d_sig_in);
+	if (d_nco_in!=NULL) cudaFree(d_nco_in);
+	if (d_sig_doppler_wiped!=NULL) cudaFree(d_sig_doppler_wiped);
+	if (d_local_codes_in!=NULL) cudaFree(d_local_codes_in);
+	if (d_corr_out!=NULL) cudaFree(d_corr_out);
+
+
+	if (d_shifts_samples!=NULL) cudaFree(d_shifts_samples);
+	if (d_shifts_chips!=NULL) cudaFree(d_shifts_chips);
+
+
+	cudaStreamDestroy(stream1) ;
+	cudaStreamDestroy(stream2) ;
+
+    // Reset the device and exit
+    // cudaDeviceReset causes the driver to clean up all state. While
+    // not mandatory in normal operation, it is good practice.  It is also
+    // needed to ensure correct operation when the application is being
+    // profiled. Calling cudaDeviceReset causes all profile data to be
+    // flushed before the application exits
+	//checkCudaErrors(cudaDeviceReset());
+	return true;
+}
+
--- a/src/algorithms/tracking/libs/cuda_multicorrelator.h
+++ b/src/algorithms/tracking/libs/cuda_multicorrelator.h
@@ -0,0 +1,171 @@
+/*!
+ * \file cuda_multicorrelator.h
+ * \brief High optimized CUDA GPU vector multiTAP correlator class
+ * \authors <ul>
+ *          <li> Javier Arribas, 2015. jarribas(at)cttc.es
+ *          </ul>
+ *
+ * Class that implements a high optimized vector multiTAP correlator class for NVIDIA CUDA GPUs
+ *
+ * -------------------------------------------------------------------------
+ *
+ * Copyright (C) 2010-2015  (see AUTHORS file for a list of contributors)
+ *
+ * GNSS-SDR is a software defined Global Navigation
+ *          Satellite Systems receiver
+ *
+ * This file is part of GNSS-SDR.
+ *
+ * GNSS-SDR is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * GNSS-SDR is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * -------------------------------------------------------------------------
+ */
+
+#ifndef CUDA_MULTICORRELATOR_H_
+#define CUDA_MULTICORRELATOR_H_
+
+
+#ifdef __CUDACC__
+#define CUDA_CALLABLE_MEMBER_GLOBAL __global__
+#define CUDA_CALLABLE_MEMBER_DEVICE __device__
+#else
+#define CUDA_CALLABLE_MEMBER_GLOBAL
+#define CUDA_CALLABLE_MEMBER_DEVICE
+#endif
+
+#include <complex>
+
+#include <cuda.h>
+// CUDA runtime
+#include <cuda_runtime.h>
+
+// GPU new internal data types for complex numbers
+
+struct GPU_Complex {
+	 float r;
+	 float i;
+	 CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex() {};
+	 CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex( float a, float b ) : r(a), i(b) {}
+	 CUDA_CALLABLE_MEMBER_DEVICE float magnitude2( void ) {
+		 return r * r + i * i;
+	 }
+	 CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex operator*(const GPU_Complex& a) {
+		#ifdef __CUDACC__
+		 return GPU_Complex(__fmul_rn(r,a.r) - __fmul_rn(i,a.i), __fmul_rn(i,a.r) + __fmul_rn(r,a.i));
+		#else
+		 return GPU_Complex(r*a.r - i*a.i, i*a.r + r*a.i);
+		#endif
+	 }
+	 CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex operator+(const GPU_Complex& a) {
+		 return GPU_Complex(r+a.r, i+a.i);
+	 }
+	 CUDA_CALLABLE_MEMBER_DEVICE void operator+=(const GPU_Complex& a) {
+		 r+=a.r;
+		 i+=a.i;
+	 }
+	 CUDA_CALLABLE_MEMBER_DEVICE void multiply_acc(const GPU_Complex& a, const GPU_Complex& b)
+	 {
+		 //c=a*b+c
+		 //real part
+		 //c.r=(a.r*b.r - a.i*b.i)+c.r
+		#ifdef __CUDACC__
+			 r=__fmaf_rn(a.r,b.r,r);
+			 r=__fmaf_rn(-a.i,b.i,r);
+			 //imag part
+			 i=__fmaf_rn(a.i,b.r,i);
+			 i=__fmaf_rn(a.r,b.i,i);
+		#else
+			 r=(a.r*b.r - a.i*b.i)+r;
+			 i=(a.i*b.r - a.r*b.i)+i;
+		#endif
+
+	 }
+};
+
+struct GPU_Complex_Short {
+	 float r;
+	 float i;
+	 CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex_Short( short int a, short int b ) : r(a), i(b) {}
+	 CUDA_CALLABLE_MEMBER_DEVICE float magnitude2( void ) {
+		 return r * r + i * i;
+	 }
+	 CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex_Short operator*(const GPU_Complex_Short& a) {
+		 return GPU_Complex_Short(r*a.r - i*a.i, i*a.r + r*a.i);
+	 }
+	 CUDA_CALLABLE_MEMBER_DEVICE GPU_Complex_Short operator+(const GPU_Complex_Short& a) {
+		 return GPU_Complex_Short(r+a.r, i+a.i);
+	 }
+};
+/*!
+ * \brief Class that implements carrier wipe-off and correlators using NVIDIA CUDA GPU accelerators.
+ */
+class cuda_multicorrelator
+{
+public:
+	cuda_multicorrelator();
+	bool init_cuda(const int argc, const char **argv, int signal_length_samples, int local_codes_length_samples, int n_correlators);
+	bool init_cuda_integrated_resampler(
+			const int argc, const char **argv,
+			int signal_length_samples,
+			int code_length_chips,
+			int n_correlators
+			);
+	bool set_local_code_and_taps(
+			int code_length_chips,
+			const std::complex<float>* local_codes_in,
+			float *shifts_chips,
+			int n_correlators
+			);
+	bool free_cuda();
+	bool Carrier_wipeoff_multicorrelator_cuda(
+			std::complex<float>* corr_out,
+			const std::complex<float>* sig_in,
+			const std::complex<float>* local_codes_in,
+			float rem_carrier_phase_in_rad,
+			float phase_step_rad,
+			const int *shifts_samples,
+			int signal_length_samples,
+			int n_correlators);
+	bool Carrier_wipeoff_multicorrelator_resampler_cuda(
+			std::complex<float>* corr_out,
+			const std::complex<float>* sig_in,
+			float rem_carrier_phase_in_rad,
+			float phase_step_rad,
+	        float code_phase_step_chips,
+	        float rem_code_phase_chips,
+			int signal_length_samples,
+			int n_correlators);
+private:
+	// Allocate the device input vectors
+	GPU_Complex *d_sig_in;
+	GPU_Complex *d_nco_in;
+	GPU_Complex *d_sig_doppler_wiped;
+	GPU_Complex *d_local_codes_in;
+	GPU_Complex *d_corr_out;
+	int *d_shifts_samples;
+	float *d_shifts_chips;
+	float d_code_length_chips;
+
+	int threadsPerBlock;
+	int blocksPerGrid;
+
+	cudaStream_t stream1;
+	cudaStream_t stream2;
+	int num_gpu_devices;
+	int selected_device;
+
+};
+
+
+#endif /* CUDA_MULTICORRELATOR_H_ */
--- a/src/algorithms/tracking/libs/cudahelpers/exception.h
+++ b/src/algorithms/tracking/libs/cudahelpers/exception.h
@@ -0,0 +1,151 @@
+/*
+* Copyright 1993-2013 NVIDIA Corporation.  All rights reserved.
+*
+* Please refer to the NVIDIA end user license agreement (EULA) associated
+* with this source code for terms and conditions that govern your use of
+* this software. Any use, reproduction, disclosure, or distribution of
+* this software and related documentation outside the terms of the EULA
+* is strictly prohibited.
+*
+*/
+
+/* CUda UTility Library */
+#ifndef _EXCEPTION_H_
+#define _EXCEPTION_H_
+
+// includes, system
+#include <exception>
+#include <stdexcept>
+#include <iostream>
+#include <stdlib.h>
+
+//! Exception wrapper.
+//! @param Std_Exception Exception out of namespace std for easy typing.
+template<class Std_Exception>
+class Exception : public Std_Exception
+{
+    public:
+
+        //! @brief Static construction interface
+        //! @return Alwayss throws ( Located_Exception<Exception>)
+        //! @param file file in which the Exception occurs
+        //! @param line line in which the Exception occurs
+        //! @param detailed details on the code fragment causing the Exception
+        static void throw_it(const char *file,
+                             const int line,
+                             const char *detailed = "-");
+
+        //! Static construction interface
+        //! @return Alwayss throws ( Located_Exception<Exception>)
+        //! @param file file in which the Exception occurs
+        //! @param line line in which the Exception occurs
+        //! @param detailed details on the code fragment causing the Exception
+        static void throw_it(const char *file,
+                             const int line,
+                             const std::string &detailed);
+
+        //! Destructor
+        virtual ~Exception() throw();
+
+    private:
+
+        //! Constructor, default (private)
+        Exception();
+
+        //! Constructor, standard
+        //! @param str string returned by what()
+        Exception(const std::string &str);
+
+};
+
+////////////////////////////////////////////////////////////////////////////////
+//! Exception handler function for arbitrary exceptions
+//! @param ex exception to handle
+////////////////////////////////////////////////////////////////////////////////
+template<class Exception_Typ>
+inline void
+handleException(const Exception_Typ &ex)
+{
+    std::cerr << ex.what() << std::endl;
+
+    exit(EXIT_FAILURE);
+}
+
+//! Convenience macros
+
+//! Exception caused by dynamic program behavior, e.g. file does not exist
+#define RUNTIME_EXCEPTION( msg) \
+    Exception<std::runtime_error>::throw_it( __FILE__, __LINE__, msg)
+
+//! Logic exception in program, e.g. an assert failed
+#define LOGIC_EXCEPTION( msg) \
+    Exception<std::logic_error>::throw_it( __FILE__, __LINE__, msg)
+
+//! Out of range exception
+#define RANGE_EXCEPTION( msg) \
+    Exception<std::range_error>::throw_it( __FILE__, __LINE__, msg)
+
+////////////////////////////////////////////////////////////////////////////////
+//! Implementation
+
+// includes, system
+#include <sstream>
+
+////////////////////////////////////////////////////////////////////////////////
+//! Static construction interface.
+//! @param  Exception causing code fragment (file and line) and detailed infos.
+////////////////////////////////////////////////////////////////////////////////
+/*static*/ template<class Std_Exception>
+void
+Exception<Std_Exception>::
+throw_it(const char *file, const int line, const char *detailed)
+{
+    std::stringstream s;
+
+    // Quiet heavy-weight but exceptions are not for
+    // performance / release versions
+    s << "Exception in file '" << file << "' in line " << line << "\n"
+      << "Detailed description: " << detailed << "\n";
+
+    throw Exception(s.str());
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Static construction interface.
+//! @param  Exception causing code fragment (file and line) and detailed infos.
+////////////////////////////////////////////////////////////////////////////////
+/*static*/ template<class Std_Exception>
+void
+Exception<Std_Exception>::
+throw_it(const char *file, const int line, const std::string &msg)
+{
+    throw_it(file, line, msg.c_str());
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Constructor, default (private).
+////////////////////////////////////////////////////////////////////////////////
+template<class Std_Exception>
+Exception<Std_Exception>::Exception() :
+    Std_Exception("Unknown Exception.\n")
+{ }
+
+////////////////////////////////////////////////////////////////////////////////
+//! Constructor, standard (private).
+//! String returned by what().
+////////////////////////////////////////////////////////////////////////////////
+template<class Std_Exception>
+Exception<Std_Exception>::Exception(const std::string &s) :
+    Std_Exception(s)
+{ }
+
+////////////////////////////////////////////////////////////////////////////////
+//! Destructor
+////////////////////////////////////////////////////////////////////////////////
+template<class Std_Exception>
+Exception<Std_Exception>::~Exception() throw() { }
+
+// functions, exported
+
+#endif // #ifndef _EXCEPTION_H_
+
--- a/src/algorithms/tracking/libs/cudahelpers/helper_cuda.h
+++ b/src/algorithms/tracking/libs/cudahelpers/helper_cuda.h
--- a/src/algorithms/tracking/libs/cudahelpers/helper_cuda_drvapi.h
+++ b/src/algorithms/tracking/libs/cudahelpers/helper_cuda_drvapi.h
@@ -0,0 +1,517 @@
+/**
+ * Copyright 1993-2013 NVIDIA Corporation.  All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+
+// Helper functions for CUDA Driver API error handling (make sure that CUDA_H is included in your projects)
+#ifndef HELPER_CUDA_DRVAPI_H
+#define HELPER_CUDA_DRVAPI_H
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <helper_string.h>
+#include <drvapi_error_string.h>
+
+#ifndef MAX
+#define MAX(a,b) (a > b ? a : b)
+#endif
+
+#ifndef HELPER_CUDA_H
+inline int ftoi(float value)
+{
+    return (value >= 0 ? (int)(value + 0.5) : (int)(value - 0.5));
+}
+#endif
+
+#ifndef EXIT_WAIVED
+#define EXIT_WAIVED 2
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// These are CUDA Helper functions
+
+// add a level of protection to the CUDA SDK samples, let's force samples to explicitly include CUDA.H
+#ifdef  __cuda_cuda_h__
+// This will output the proper CUDA error strings in the event that a CUDA host call returns an error
+#ifndef checkCudaErrors
+#define checkCudaErrors(err)  __checkCudaErrors (err, __FILE__, __LINE__)
+
+// These are the inline versions for all of the SDK helper functions
+inline void __checkCudaErrors(CUresult err, const char *file, const int line)
+{
+    if (CUDA_SUCCESS != err)
+    {
+        fprintf(stderr, "checkCudaErrors() Driver API error = %04d \"%s\" from file <%s>, line %i.\n",
+                err, getCudaDrvErrorString(err), file, line);
+        exit(EXIT_FAILURE);
+    }
+}
+#endif
+
+#ifdef getLastCudaDrvErrorMsg
+#undef getLastCudaDrvErrorMsg
+#endif
+
+#define getLastCudaDrvErrorMsg(msg)           __getLastCudaDrvErrorMsg  (msg, __FILE__, __LINE__)
+
+inline void __getLastCudaDrvErrorMsg(const char *msg, const char *file, const int line)
+{
+    CUresult err = cuCtxSynchronize();
+
+    if (CUDA_SUCCESS != err)
+    {
+        fprintf(stderr, "getLastCudaDrvErrorMsg -> %s", msg);
+        fprintf(stderr, "getLastCudaDrvErrorMsg -> cuCtxSynchronize API error = %04d \"%s\" in file <%s>, line %i.\n",
+                err, getCudaDrvErrorString(err), file, line);
+        exit(EXIT_FAILURE);
+    }
+}
+
+// This function wraps the CUDA Driver API into a template function
+template <class T>
+inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute, int device)
+{
+    CUresult error_result = cuDeviceGetAttribute(attribute, device_attribute, device);
+
+    if (error_result != CUDA_SUCCESS)
+    {
+        printf("cuDeviceGetAttribute returned %d\n-> %s\n", (int)error_result, getCudaDrvErrorString(error_result));
+        exit(EXIT_SUCCESS);
+    }
+}
+#endif
+
+// Beginning of GPU Architecture definitions
+inline int _ConvertSMVer2CoresDRV(int major, int minor)
+{
+    // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM
+    typedef struct
+    {
+        int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM minor version
+        int Cores;
+    } sSMtoCores;
+
+    sSMtoCores nGpuArchCoresPerSM[] =
+    {
+        { 0x20, 32 }, // Fermi Generation (SM 2.0) GF100 class
+        { 0x21, 48 }, // Fermi Generation (SM 2.1) GF10x class
+        { 0x30, 192}, // Kepler Generation (SM 3.0) GK10x class
+        { 0x32, 192}, // Kepler Generation (SM 3.2) GK10x class
+        { 0x35, 192}, // Kepler Generation (SM 3.5) GK11x class
+        { 0x37, 192}, // Kepler Generation (SM 3.7) GK21x class
+        { 0x50, 128}, // Maxwell Generation (SM 5.0) GM10x class
+        { 0x52, 128}, // Maxwell Generation (SM 5.2) GM20x class
+        {   -1, -1 }
+    };
+
+    int index = 0;
+
+    while (nGpuArchCoresPerSM[index].SM != -1)
+    {
+        if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor))
+        {
+            return nGpuArchCoresPerSM[index].Cores;
+        }
+
+        index++;
+    }
+
+    // If we don't find the values, we default use the previous one to run properly
+    printf("MapSMtoCores for SM %d.%d is undefined.  Default to use %d Cores/SM\n", major, minor, nGpuArchCoresPerSM[index-1].Cores);
+    return nGpuArchCoresPerSM[index-1].Cores;
+}
+// end of GPU Architecture definitions
+
+#ifdef __cuda_cuda_h__
+// General GPU Device CUDA Initialization
+inline int gpuDeviceInitDRV(int ARGC, const char **ARGV)
+{
+    int cuDevice = 0;
+    int deviceCount = 0;
+    CUresult err = cuInit(0);
+
+    if (CUDA_SUCCESS == err)
+    {
+        checkCudaErrors(cuDeviceGetCount(&deviceCount));
+    }
+
+    if (deviceCount == 0)
+    {
+        fprintf(stderr, "cudaDeviceInit error: no devices supporting CUDA\n");
+        exit(EXIT_FAILURE);
+    }
+
+    int dev = 0;
+    dev = getCmdLineArgumentInt(ARGC, (const char **) ARGV, "device=");
+
+    if (dev < 0)
+    {
+        dev = 0;
+    }
+
+    if (dev > deviceCount-1)
+    {
+        fprintf(stderr, "\n");
+        fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n", deviceCount);
+        fprintf(stderr, ">> cudaDeviceInit (-device=%d) is not a valid GPU device. <<\n", dev);
+        fprintf(stderr, "\n");
+        return -dev;
+    }
+
+    checkCudaErrors(cuDeviceGet(&cuDevice, dev));
+    char name[100];
+    cuDeviceGetName(name, 100, cuDevice);
+
+    int computeMode;
+    getCudaAttribute<int>(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
+
+    if (computeMode == CU_COMPUTEMODE_PROHIBITED)
+    {
+        fprintf(stderr, "Error: device is running in <CU_COMPUTEMODE_PROHIBITED>, no threads can use this CUDA Device.\n");
+        return -1;
+    }
+
+    if (checkCmdLineFlag(ARGC, (const char **) ARGV, "quiet") == false)
+    {
+        printf("gpuDeviceInitDRV() Using CUDA Device [%d]: %s\n", dev, name);
+    }
+
+    return dev;
+}
+
+// This function returns the best GPU based on performance
+inline int gpuGetMaxGflopsDeviceIdDRV()
+{
+    CUdevice current_device  = 0;
+    CUdevice max_perf_device = 0;
+    int device_count     = 0;
+    int sm_per_multiproc = 0;
+    unsigned long long max_compute_perf = 0;
+    int best_SM_arch = 0;
+    int major = 0;
+    int minor = 0;
+    int multiProcessorCount;
+    int clockRate;
+    int devices_prohibited = 0;
+
+    cuInit(0);
+    checkCudaErrors(cuDeviceGetCount(&device_count));
+
+    if (device_count == 0)
+    {
+        fprintf(stderr, "gpuGetMaxGflopsDeviceIdDRV error: no devices supporting CUDA\n");
+        exit(EXIT_FAILURE);
+    }
+
+    // Find the best major SM Architecture GPU device
+    while (current_device < device_count)
+    {
+        checkCudaErrors(cuDeviceComputeCapability(&major, &minor, current_device));
+
+        if (major > 0 && major < 9999)
+        {
+            best_SM_arch = MAX(best_SM_arch, major);
+        }
+
+        current_device++;
+    }
+
+    // Find the best CUDA capable GPU device
+    current_device = 0;
+
+    while (current_device < device_count)
+    {
+        checkCudaErrors(cuDeviceGetAttribute(&multiProcessorCount,
+                                             CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
+                                             current_device));
+        checkCudaErrors(cuDeviceGetAttribute(&clockRate,
+                                             CU_DEVICE_ATTRIBUTE_CLOCK_RATE,
+                                             current_device));
+        checkCudaErrors(cuDeviceComputeCapability(&major, &minor, current_device));
+
+        int computeMode;
+        getCudaAttribute<int>(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, current_device);
+
+        if (computeMode != CU_COMPUTEMODE_PROHIBITED)
+        {
+            if (major == 9999 && minor == 9999)
+            {
+                sm_per_multiproc = 1;
+            }
+            else
+            {
+                sm_per_multiproc = _ConvertSMVer2CoresDRV(major, minor);
+            }
+
+            unsigned long long compute_perf = (unsigned long long) (multiProcessorCount * sm_per_multiproc * clockRate);
+
+            if (compute_perf  > max_compute_perf)
+            {
+                // If we find GPU with SM major > 2, search only these
+                if (best_SM_arch > 2)
+                {
+                    // If our device==dest_SM_arch, choose this, or else pass
+                    if (major == best_SM_arch)
+                    {
+                        max_compute_perf  = compute_perf;
+                        max_perf_device   = current_device;
+                    }
+                }
+                else
+                {
+                    max_compute_perf  = compute_perf;
+                    max_perf_device   = current_device;
+                }
+            }
+        }
+        else
+        {
+            devices_prohibited++;
+        }
+
+        ++current_device;
+    }
+
+    if (devices_prohibited == device_count)
+    {    
+        fprintf(stderr, "gpuGetMaxGflopsDeviceIdDRV error: all devices have compute mode prohibited.\n");
+        exit(EXIT_FAILURE);
+    }    
+
+    return max_perf_device;
+}
+
+// This function returns the best Graphics GPU based on performance
+inline int gpuGetMaxGflopsGLDeviceIdDRV()
+{
+    CUdevice current_device = 0, max_perf_device = 0;
+    int device_count     = 0, sm_per_multiproc = 0;
+    int max_compute_perf = 0, best_SM_arch     = 0;
+    int major = 0, minor = 0, multiProcessorCount, clockRate;
+    int bTCC = 0;
+    int devices_prohibited = 0;
+    char deviceName[256];
+
+    cuInit(0);
+    checkCudaErrors(cuDeviceGetCount(&device_count));
+
+    if (device_count == 0)
+    {
+        fprintf(stderr, "gpuGetMaxGflopsGLDeviceIdDRV error: no devices supporting CUDA\n");
+        exit(EXIT_FAILURE);
+    }
+
+    // Find the best major SM Architecture GPU device that are graphics devices
+    while (current_device < device_count)
+    {
+        checkCudaErrors(cuDeviceGetName(deviceName, 256, current_device));
+        checkCudaErrors(cuDeviceComputeCapability(&major, &minor, current_device));
+
+#if CUDA_VERSION >= 3020
+        checkCudaErrors(cuDeviceGetAttribute(&bTCC,  CU_DEVICE_ATTRIBUTE_TCC_DRIVER, current_device));
+#else
+
+        // Assume a Tesla GPU is running in TCC if we are running CUDA 3.1
+        if (deviceName[0] == 'T')
+        {
+            bTCC = 1;
+        }
+
+#endif
+
+        int computeMode;
+        getCudaAttribute<int>(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, current_device);
+
+        if (computeMode != CU_COMPUTEMODE_PROHIBITED)
+        {
+            if (!bTCC)
+            {
+                if (major > 0 && major < 9999)
+                {
+                    best_SM_arch = MAX(best_SM_arch, major);
+                }
+            }
+        }
+        else
+        {
+            devices_prohibited++;
+        }
+
+        current_device++;
+    }
+
+    if (devices_prohibited == device_count)
+    {
+        fprintf(stderr, "gpuGetMaxGflopsGLDeviceIdDRV error: all devices have compute mode prohibited.\n");
+        exit(EXIT_FAILURE);
+    }
+
+    // Find the best CUDA capable GPU device
+    current_device = 0;
+
+    while (current_device < device_count)
+    {
+        checkCudaErrors(cuDeviceGetAttribute(&multiProcessorCount,
+                                             CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
+                                             current_device));
+        checkCudaErrors(cuDeviceGetAttribute(&clockRate,
+                                             CU_DEVICE_ATTRIBUTE_CLOCK_RATE,
+                                             current_device));
+        checkCudaErrors(cuDeviceComputeCapability(&major, &minor, current_device));
+
+#if CUDA_VERSION >= 3020
+        checkCudaErrors(cuDeviceGetAttribute(&bTCC,  CU_DEVICE_ATTRIBUTE_TCC_DRIVER, current_device));
+#else
+
+        // Assume a Tesla GPU is running in TCC if we are running CUDA 3.1
+        if (deviceName[0] == 'T')
+        {
+            bTCC = 1;
+        }
+
+#endif
+
+        int computeMode;
+        getCudaAttribute<int>(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, current_device);
+
+        if (computeMode != CU_COMPUTEMODE_PROHIBITED)
+        {
+            if (major == 9999 && minor == 9999)
+            {
+                sm_per_multiproc = 1;
+            }
+            else
+            {
+                sm_per_multiproc = _ConvertSMVer2CoresDRV(major, minor);
+            }
+
+            // If this is a Tesla based GPU and SM 2.0, and TCC is disabled, this is a contendor
+            if (!bTCC)   // Is this GPU running the TCC driver?  If so we pass on this
+            {
+                int compute_perf  = multiProcessorCount * sm_per_multiproc * clockRate;
+
+                if (compute_perf  > max_compute_perf)
+                {
+                    // If we find GPU with SM major > 2, search only these
+                    if (best_SM_arch > 2)
+                    {
+                        // If our device = dest_SM_arch, then we pick this one
+                        if (major == best_SM_arch)
+                        {
+                            max_compute_perf  = compute_perf;
+                            max_perf_device   = current_device;
+                        }
+                    }
+                    else
+                    {
+                        max_compute_perf  = compute_perf;
+                        max_perf_device   = current_device;
+                    }
+                }
+            }
+        }
+
+        ++current_device;
+    }
+
+    return max_perf_device;
+}
+
+// General initialization call to pick the best CUDA Device
+inline CUdevice findCudaDeviceDRV(int argc, const char **argv)
+{
+    CUdevice cuDevice;
+    int devID = 0;
+
+    // If the command-line has a device number specified, use it
+    if (checkCmdLineFlag(argc, (const char **)argv, "device"))
+    {
+        devID = gpuDeviceInitDRV(argc, argv);
+
+        if (devID < 0)
+        {
+            printf("exiting...\n");
+            exit(EXIT_SUCCESS);
+        }
+    }
+    else
+    {
+        // Otherwise pick the device with highest Gflops/s
+        char name[100];
+        devID = gpuGetMaxGflopsDeviceIdDRV();
+        checkCudaErrors(cuDeviceGet(&cuDevice, devID));
+        cuDeviceGetName(name, 100, cuDevice);
+        printf("> Using CUDA Device [%d]: %s\n", devID, name);
+    }
+
+    cuDeviceGet(&cuDevice, devID);
+
+    return cuDevice;
+}
+
+// This function will pick the best CUDA device available with OpenGL interop
+inline CUdevice findCudaGLDeviceDRV(int argc, const char **argv)
+{
+    CUdevice cuDevice;
+    int devID = 0;
+
+    // If the command-line has a device number specified, use it
+    if (checkCmdLineFlag(argc, (const char **)argv, "device"))
+    {
+        devID = gpuDeviceInitDRV(argc, (const char **)argv);
+
+        if (devID < 0)
+        {
+            printf("no CUDA capable devices found, exiting...\n");
+            exit(EXIT_SUCCESS);
+        }
+    }
+    else
+    {
+        char name[100];
+        // Otherwise pick the device with highest Gflops/s
+        devID = gpuGetMaxGflopsGLDeviceIdDRV();
+        checkCudaErrors(cuDeviceGet(&cuDevice, devID));
+        cuDeviceGetName(name, 100, cuDevice);
+        printf("> Using CUDA/GL Device [%d]: %s\n", devID, name);
+    }
+
+    return devID;
+}
+
+// General check for CUDA GPU SM Capabilities
+inline bool checkCudaCapabilitiesDRV(int major_version, int minor_version, int devID)
+{
+    CUdevice cuDevice;
+    char name[256];
+    int major = 0, minor = 0;
+
+    checkCudaErrors(cuDeviceGet(&cuDevice, devID));
+    checkCudaErrors(cuDeviceGetName(name, 100, cuDevice));
+    checkCudaErrors(cuDeviceComputeCapability(&major, &minor, devID));
+
+    if ((major > major_version) ||
+        (major == major_version && minor >= minor_version))
+    {
+        printf("> Device %d: <%16s >, Compute SM %d.%d detected\n", devID, name, major, minor);
+        return true;
+    }
+    else
+    {
+        printf("No GPU device was found that can support CUDA compute capability %d.%d.\n", major_version, minor_version);
+        return false;
+    }
+}
+#endif
+
+// end of CUDA Helper Functions
+
+#endif
--- a/src/algorithms/tracking/libs/cudahelpers/helper_cuda_gl.h
+++ b/src/algorithms/tracking/libs/cudahelpers/helper_cuda_gl.h
@@ -0,0 +1,165 @@
+/**
+ * Copyright 1993-2013 NVIDIA Corporation.  All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+
+#ifndef HELPER_CUDA_GL_H
+#define HELPER_CUDA_GL_H
+
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+
+// includes, graphics
+#if defined (__APPLE__) || defined(MACOSX)
+#include <OpenGL/gl.h>
+#include <OpenGL/glu.h>
+#else
+#include <GL/gl.h>
+#include <GL/glu.h>
+#endif
+
+#ifndef EXIT_WAIVED
+#define EXIT_WAIVED 2
+#endif
+
+#ifdef __DRIVER_TYPES_H__
+#ifndef DEVICE_RESET
+#define DEVICE_RESET cudaDeviceReset()
+#endif
+#else
+#ifndef DEVICE_RESET
+#define DEVICE_RESET
+#endif
+#endif
+
+#ifdef __CUDA_GL_INTEROP_H__
+////////////////////////////////////////////////////////////////////////////////
+// These are CUDA OpenGL Helper functions
+
+inline int gpuGLDeviceInit(int ARGC, const char **ARGV)
+{
+    int deviceCount;
+    checkCudaErrors(cudaGetDeviceCount(&deviceCount));
+
+    if (deviceCount == 0)
+    {
+        fprintf(stderr, "CUDA error: no devices supporting CUDA.\n");
+        exit(EXIT_FAILURE);
+    }
+
+    int dev = 0;
+    dev = getCmdLineArgumentInt(ARGC, ARGV, "device=");
+
+    if (dev < 0)
+    {
+        dev = 0;
+    }
+
+    if (dev > deviceCount-1)
+    {
+        fprintf(stderr, "\n");
+        fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n", deviceCount);
+        fprintf(stderr, ">> gpuGLDeviceInit (-device=%d) is not a valid GPU device. <<\n", dev);
+        fprintf(stderr, "\n");
+        return -dev;
+    }
+
+    cudaDeviceProp deviceProp;
+    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, dev));
+
+    if (deviceProp.computeMode == cudaComputeModeProhibited)
+    {
+        fprintf(stderr, "Error: device is running in <Compute Mode Prohibited>, no threads can use ::cudaSetDevice().\n");
+        return -1;
+    }
+
+    if (deviceProp.major < 1)
+    {
+        fprintf(stderr, "Error: device does not support CUDA.\n");
+        exit(EXIT_FAILURE);
+    }
+
+    if (checkCmdLineFlag(ARGC, ARGV, "quiet") == false)
+    {
+        fprintf(stderr, "Using device %d: %s\n", dev, deviceProp.name);
+    }
+
+    checkCudaErrors(cudaGLSetGLDevice(dev));
+    return dev;
+}
+
+// This function will pick the best CUDA device available with OpenGL interop
+inline int findCudaGLDevice(int argc, const char **argv)
+{
+    int devID = 0;
+
+    // If the command-line has a device number specified, use it
+    if (checkCmdLineFlag(argc, (const char **)argv, "device"))
+    {
+        devID = gpuGLDeviceInit(argc, (const char **)argv);
+
+        if (devID < 0)
+        {
+            printf("no CUDA capable devices found, exiting...\n");
+            DEVICE_RESET
+            exit(EXIT_SUCCESS);
+        }
+    }
+    else
+    {
+        // Otherwise pick the device with highest Gflops/s
+        devID = gpuGetMaxGflopsDeviceId();
+        cudaGLSetGLDevice(devID);
+    }
+
+    return devID;
+}
+
+////////////////////////////////////////////////////////////////////////////
+//! Check for OpenGL error
+//! @return bool if no GL error has been encountered, otherwise 0
+//! @param file  __FILE__ macro
+//! @param line  __LINE__ macro
+//! @note The GL error is listed on stderr
+//! @note This function should be used via the CHECK_ERROR_GL() macro
+////////////////////////////////////////////////////////////////////////////
+inline bool
+sdkCheckErrorGL(const char *file, const int line)
+{
+    bool ret_val = true;
+
+    // check for error
+    GLenum gl_error = glGetError();
+
+    if (gl_error != GL_NO_ERROR)
+    {
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+        char tmpStr[512];
+        // NOTE: "%s(%i) : " allows Visual Studio to directly jump to the file at the right line
+        // when the user double clicks on the error line in the Output pane. Like any compile error.
+        sprintf_s(tmpStr, 255, "\n%s(%i) : GL Error : %s\n\n", file, line, gluErrorString(gl_error));
+        fprintf(stderr, "%s", tmpStr);
+#endif
+        fprintf(stderr, "GL Error in file '%s' in line %d :\n", file, line);
+        fprintf(stderr, "%s\n", gluErrorString(gl_error));
+        ret_val = false;
+    }
+
+    return ret_val;
+}
+
+#define SDK_CHECK_ERROR_GL()                                              \
+    if( false == sdkCheckErrorGL( __FILE__, __LINE__)) {                  \
+        DEVICE_RESET                                                      \
+        exit(EXIT_FAILURE);                                               \
+    }
+#endif
+
+#endif
--- a/src/algorithms/tracking/libs/cudahelpers/helper_functions.h
+++ b/src/algorithms/tracking/libs/cudahelpers/helper_functions.h
@@ -0,0 +1,42 @@
+/**
+ * Copyright 1993-2013 NVIDIA Corporation.  All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+
+// These are helper functions for the SDK samples (string parsing, timers, image helpers, etc)
+#ifndef HELPER_FUNCTIONS_H
+#define HELPER_FUNCTIONS_H
+
+#ifdef WIN32
+#pragma warning(disable:4996)
+#endif
+
+// includes, project
+#include <stdio.h>
+#include <stdlib.h>
+#include <string>
+#include <assert.h>
+#include <exception.h>
+#include <math.h>
+
+#include <fstream>
+#include <vector>
+#include <iostream>
+#include <algorithm>
+
+// includes, timer, string parsing, image helpers
+#include <helper_timer.h>   // helper functions for timers
+#include <helper_string.h>  // helper functions for string parsing
+#include <helper_image.h>   // helper functions for image compare, dump, data comparisons
+
+#ifndef EXIT_WAIVED
+#define EXIT_WAIVED 2
+#endif
+
+#endif //  HELPER_FUNCTIONS_H
--- a/src/algorithms/tracking/libs/cudahelpers/helper_image.h
+++ b/src/algorithms/tracking/libs/cudahelpers/helper_image.h
--- a/src/algorithms/tracking/libs/cudahelpers/helper_math.h
+++ b/src/algorithms/tracking/libs/cudahelpers/helper_math.h
--- a/src/algorithms/tracking/libs/cudahelpers/helper_string.h
+++ b/src/algorithms/tracking/libs/cudahelpers/helper_string.h
@@ -0,0 +1,516 @@
+/**
+ * Copyright 1993-2013 NVIDIA Corporation.  All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+
+// These are helper functions for the SDK samples (string parsing, timers, etc)
+#ifndef STRING_HELPER_H
+#define STRING_HELPER_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <fstream>
+#include <string>
+
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+#ifndef _CRT_SECURE_NO_DEPRECATE
+#define _CRT_SECURE_NO_DEPRECATE
+#endif
+#ifndef STRCASECMP
+#define STRCASECMP  _stricmp
+#endif
+#ifndef STRNCASECMP
+#define STRNCASECMP _strnicmp
+#endif
+#ifndef STRCPY
+#define STRCPY(sFilePath, nLength, sPath) strcpy_s(sFilePath, nLength, sPath)
+#endif
+
+#ifndef FOPEN
+#define FOPEN(fHandle,filename,mode) fopen_s(&fHandle, filename, mode)
+#endif
+#ifndef FOPEN_FAIL
+#define FOPEN_FAIL(result) (result != 0)
+#endif
+#ifndef SSCANF
+#define SSCANF sscanf_s
+#endif
+#ifndef SPRINTF
+#define SPRINTF sprintf_s
+#endif
+#else // Linux Includes
+#include <string.h>
+#include <strings.h>
+
+#ifndef STRCASECMP
+#define STRCASECMP  strcasecmp
+#endif
+#ifndef STRNCASECMP
+#define STRNCASECMP strncasecmp
+#endif
+#ifndef STRCPY
+#define STRCPY(sFilePath, nLength, sPath) strcpy(sFilePath, sPath)
+#endif
+
+#ifndef FOPEN
+#define FOPEN(fHandle,filename,mode) (fHandle = fopen(filename, mode))
+#endif
+#ifndef FOPEN_FAIL
+#define FOPEN_FAIL(result) (result == NULL)
+#endif
+#ifndef SSCANF
+#define SSCANF sscanf
+#endif
+#ifndef SPRINTF
+#define SPRINTF sprintf
+#endif
+#endif
+
+#ifndef EXIT_WAIVED
+#define EXIT_WAIVED 2
+#endif
+
+// CUDA Utility Helper Functions
+inline int stringRemoveDelimiter(char delimiter, const char *string)
+{
+    int string_start = 0;
+
+    while (string[string_start] == delimiter)
+    {
+        string_start++;
+    }
+
+    if (string_start >= (int)strlen(string)-1)
+    {
+        return 0;
+    }
+
+    return string_start;
+}
+
+inline int getFileExtension(char *filename, char **extension)
+{
+    int string_length = (int)strlen(filename);
+
+    while (filename[string_length--] != '.')
+    {
+        if (string_length == 0)
+            break;
+    }
+
+    if (string_length > 0) string_length += 2;
+
+    if (string_length == 0)
+        *extension = NULL;
+    else
+        *extension = &filename[string_length];
+
+    return string_length;
+}
+
+
+inline bool checkCmdLineFlag(const int argc, const char **argv, const char *string_ref)
+{
+    bool bFound = false;
+
+    if (argc >= 1)
+    {
+        for (int i=1; i < argc; i++)
+        {
+            int string_start = stringRemoveDelimiter('-', argv[i]);
+            const char *string_argv = &argv[i][string_start];
+
+            const char *equal_pos = strchr(string_argv, '=');
+            int argv_length = (int)(equal_pos == 0 ? strlen(string_argv) : equal_pos - string_argv);
+
+            int length = (int)strlen(string_ref);
+
+            if (length == argv_length && !STRNCASECMP(string_argv, string_ref, length))
+            {
+                bFound = true;
+                continue;
+            }
+        }
+    }
+
+    return bFound;
+}
+
+// This function wraps the CUDA Driver API into a template function
+template <class T>
+inline bool getCmdLineArgumentValue(const int argc, const char **argv, const char *string_ref, T *value)
+{
+    bool bFound = false;
+
+    if (argc >= 1)
+    {
+        for (int i=1; i < argc; i++)
+        {
+            int string_start = stringRemoveDelimiter('-', argv[i]);
+            const char *string_argv = &argv[i][string_start];
+            int length = (int)strlen(string_ref);
+
+            if (!STRNCASECMP(string_argv, string_ref, length))
+            {
+                if (length+1 <= (int)strlen(string_argv))
+                {
+                    int auto_inc = (string_argv[length] == '=') ? 1 : 0;
+                    *value = (T)atoi(&string_argv[length + auto_inc]);
+                }
+
+                bFound = true;
+                i=argc;
+            }
+        }
+    }
+
+    return bFound;
+}
+
+inline int getCmdLineArgumentInt(const int argc, const char **argv, const char *string_ref)
+{
+    bool bFound = false;
+    int value = -1;
+
+    if (argc >= 1)
+    {
+        for (int i=1; i < argc; i++)
+        {
+            int string_start = stringRemoveDelimiter('-', argv[i]);
+            const char *string_argv = &argv[i][string_start];
+            int length = (int)strlen(string_ref);
+
+            if (!STRNCASECMP(string_argv, string_ref, length))
+            {
+                if (length+1 <= (int)strlen(string_argv))
+                {
+                    int auto_inc = (string_argv[length] == '=') ? 1 : 0;
+                    value = atoi(&string_argv[length + auto_inc]);
+                }
+                else
+                {
+                    value = 0;
+                }
+
+                bFound = true;
+                continue;
+            }
+        }
+    }
+
+    if (bFound)
+    {
+        return value;
+    }
+    else
+    {
+        return 0;
+    }
+}
+
+inline float getCmdLineArgumentFloat(const int argc, const char **argv, const char *string_ref)
+{
+    bool bFound = false;
+    float value = -1;
+
+    if (argc >= 1)
+    {
+        for (int i=1; i < argc; i++)
+        {
+            int string_start = stringRemoveDelimiter('-', argv[i]);
+            const char *string_argv = &argv[i][string_start];
+            int length = (int)strlen(string_ref);
+
+            if (!STRNCASECMP(string_argv, string_ref, length))
+            {
+                if (length+1 <= (int)strlen(string_argv))
+                {
+                    int auto_inc = (string_argv[length] == '=') ? 1 : 0;
+                    value = (float)atof(&string_argv[length + auto_inc]);
+                }
+                else
+                {
+                    value = 0.f;
+                }
+
+                bFound = true;
+                continue;
+            }
+        }
+    }
+
+    if (bFound)
+    {
+        return value;
+    }
+    else
+    {
+        return 0;
+    }
+}
+
+inline bool getCmdLineArgumentString(const int argc, const char **argv,
+                                     const char *string_ref, char **string_retval)
+{
+    bool bFound = false;
+
+    if (argc >= 1)
+    {
+        for (int i=1; i < argc; i++)
+        {
+            int string_start = stringRemoveDelimiter('-', argv[i]);
+            char *string_argv = (char *)&argv[i][string_start];
+            int length = (int)strlen(string_ref);
+
+            if (!STRNCASECMP(string_argv, string_ref, length))
+            {
+                *string_retval = &string_argv[length+1];
+                bFound = true;
+                continue;
+            }
+        }
+    }
+
+    if (!bFound)
+    {
+        *string_retval = NULL;
+    }
+
+    return bFound;
+}
+
+//////////////////////////////////////////////////////////////////////////////
+//! Find the path for a file assuming that
+//! files are found in the searchPath.
+//!
+//! @return the path if succeeded, otherwise 0
+//! @param filename         name of the file
+//! @param executable_path  optional absolute path of the executable
+//////////////////////////////////////////////////////////////////////////////
+inline char *sdkFindFilePath(const char *filename, const char *executable_path)
+{
+    // <executable_name> defines a variable that is replaced with the name of the executable
+
+    // Typical relative search paths to locate needed companion files (e.g. sample input data, or JIT source files)
+    // The origin for the relative search may be the .exe file, a .bat file launching an .exe, a browser .exe launching the .exe or .bat, etc
+    const char *searchPath[] =
+    {
+        "./",                                       // same dir
+        "./common/",                                // "/common/" subdir
+        "./common/data/",                           // "/common/data/" subdir
+        "./data/",                                  // "/data/" subdir
+        "./src/",                                   // "/src/" subdir
+        "./src/<executable_name>/data/",            // "/src/<executable_name>/data/" subdir
+        "./inc/",                                   // "/inc/" subdir
+        "./0_Simple/",                              // "/0_Simple/" subdir
+        "./1_Utilities/",                           // "/1_Utilities/" subdir
+        "./2_Graphics/",                            // "/2_Graphics/" subdir
+        "./3_Imaging/",                             // "/3_Imaging/" subdir
+        "./4_Finance/",                             // "/4_Finance/" subdir
+        "./5_Simulations/",                         // "/5_Simulations/" subdir
+        "./6_Advanced/",                            // "/6_Advanced/" subdir
+        "./7_CUDALibraries/",                       // "/7_CUDALibraries/" subdir
+        "./8_Android/",                             // "/8_Android/" subdir
+        "./samples/",                               // "/samples/" subdir
+
+        "../",                                      // up 1 in tree
+        "../common/",                               // up 1 in tree, "/common/" subdir
+        "../common/data/",                          // up 1 in tree, "/common/data/" subdir
+        "../data/",                                 // up 1 in tree, "/data/" subdir
+        "../src/",                                  // up 1 in tree, "/src/" subdir
+        "../inc/",                                  // up 1 in tree, "/inc/" subdir
+
+        "../0_Simple/<executable_name>/data/",       // up 1 in tree, "/0_Simple/<executable_name>/" subdir
+        "../1_Utilities/<executable_name>/data/",    // up 1 in tree, "/1_Utilities/<executable_name>/" subdir
+        "../2_Graphics/<executable_name>/data/",     // up 1 in tree, "/2_Graphics/<executable_name>/" subdir
+        "../3_Imaging/<executable_name>/data/",      // up 1 in tree, "/3_Imaging/<executable_name>/" subdir
+        "../4_Finance/<executable_name>/data/",      // up 1 in tree, "/4_Finance/<executable_name>/" subdir
+        "../5_Simulations/<executable_name>/data/",  // up 1 in tree, "/5_Simulations/<executable_name>/" subdir
+        "../6_Advanced/<executable_name>/data/",     // up 1 in tree, "/6_Advanced/<executable_name>/" subdir
+        "../7_CUDALibraries/<executable_name>/data/",// up 1 in tree, "/7_CUDALibraries/<executable_name>/" subdir
+        "../8_Android/<executable_name>/data/",      // up 1 in tree, "/8_Android/<executable_name>/" subdir
+        "../samples/<executable_name>/data/",        // up 1 in tree, "/samples/<executable_name>/" subdir
+        "../../",                                        // up 2 in tree
+        "../../common/",                                 // up 2 in tree, "/common/" subdir
+        "../../common/data/",                            // up 2 in tree, "/common/data/" subdir
+        "../../data/",                                   // up 2 in tree, "/data/" subdir
+        "../../src/",                                    // up 2 in tree, "/src/" subdir
+        "../../inc/",                                    // up 2 in tree, "/inc/" subdir
+        "../../sandbox/<executable_name>/data/",         // up 2 in tree, "/sandbox/<executable_name>/" subdir
+        "../../0_Simple/<executable_name>/data/",        // up 2 in tree, "/0_Simple/<executable_name>/" subdir
+        "../../1_Utilities/<executable_name>/data/",     // up 2 in tree, "/1_Utilities/<executable_name>/" subdir
+        "../../2_Graphics/<executable_name>/data/",      // up 2 in tree, "/2_Graphics/<executable_name>/" subdir
+        "../../3_Imaging/<executable_name>/data/",       // up 2 in tree, "/3_Imaging/<executable_name>/" subdir
+        "../../4_Finance/<executable_name>/data/",       // up 2 in tree, "/4_Finance/<executable_name>/" subdir
+        "../../5_Simulations/<executable_name>/data/",   // up 2 in tree, "/5_Simulations/<executable_name>/" subdir
+        "../../6_Advanced/<executable_name>/data/",      // up 2 in tree, "/6_Advanced/<executable_name>/" subdir
+        "../../7_CUDALibraries/<executable_name>/data/", // up 2 in tree, "/7_CUDALibraries/<executable_name>/" subdir
+        "../../8_Android/<executable_name>/data/",       // up 2 in tree, "/8_Android/<executable_name>/" subdir
+        "../../samples/<executable_name>/data/",         // up 2 in tree, "/samples/<executable_name>/" subdir
+        "../../../",                                        // up 3 in tree
+        "../../../src/<executable_name>/",                  // up 3 in tree, "/src/<executable_name>/" subdir
+        "../../../src/<executable_name>/data/",             // up 3 in tree, "/src/<executable_name>/data/" subdir
+        "../../../src/<executable_name>/src/",              // up 3 in tree, "/src/<executable_name>/src/" subdir
+        "../../../src/<executable_name>/inc/",              // up 3 in tree, "/src/<executable_name>/inc/" subdir
+        "../../../sandbox/<executable_name>/",              // up 3 in tree, "/sandbox/<executable_name>/" subdir
+        "../../../sandbox/<executable_name>/data/",         // up 3 in tree, "/sandbox/<executable_name>/data/" subdir
+        "../../../sandbox/<executable_name>/src/",          // up 3 in tree, "/sandbox/<executable_name>/src/" subdir
+        "../../../sandbox/<executable_name>/inc/",          // up 3 in tree, "/sandbox/<executable_name>/inc/" subdir
+        "../../../0_Simple/<executable_name>/data/",        // up 3 in tree, "/0_Simple/<executable_name>/" subdir
+        "../../../1_Utilities/<executable_name>/data/",     // up 3 in tree, "/1_Utilities/<executable_name>/" subdir
+        "../../../2_Graphics/<executable_name>/data/",      // up 3 in tree, "/2_Graphics/<executable_name>/" subdir
+        "../../../3_Imaging/<executable_name>/data/",       // up 3 in tree, "/3_Imaging/<executable_name>/" subdir
+        "../../../4_Finance/<executable_name>/data/",       // up 3 in tree, "/4_Finance/<executable_name>/" subdir
+        "../../../5_Simulations/<executable_name>/data/",   // up 3 in tree, "/5_Simulations/<executable_name>/" subdir
+        "../../../6_Advanced/<executable_name>/data/",      // up 3 in tree, "/6_Advanced/<executable_name>/" subdir
+        "../../../7_CUDALibraries/<executable_name>/data/", // up 3 in tree, "/7_CUDALibraries/<executable_name>/" subdir
+        "../../../8_Android/<executable_name>/data/",       // up 3 in tree, "/8_Android/<executable_name>/" subdir
+        "../../../0_Simple/<executable_name>/",        // up 3 in tree, "/0_Simple/<executable_name>/" subdir
+        "../../../1_Utilities/<executable_name>/",     // up 3 in tree, "/1_Utilities/<executable_name>/" subdir
+        "../../../2_Graphics/<executable_name>/",      // up 3 in tree, "/2_Graphics/<executable_name>/" subdir
+        "../../../3_Imaging/<executable_name>/",       // up 3 in tree, "/3_Imaging/<executable_name>/" subdir
+        "../../../4_Finance/<executable_name>/",       // up 3 in tree, "/4_Finance/<executable_name>/" subdir
+        "../../../5_Simulations/<executable_name>/",   // up 3 in tree, "/5_Simulations/<executable_name>/" subdir
+        "../../../6_Advanced/<executable_name>/",      // up 3 in tree, "/6_Advanced/<executable_name>/" subdir
+        "../../../7_CUDALibraries/<executable_name>/", // up 3 in tree, "/7_CUDALibraries/<executable_name>/" subdir
+        "../../../8_Android/<executable_name>/",       // up 3 in tree, "/8_Android/<executable_name>/" subdir
+        "../../../samples/<executable_name>/data/",         // up 3 in tree, "/samples/<executable_name>/" subdir
+        "../../../common/",                                 // up 3 in tree, "../../../common/" subdir
+        "../../../common/data/",                            // up 3 in tree, "../../../common/data/" subdir
+        "../../../data/",                                   // up 3 in tree, "../../../data/" subdir
+        "../../../../",                                // up 4 in tree
+        "../../../../src/<executable_name>/",          // up 4 in tree, "/src/<executable_name>/" subdir
+        "../../../../src/<executable_name>/data/",     // up 4 in tree, "/src/<executable_name>/data/" subdir
+        "../../../../src/<executable_name>/src/",      // up 4 in tree, "/src/<executable_name>/src/" subdir
+        "../../../../src/<executable_name>/inc/",      // up 4 in tree, "/src/<executable_name>/inc/" subdir
+        "../../../../sandbox/<executable_name>/",      // up 4 in tree, "/sandbox/<executable_name>/" subdir
+        "../../../../sandbox/<executable_name>/data/", // up 4 in tree, "/sandbox/<executable_name>/data/" subdir
+        "../../../../sandbox/<executable_name>/src/",  // up 4 in tree, "/sandbox/<executable_name>/src/" subdir
+        "../../../../sandbox/<executable_name>/inc/",   // up 4 in tree, "/sandbox/<executable_name>/inc/" subdir
+        "../../../../0_Simple/<executable_name>/data/",     // up 4 in tree, "/0_Simple/<executable_name>/" subdir
+        "../../../../1_Utilities/<executable_name>/data/",  // up 4 in tree, "/1_Utilities/<executable_name>/" subdir
+        "../../../../2_Graphics/<executable_name>/data/",   // up 4 in tree, "/2_Graphics/<executable_name>/" subdir
+        "../../../../3_Imaging/<executable_name>/data/",    // up 4 in tree, "/3_Imaging/<executable_name>/" subdir
+        "../../../../4_Finance/<executable_name>/data/",    // up 4 in tree, "/4_Finance/<executable_name>/" subdir
+        "../../../../5_Simulations/<executable_name>/data/",// up 4 in tree, "/5_Simulations/<executable_name>/" subdir
+        "../../../../6_Advanced/<executable_name>/data/",   // up 4 in tree, "/6_Advanced/<executable_name>/" subdir
+        "../../../../7_CUDALibraries/<executable_name>/data/", // up 4 in tree, "/7_CUDALibraries/<executable_name>/" subdir
+        "../../../../8_Android/<executable_name>/data/",    // up 4 in tree, "/8_Android/<executable_name>/" subdir
+        "../../../../0_Simple/<executable_name>/",     // up 4 in tree, "/0_Simple/<executable_name>/" subdir
+        "../../../../1_Utilities/<executable_name>/",  // up 4 in tree, "/1_Utilities/<executable_name>/" subdir
+        "../../../../2_Graphics/<executable_name>/",   // up 4 in tree, "/2_Graphics/<executable_name>/" subdir
+        "../../../../3_Imaging/<executable_name>/",    // up 4 in tree, "/3_Imaging/<executable_name>/" subdir
+        "../../../../4_Finance/<executable_name>/",    // up 4 in tree, "/4_Finance/<executable_name>/" subdir
+        "../../../../5_Simulations/<executable_name>/",// up 4 in tree, "/5_Simulations/<executable_name>/" subdir
+        "../../../../6_Advanced/<executable_name>/",   // up 4 in tree, "/6_Advanced/<executable_name>/" subdir
+        "../../../../7_CUDALibraries/<executable_name>/", // up 4 in tree, "/7_CUDALibraries/<executable_name>/" subdir
+        "../../../../8_Android/<executable_name>/",    // up 4 in tree, "/8_Android/<executable_name>/" subdir
+        "../../../../samples/<executable_name>/data/",      // up 4 in tree, "/samples/<executable_name>/" subdir
+        "../../../../common/",                              // up 4 in tree, "../../../common/" subdir
+        "../../../../common/data/",                         // up 4 in tree, "../../../common/data/" subdir
+        "../../../../data/",                                // up 4 in tree, "../../../data/" subdir
+        "../../../../../",                                // up 5 in tree
+        "../../../../../src/<executable_name>/",          // up 5 in tree, "/src/<executable_name>/" subdir
+        "../../../../../src/<executable_name>/data/",     // up 5 in tree, "/src/<executable_name>/data/" subdir
+        "../../../../../src/<executable_name>/src/",      // up 5 in tree, "/src/<executable_name>/src/" subdir
+        "../../../../../src/<executable_name>/inc/",      // up 5 in tree, "/src/<executable_name>/inc/" subdir
+        "../../../../../sandbox/<executable_name>/",      // up 5 in tree, "/sandbox/<executable_name>/" subdir
+        "../../../../../sandbox/<executable_name>/data/", // up 5 in tree, "/sandbox/<executable_name>/data/" subdir
+        "../../../../../sandbox/<executable_name>/src/",  // up 5 in tree, "/sandbox/<executable_name>/src/" subdir
+        "../../../../../sandbox/<executable_name>/inc/",   // up 5 in tree, "/sandbox/<executable_name>/inc/" subdir
+        "../../../../../0_Simple/<executable_name>/data/",     // up 5 in tree, "/0_Simple/<executable_name>/" subdir
+        "../../../../../1_Utilities/<executable_name>/data/",  // up 5 in tree, "/1_Utilities/<executable_name>/" subdir
+        "../../../../../2_Graphics/<executable_name>/data/",   // up 5 in tree, "/2_Graphics/<executable_name>/" subdir
+        "../../../../../3_Imaging/<executable_name>/data/",    // up 5 in tree, "/3_Imaging/<executable_name>/" subdir
+        "../../../../../4_Finance/<executable_name>/data/",    // up 5 in tree, "/4_Finance/<executable_name>/" subdir
+        "../../../../../5_Simulations/<executable_name>/data/",// up 5 in tree, "/5_Simulations/<executable_name>/" subdir
+        "../../../../../6_Advanced/<executable_name>/data/",   // up 5 in tree, "/6_Advanced/<executable_name>/" subdir
+        "../../../../../7_CUDALibraries/<executable_name>/data/", // up 5 in tree, "/7_CUDALibraries/<executable_name>/" subdir
+        "../../../../../8_Android/<executable_name>/data/",    // up 5 in tree, "/8_Android/<executable_name>/" subdir
+        "../../../../../samples/<executable_name>/data/",      // up 5 in tree, "/samples/<executable_name>/" subdir
+        "../../../../../common/",                         // up 5 in tree, "../../../common/" subdir
+        "../../../../../common/data/",                    // up 5 in tree, "../../../common/data/" subdir
+    };
+
+    // Extract the executable name
+    std::string executable_name;
+
+    if (executable_path != 0)
+    {
+        executable_name = std::string(executable_path);
+
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+        // Windows path delimiter
+        size_t delimiter_pos = executable_name.find_last_of('\\');
+        executable_name.erase(0, delimiter_pos + 1);
+
+        if (executable_name.rfind(".exe") != std::string::npos)
+        {
+            // we strip .exe, only if the .exe is found
+            executable_name.resize(executable_name.size() - 4);
+        }
+
+#else
+        // Linux & OSX path delimiter
+        size_t delimiter_pos = executable_name.find_last_of('/');
+        executable_name.erase(0,delimiter_pos+1);
+#endif
+    }
+
+    // Loop over all search paths and return the first hit
+    for (unsigned int i = 0; i < sizeof(searchPath)/sizeof(char *); ++i)
+    {
+        std::string path(searchPath[i]);
+        size_t executable_name_pos = path.find("<executable_name>");
+
+        // If there is executable_name variable in the searchPath
+        // replace it with the value
+        if (executable_name_pos != std::string::npos)
+        {
+            if (executable_path != 0)
+            {
+                path.replace(executable_name_pos, strlen("<executable_name>"), executable_name);
+            }
+            else
+            {
+                // Skip this path entry if no executable argument is given
+                continue;
+            }
+        }
+
+#ifdef _DEBUG
+        printf("sdkFindFilePath <%s> in %s\n", filename, path.c_str());
+#endif
+
+        // Test if the file exists
+        path.append(filename);
+        FILE *fp;
+        FOPEN(fp, path.c_str(), "rb");
+
+        if (fp != NULL)
+        {
+            fclose(fp);
+            // File found
+            // returning an allocated array here for backwards compatibility reasons
+            char *file_path = (char *) malloc(path.length() + 1);
+            STRCPY(file_path, path.length() + 1, path.c_str());
+            return file_path;
+        }
+
+        if (fp)
+        {
+            fclose(fp);
+        }
+    }
+
+    // File not found
+    return 0;
+}
+
+#endif
--- a/src/algorithms/tracking/libs/cudahelpers/helper_timer.h
+++ b/src/algorithms/tracking/libs/cudahelpers/helper_timer.h
@@ -0,0 +1,499 @@
+/**
+ * Copyright 1993-2013 NVIDIA Corporation.  All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+
+// Helper Timing Functions
+#ifndef HELPER_TIMER_H
+#define HELPER_TIMER_H
+
+#ifndef EXIT_WAIVED
+#define EXIT_WAIVED 2
+#endif
+
+// includes, system
+#include <vector>
+
+// includes, project
+#include <exception.h>
+
+// Definition of the StopWatch Interface, this is used if we don't want to use the CUT functions
+// But rather in a self contained class interface
+class StopWatchInterface
+{
+    public:
+        StopWatchInterface() {};
+        virtual ~StopWatchInterface() {};
+
+    public:
+        //! Start time measurement
+        virtual void start() = 0;
+
+        //! Stop time measurement
+        virtual void stop() = 0;
+
+        //! Reset time counters to zero
+        virtual void reset() = 0;
+
+        //! Time in msec. after start. If the stop watch is still running (i.e. there
+        //! was no call to stop()) then the elapsed time is returned, otherwise the
+        //! time between the last start() and stop call is returned
+        virtual float getTime() = 0;
+
+        //! Mean time to date based on the number of times the stopwatch has been
+        //! _stopped_ (ie finished sessions) and the current total time
+        virtual float getAverageTime() = 0;
+};
+
+
+//////////////////////////////////////////////////////////////////
+// Begin Stopwatch timer class definitions for all OS platforms //
+//////////////////////////////////////////////////////////////////
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+// includes, system
+#define WINDOWS_LEAN_AND_MEAN
+#include <windows.h>
+#undef min
+#undef max
+
+//! Windows specific implementation of StopWatch
+class StopWatchWin : public StopWatchInterface
+{
+    public:
+        //! Constructor, default
+        StopWatchWin() :
+            start_time(),     end_time(),
+            diff_time(0.0f),  total_time(0.0f),
+            running(false), clock_sessions(0), freq(0), freq_set(false)
+        {
+            if (! freq_set)
+            {
+                // helper variable
+                LARGE_INTEGER temp;
+
+                // get the tick frequency from the OS
+                QueryPerformanceFrequency((LARGE_INTEGER *) &temp);
+
+                // convert to type in which it is needed
+                freq = ((double) temp.QuadPart) / 1000.0;
+
+                // rememeber query
+                freq_set = true;
+            }
+        };
+
+        // Destructor
+        ~StopWatchWin() { };
+
+    public:
+        //! Start time measurement
+        inline void start();
+
+        //! Stop time measurement
+        inline void stop();
+
+        //! Reset time counters to zero
+        inline void reset();
+
+        //! Time in msec. after start. If the stop watch is still running (i.e. there
+        //! was no call to stop()) then the elapsed time is returned, otherwise the
+        //! time between the last start() and stop call is returned
+        inline float getTime();
+
+        //! Mean time to date based on the number of times the stopwatch has been
+        //! _stopped_ (ie finished sessions) and the current total time
+        inline float getAverageTime();
+
+    private:
+        // member variables
+
+        //! Start of measurement
+        LARGE_INTEGER  start_time;
+        //! End of measurement
+        LARGE_INTEGER  end_time;
+
+        //! Time difference between the last start and stop
+        float  diff_time;
+
+        //! TOTAL time difference between starts and stops
+        float  total_time;
+
+        //! flag if the stop watch is running
+        bool running;
+
+        //! Number of times clock has been started
+        //! and stopped to allow averaging
+        int clock_sessions;
+
+        //! tick frequency
+        double  freq;
+
+        //! flag if the frequency has been set
+        bool  freq_set;
+};
+
+// functions, inlined
+
+////////////////////////////////////////////////////////////////////////////////
+//! Start time measurement
+////////////////////////////////////////////////////////////////////////////////
+inline void
+StopWatchWin::start()
+{
+    QueryPerformanceCounter((LARGE_INTEGER *) &start_time);
+    running = true;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Stop time measurement and increment add to the current diff_time summation
+//! variable. Also increment the number of times this clock has been run.
+////////////////////////////////////////////////////////////////////////////////
+inline void
+StopWatchWin::stop()
+{
+    QueryPerformanceCounter((LARGE_INTEGER *) &end_time);
+    diff_time = (float)
+                (((double) end_time.QuadPart - (double) start_time.QuadPart) / freq);
+
+    total_time += diff_time;
+    clock_sessions++;
+    running = false;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Reset the timer to 0. Does not change the timer running state but does
+//! recapture this point in time as the current start time if it is running.
+////////////////////////////////////////////////////////////////////////////////
+inline void
+StopWatchWin::reset()
+{
+    diff_time = 0;
+    total_time = 0;
+    clock_sessions = 0;
+
+    if (running)
+    {
+        QueryPerformanceCounter((LARGE_INTEGER *) &start_time);
+    }
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+//! Time in msec. after start. If the stop watch is still running (i.e. there
+//! was no call to stop()) then the elapsed time is returned added to the
+//! current diff_time sum, otherwise the current summed time difference alone
+//! is returned.
+////////////////////////////////////////////////////////////////////////////////
+inline float
+StopWatchWin::getTime()
+{
+    // Return the TOTAL time to date
+    float retval = total_time;
+
+    if (running)
+    {
+        LARGE_INTEGER temp;
+        QueryPerformanceCounter((LARGE_INTEGER *) &temp);
+        retval += (float)
+                  (((double)(temp.QuadPart - start_time.QuadPart)) / freq);
+    }
+
+    return retval;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Time in msec. for a single run based on the total number of COMPLETED runs
+//! and the total time.
+////////////////////////////////////////////////////////////////////////////////
+inline float
+StopWatchWin::getAverageTime()
+{
+    return (clock_sessions > 0) ? (total_time/clock_sessions) : 0.0f;
+}
+#else
+// Declarations for Stopwatch on Linux and Mac OSX
+// includes, system
+#include <ctime>
+#include <sys/time.h>
+
+//! Windows specific implementation of StopWatch
+class StopWatchLinux : public StopWatchInterface
+{
+    public:
+        //! Constructor, default
+        StopWatchLinux() :
+            start_time(), diff_time(0.0), total_time(0.0),
+            running(false), clock_sessions(0)
+        { };
+
+        // Destructor
+        virtual ~StopWatchLinux()
+        { };
+
+    public:
+        //! Start time measurement
+        inline void start();
+
+        //! Stop time measurement
+        inline void stop();
+
+        //! Reset time counters to zero
+        inline void reset();
+
+        //! Time in msec. after start. If the stop watch is still running (i.e. there
+        //! was no call to stop()) then the elapsed time is returned, otherwise the
+        //! time between the last start() and stop call is returned
+        inline float getTime();
+
+        //! Mean time to date based on the number of times the stopwatch has been
+        //! _stopped_ (ie finished sessions) and the current total time
+        inline float getAverageTime();
+
+    private:
+
+        // helper functions
+
+        //! Get difference between start time and current time
+        inline float getDiffTime();
+
+    private:
+
+        // member variables
+
+        //! Start of measurement
+        struct timeval  start_time;
+
+        //! Time difference between the last start and stop
+        float  diff_time;
+
+        //! TOTAL time difference between starts and stops
+        float  total_time;
+
+        //! flag if the stop watch is running
+        bool running;
+
+        //! Number of times clock has been started
+        //! and stopped to allow averaging
+        int clock_sessions;
+};
+
+// functions, inlined
+
+////////////////////////////////////////////////////////////////////////////////
+//! Start time measurement
+////////////////////////////////////////////////////////////////////////////////
+inline void
+StopWatchLinux::start()
+{
+    gettimeofday(&start_time, 0);
+    running = true;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Stop time measurement and increment add to the current diff_time summation
+//! variable. Also increment the number of times this clock has been run.
+////////////////////////////////////////////////////////////////////////////////
+inline void
+StopWatchLinux::stop()
+{
+    diff_time = getDiffTime();
+    total_time += diff_time;
+    running = false;
+    clock_sessions++;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Reset the timer to 0. Does not change the timer running state but does
+//! recapture this point in time as the current start time if it is running.
+////////////////////////////////////////////////////////////////////////////////
+inline void
+StopWatchLinux::reset()
+{
+    diff_time = 0;
+    total_time = 0;
+    clock_sessions = 0;
+
+    if (running)
+    {
+        gettimeofday(&start_time, 0);
+    }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Time in msec. after start. If the stop watch is still running (i.e. there
+//! was no call to stop()) then the elapsed time is returned added to the
+//! current diff_time sum, otherwise the current summed time difference alone
+//! is returned.
+////////////////////////////////////////////////////////////////////////////////
+inline float
+StopWatchLinux::getTime()
+{
+    // Return the TOTAL time to date
+    float retval = total_time;
+
+    if (running)
+    {
+        retval += getDiffTime();
+    }
+
+    return retval;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Time in msec. for a single run based on the total number of COMPLETED runs
+//! and the total time.
+////////////////////////////////////////////////////////////////////////////////
+inline float
+StopWatchLinux::getAverageTime()
+{
+    return (clock_sessions > 0) ? (total_time/clock_sessions) : 0.0f;
+}
+////////////////////////////////////////////////////////////////////////////////
+
+////////////////////////////////////////////////////////////////////////////////
+inline float
+StopWatchLinux::getDiffTime()
+{
+    struct timeval t_time;
+    gettimeofday(&t_time, 0);
+
+    // time difference in milli-seconds
+    return (float)(1000.0 * (t_time.tv_sec - start_time.tv_sec)
+                   + (0.001 * (t_time.tv_usec - start_time.tv_usec)));
+}
+#endif // WIN32
+
+////////////////////////////////////////////////////////////////////////////////
+//! Timer functionality exported
+
+////////////////////////////////////////////////////////////////////////////////
+//! Create a new timer
+//! @return true if a time has been created, otherwise false
+//! @param  name of the new timer, 0 if the creation failed
+////////////////////////////////////////////////////////////////////////////////
+inline bool
+sdkCreateTimer(StopWatchInterface **timer_interface)
+{
+    //printf("sdkCreateTimer called object %08x\n", (void *)*timer_interface);
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+    *timer_interface = (StopWatchInterface *)new StopWatchWin();
+#else
+    *timer_interface = (StopWatchInterface *)new StopWatchLinux();
+#endif
+    return (*timer_interface != NULL) ? true : false;
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+//! Delete a timer
+//! @return true if a time has been deleted, otherwise false
+//! @param  name of the timer to delete
+////////////////////////////////////////////////////////////////////////////////
+inline bool
+sdkDeleteTimer(StopWatchInterface **timer_interface)
+{
+    //printf("sdkDeleteTimer called object %08x\n", (void *)*timer_interface);
+    if (*timer_interface)
+    {
+        delete *timer_interface;
+        *timer_interface = NULL;
+    }
+
+    return true;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Start the time with name \a name
+//! @param name  name of the timer to start
+////////////////////////////////////////////////////////////////////////////////
+inline bool
+sdkStartTimer(StopWatchInterface **timer_interface)
+{
+    //printf("sdkStartTimer called object %08x\n", (void *)*timer_interface);
+    if (*timer_interface)
+    {
+        (*timer_interface)->start();
+    }
+
+    return true;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Stop the time with name \a name. Does not reset.
+//! @param name  name of the timer to stop
+////////////////////////////////////////////////////////////////////////////////
+inline bool
+sdkStopTimer(StopWatchInterface **timer_interface)
+{
+    // printf("sdkStopTimer called object %08x\n", (void *)*timer_interface);
+    if (*timer_interface)
+    {
+        (*timer_interface)->stop();
+    }
+
+    return true;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Resets the timer's counter.
+//! @param name  name of the timer to reset.
+////////////////////////////////////////////////////////////////////////////////
+inline bool
+sdkResetTimer(StopWatchInterface **timer_interface)
+{
+    // printf("sdkResetTimer called object %08x\n", (void *)*timer_interface);
+    if (*timer_interface)
+    {
+        (*timer_interface)->reset();
+    }
+
+    return true;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Return the average time for timer execution as the total time
+//! for the timer dividied by the number of completed (stopped) runs the timer
+//! has made.
+//! Excludes the current running time if the timer is currently running.
+//! @param name  name of the timer to return the time of
+////////////////////////////////////////////////////////////////////////////////
+inline float
+sdkGetAverageTimerValue(StopWatchInterface **timer_interface)
+{
+    //  printf("sdkGetAverageTimerValue called object %08x\n", (void *)*timer_interface);
+    if (*timer_interface)
+    {
+        return (*timer_interface)->getAverageTime();
+    }
+    else
+    {
+        return 0.0f;
+    }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Total execution time for the timer over all runs since the last reset
+//! or timer creation.
+//! @param name  name of the timer to obtain the value of.
+////////////////////////////////////////////////////////////////////////////////
+inline float
+sdkGetTimerValue(StopWatchInterface **timer_interface)
+{
+    // printf("sdkGetTimerValue called object %08x\n", (void *)*timer_interface);
+    if (*timer_interface)
+    {
+        return (*timer_interface)->getTime();
+    }
+    else
+    {
+        return 0.0f;
+    }
+}
+
+#endif // HELPER_TIMER_H
--- a/src/algorithms/tracking/libs/tracking_2nd_PLL_filter.cc
+++ b/src/algorithms/tracking/libs/tracking_2nd_PLL_filter.cc
@@ -94,7 +94,7 @@ Tracking_2nd_PLL_filter::Tracking_2nd_PLL_filter ()
 {
    //--- PLL variables --------------------------------------------------------
    d_pdi_carr = 0.001;// Summation interval for carrier
-    d_plldampingratio = 0.65;
+    d_plldampingratio = 0.7;
 }


--- a/src/core/receiver/CMakeLists.txt
+++ b/src/core/receiver/CMakeLists.txt
@@ -16,6 +16,11 @@
 # along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
 #

+if(ENABLE_CUDA)
+	FIND_PACKAGE(CUDA REQUIRED)
+	add_definitions(-DCUDA_GPU_ACCEL=1)
+endif(ENABLE_CUDA)
+
 set(GNSS_RECEIVER_SOURCES
     control_thread.cc
     control_message_factory.cc
@@ -70,8 +75,9 @@ include_directories(
     ${GFlags_INCLUDE_DIRS}
     ${Boost_INCLUDE_DIRS}
     ${GNURADIO_RUNTIME_INCLUDE_DIRS}
+     ${CUDA_INCLUDE_DIRS}
 )
-
+     
 if(Boost_VERSION LESS 105000)
     add_definitions(-DOLD_BOOST=1)
 endif(Boost_VERSION LESS 105000)
--- a/src/core/receiver/gnss_block_factory.cc
+++ b/src/core/receiver/gnss_block_factory.cc
@@ -77,6 +77,7 @@
 #include "galileo_e1_pcps_quicksync_ambiguous_acquisition.h"
 #include "galileo_e5a_noncoherent_iq_acquisition_caf.h"
 #include "gps_l1_ca_dll_pll_tracking.h"
+#include "gps_l1_ca_dll_pll_tracking_gpu.h"
 #include "gps_l1_ca_dll_pll_optim_tracking.h"
 #include "gps_l1_ca_dll_fll_pll_tracking.h"
 #include "gps_l1_ca_tcp_connector_tracking.h"
@@ -1611,6 +1612,14 @@ std::unique_ptr<TrackingInterface> GNSSBlockFactory::GetTrkBlock(
                    out_streams, queue));
            block = std::move(block_);
        }
+#if CUDA_GPU_ACCEL
+    else if (implementation.compare("GPS_L1_CA_DLL_PLL_Tracking_GPU") == 0)
+        {
+            std::unique_ptr<TrackingInterface> block_(new GpsL1CaDllPllTrackingGPU(configuration.get(), role, in_streams,
+                    out_streams, queue));
+            block = std::move(block_);
+        }
+#endif
    else
        {
            // Log fatal. This causes execution to stop.
--- a/src/main/CMakeLists.txt
+++ b/src/main/CMakeLists.txt
@@ -16,6 +16,7 @@
 # along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
 #

+
 set(GNSS_SDR_OPTIONAL_LIBS "")
 set(GNSS_SDR_OPTIONAL_HEADERS "")

@@ -32,6 +33,12 @@ if(ENABLE_UHD)
    set(GNSS_SDR_OPTIONAL_HEADERS ${GNSS_SDR_OPTIONAL_HEADERS} ${UHD_INCLUDE_DIRS})
 endif(ENABLE_UHD)

+if(ENABLE_CUDA)
+	FIND_PACKAGE(CUDA REQUIRED)
+    add_definitions(-DCUDA_GPU_ACCEL=1)
+endif(ENABLE_CUDA)
+
+
 include_directories(
     ${CMAKE_SOURCE_DIR}/src/core/system_parameters
     ${CMAKE_SOURCE_DIR}/src/core/interfaces
@@ -47,6 +54,7 @@ include_directories(
     ${GNURADIO_RUNTIME_INCLUDE_DIRS}
     ${GNSS_SDR_OPTIONAL_HEADERS}
     ${VOLK_GNSSSDR_INCLUDE_DIRS}
+     ${CUDA_INCLUDE_DIRS}
 )

 add_definitions( -DGNSS_SDR_VERSION="${VERSION}" )
@@ -78,6 +86,7 @@ target_link_libraries(gnss-sdr ${MAC_LIBRARIES}
                               ${GNSS_SDR_OPTIONAL_LIBS}
                               gnss_sp_libs
                               gnss_rx
+                               ${CUDA_LIBRARIES}
                               )


--- a/src/main/main.cc
+++ b/src/main/main.cc
@@ -68,6 +68,11 @@
 #include "sbas_ephemeris.h"
 #include "sbas_time.h"

+#if CUDA_GPU_ACCEL
+	// For the CUDA runtime routines (prefixed with "cuda_")
+	#include <cuda_runtime.h>
+#endif
+

 using google::LogMessage;

@@ -143,6 +148,17 @@ int main(int argc, char** argv)
    google::ParseCommandLineFlags(&argc, &argv, true);
    std::cout << "Initializing GNSS-SDR v" << gnss_sdr_version << " ... Please wait." << std::endl;

+	#if CUDA_GPU_ACCEL
+		// Reset the device
+		// cudaDeviceReset causes the driver to clean up all state. While
+		// not mandatory in normal operation, it is good practice.  It is also
+		// needed to ensure correct operation when the application is being
+		// profiled. Calling cudaDeviceReset causes all profile data to be
+		// flushed before the application exits
+		cudaDeviceReset();
+		 std::cout << "Reset CUDA device done " << std::endl;
+	#endif
+
    if(GOOGLE_STRIP_LOG == 0)
        {
            google::InitGoogleLogging(argv[0]);