Merge branch 'next' of git+ssh://github.com/gnss-sdr/gnss-sdr into next

# Please enter a commit message to explain why this merge is necessary, # especially if it merges an updated upstream into a topic branch. # # Lines starting with '#' will be ignored, and an empty message aborts # the commit.
2025-04-20 09:43:14 +00:00 · 2015-12-23 01:36:05 +01:00 · 2015-12-23 01:36:05 +01:00 · 24982b1213
commit 24982b1213
parent 5bc26b3840 9acc7e7a20
7 changed files with 262 additions and 364 deletions
--- a/conf/gnss-sdr_GPS_L1_gr_complex_gpu.conf
+++ b/conf/gnss-sdr_GPS_L1_gr_complex_gpu.conf
@ -17,10 +17,10 @@ ControlThread.wait_for_flowgraph=false
 SignalSource.implementation=File_Signal_Source

 ;#filename: path to file with the captured GNSS signal samples to be processed
-SignalSource.filename=/media/javier/SISTEMA/signals/New York/4msps.dat
+SignalSource.filename=/home/javier/ClionProjects/gnss-sim/build/signal_out.bin

 ;#item_type: Type and resolution for each of the signal samples. Use only gr_complex in this version.
-SignalSource.item_type=gr_complex
+SignalSource.item_type=byte

 ;#sampling_frequency: Original Signal sampling frequency in [Hz] 
 SignalSource.sampling_frequency=4000000
@ -28,12 +28,6 @@ SignalSource.sampling_frequency=4000000
 ;#freq: RF front-end center frequency in [Hz] 
 SignalSource.freq=1575420000

-;#gain: Front-end Gain in [dB] 
-SignalSource.gain=60
-
-;#subdevice: UHD subdevice specification (for USRP1 use A:0 or B:0)
-SignalSource.subdevice=B:0
-
 ;#samples: Number of samples to be processed. Notice that 0 indicates the entire file.
 SignalSource.samples=0

@ -58,12 +52,12 @@ SignalSource.enable_throttle_control=false
 ;#[Pass_Through] disables this block and the [DataTypeAdapter], [InputFilter] and [Resampler] blocks
 ;#[Signal_Conditioner] enables this block. Then you have to configure [DataTypeAdapter], [InputFilter] and [Resampler] blocks
 ;SignalConditioner.implementation=Signal_Conditioner
-SignalConditioner.implementation=Pass_Through
+SignalConditioner.implementation=Signal_Conditioner

 ;######### DATA_TYPE_ADAPTER CONFIG ############
 ;## Changes the type of input data. Please disable it in this version.
 ;#implementation: [Pass_Through] disables this block
-DataTypeAdapter.implementation=Pass_Through
+DataTypeAdapter.implementation=Ibyte_To_Complex

 ;######### INPUT_FILTER CONFIG ############
 ;## Filter the input data. Can be combined with frequency translation for IF signals
@ -165,7 +159,7 @@ Resampler.sample_freq_out=4000000

 ;######### CHANNELS GLOBAL CONFIG ############
 ;#count: Number of available GPS satellite channels.
-Channels_GPS.count=8
+Channels_GPS.count=12
 ;#count: Number of available Galileo satellite channels.
 Channels_Galileo.count=0
 ;#in_acquisition: Number of channels simultaneously acquiring for the whole receiver
@ -210,13 +204,13 @@ Acquisition_GPS.sampled_ms=1
 ;#implementation: Acquisition algorithm selection for this channel: [GPS_L1_CA_PCPS_Acquisition] or [Galileo_E1_PCPS_Ambiguous_Acquisition]
 Acquisition_GPS.implementation=GPS_L1_CA_PCPS_Acquisition
 ;#threshold: Acquisition threshold
-Acquisition_GPS.threshold=0.005
+Acquisition_GPS.threshold=0.01
 ;#pfa: Acquisition false alarm probability. This option overrides the threshold option. Only use with implementations: [GPS_L1_CA_PCPS_Acquisition] or [Galileo_E1_PCPS_Ambiguous_Acquisition] 
 ;Acquisition_GPS.pfa=0.01
 ;#doppler_max: Maximum expected Doppler shift [Hz]
-Acquisition_GPS.doppler_max=10000
+Acquisition_GPS.doppler_max=6000
 ;#doppler_max: Doppler step in the grid search [Hz]
-Acquisition_GPS.doppler_step=500
+Acquisition_GPS.doppler_step=100

 ;######### TRACKING GLOBAL CONFIG ############

@ -235,7 +229,7 @@ Tracking_GPS.dump=true
 Tracking_GPS.dump_filename=../data/epl_tracking_ch_

 ;#pll_bw_hz: PLL loop filter bandwidth [Hz]
-Tracking_GPS.pll_bw_hz=55.0;
+Tracking_GPS.pll_bw_hz=15.0;

 ;#dll_bw_hz: DLL loop filter bandwidth [Hz]
 Tracking_GPS.dll_bw_hz=1.5
--- a/conf/gnss-sdr_Hybrid_byte_sim.conf
+++ b/conf/gnss-sdr_Hybrid_byte_sim.conf
@ -198,7 +198,7 @@ Acquisition_1C.sampled_ms=1
 ;#implementation: Acquisition algorithm selection for this channel: [GPS_L1_CA_PCPS_Acquisition] or [Galileo_E1_PCPS_Ambiguous_Acquisition]
 Acquisition_1C.implementation=GPS_L1_CA_PCPS_Acquisition
 ;#threshold: Acquisition threshold
-Acquisition_1C.threshold=0.035
+Acquisition_1C.threshold=0.045
 ;#pfa: Acquisition false alarm probability. This option overrides the threshold option. Only use with implementations: [GPS_L1_CA_PCPS_Acquisition] or [Galileo_E1_PCPS_Ambiguous_Acquisition] 
 ;Acquisition_1C.pfa=0.01
 ;#doppler_max: Maximum expected Doppler shift [Hz]
@ -233,7 +233,7 @@ Acquisition_1B.doppler_step=125
 ;######### TRACKING GPS CONFIG ############

 ;#implementation: Selected tracking algorithm: [GPS_L1_CA_DLL_PLL_Tracking] or [GPS_L1_CA_DLL_FLL_PLL_Tracking] or [GPS_L1_CA_TCP_CONNECTOR_Tracking] or [Galileo_E1_DLL_PLL_VEML_Tracking]
-Tracking_1C.implementation=GPS_L1_CA_DLL_PLL_Artemisa_Tracking
+Tracking_1C.implementation=GPS_L1_CA_DLL_PLL_C_Aid_Tracking
 ;#item_type: Type and resolution for each of the signal samples. Use only [gr_complex] in this version.
 Tracking_1C.item_type=gr_complex

--- a/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_gpu_cc.cc
+++ b/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_gpu_cc.cc
@ -1,13 +1,8 @@
 /*!
 * \file gps_l1_ca_dll_pll_tracking_gpu_cc.cc
- * \brief Implementation of a code DLL + carrier PLL tracking block, GPU ACCELERATED
+ * \brief Implementation of a code DLL + carrier PLL tracking block GPU ACCELERATED
 * \author Javier Arribas, 2015. jarribas(at)cttc.es
 *
- * Code DLL + carrier PLL according to the algorithms described in:
- * [1] K.Borre, D.M.Akos, N.Bertelsen, P.Rinder, and S.H.Jensen,
- * A Software-Defined GPS and Galileo Receiver. A Single-Frequency
- * Approach, Birkhauser, 2007
- *
 * -------------------------------------------------------------------------
 *
 * Copyright (C) 2010-2015  (see AUTHORS file for a list of contributors)
@ -40,6 +35,7 @@
 #include <sstream>
 #include <boost/lexical_cast.hpp>
 #include <gnuradio/io_signature.h>
+#include <volk/volk.h>
 #include <glog/logging.h>
 #include "gnss_synchro.h"
 #include "gps_sdr_signal_processing.h"
@ -47,7 +43,6 @@
 #include "lock_detectors.h"
 #include "GPS_L1_CA.h"
 #include "control_message_factory.h"
-#include <volk/volk.h> //volk_alignement
 // includes
 #include <cuda_profiler_api.h>

@ -80,6 +75,7 @@ gps_l1_ca_dll_pll_make_tracking_gpu_cc(
 }


+
 void Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::forecast (int noutput_items,
        gr_vector_int &ninput_items_required)
 {
@ -111,10 +107,11 @@ Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc(
    d_fs_in = fs_in;
    d_vector_length = vector_length;
    d_dump_filename = dump_filename;
+    d_correlation_length_samples = static_cast<int>(d_vector_length);

    // Initialize tracking  ==========================================
    d_code_loop_filter.set_DLL_BW(dll_bw_hz);
-    d_carrier_loop_filter.set_PLL_BW(pll_bw_hz);
+    d_carrier_loop_filter.set_params(10.0, pll_bw_hz,2);

    //--- DLL variables --------------------------------------------------------
    d_early_late_spc_chips = early_late_space_chips; // Define early-late offset (in chips)
@ -123,32 +120,33 @@ Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc(
    cudaSetDeviceFlags(cudaDeviceMapHost);
    //allocate host memory
    //pinned memory mode - use special function to get OS-pinned memory
-    int N_CORRELATORS = 3;
+    d_n_correlator_taps = 3; // Early, Prompt, and Late
    // Get space for a vector with the C/A code replica sampled 1x/chip
-    cudaHostAlloc((void**)&d_ca_code, (GPS_L1_CA_CODE_LENGTH_CHIPS* sizeof(gr_complex)), cudaHostAllocMapped || cudaHostAllocWriteCombined);
+    cudaHostAlloc((void**)&d_ca_code, (static_cast<int>(GPS_L1_CA_CODE_LENGTH_CHIPS)* sizeof(gr_complex)), cudaHostAllocMapped || cudaHostAllocWriteCombined);
    // Get space for the resampled early / prompt / late local replicas
-    cudaHostAlloc((void**)&d_local_code_shift_chips, N_CORRELATORS * sizeof(float),  cudaHostAllocMapped || cudaHostAllocWriteCombined);
+    cudaHostAlloc((void**)&d_local_code_shift_chips, d_n_correlator_taps * sizeof(float),  cudaHostAllocMapped || cudaHostAllocWriteCombined);
    cudaHostAlloc((void**)&in_gpu, 2 * d_vector_length * sizeof(gr_complex), cudaHostAllocMapped || cudaHostAllocWriteCombined);
    // correlator outputs (scalar)
-    cudaHostAlloc((void**)&d_corr_outs_gpu ,sizeof(gr_complex)*N_CORRELATORS, cudaHostAllocMapped ||  cudaHostAllocWriteCombined );
+    cudaHostAlloc((void**)&d_correlator_outs ,sizeof(gr_complex)*d_n_correlator_taps, cudaHostAllocMapped ||  cudaHostAllocWriteCombined );
+
+    // Set TAPs delay values [chips]
+    d_local_code_shift_chips[0] = - d_early_late_spc_chips;
+    d_local_code_shift_chips[1] = 0.0;
+    d_local_code_shift_chips[2] = d_early_late_spc_chips;

-    //map to EPL pointers
-    d_Early = &d_corr_outs_gpu[0];
-    d_Prompt =  &d_corr_outs_gpu[1];
-    d_Late = &d_corr_outs_gpu[2];

    //--- Perform initializations ------------------------------
    multicorrelator_gpu = new cuda_multicorrelator();
    //local code resampler on GPU
-    multicorrelator_gpu->init_cuda_integrated_resampler(2 * d_vector_length, GPS_L1_CA_CODE_LENGTH_CHIPS, 3);
-    multicorrelator_gpu->set_input_output_vectors(d_corr_outs_gpu, in_gpu);
+    multicorrelator_gpu->init_cuda_integrated_resampler(2 * d_vector_length, GPS_L1_CA_CODE_LENGTH_CHIPS, d_n_correlator_taps);
+    multicorrelator_gpu->set_input_output_vectors(d_correlator_outs, in_gpu);

    // define initial code frequency basis of NCO
    d_code_freq_chips = GPS_L1_CA_CODE_RATE_HZ;
    // define residual code phase (in chips)
    d_rem_code_phase_samples = 0.0;
    // define residual carrier phase
-    d_rem_carr_phase_rad = 0.0;
+    d_rem_carrier_phase_rad = 0.0;

    // sample synchronization
    d_sample_counter = 0;
@ -159,8 +157,6 @@ Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc(
    d_pull_in = false;
    d_last_seg = 0;

-    d_current_prn_length_samples = static_cast<int>(d_vector_length);
-
    // CN0 estimation and lock detector buffers
    d_cn0_estimation_counter = 0;
    d_Prompt_buffer = new gr_complex[CN0_ESTIMATION_SAMPLES];
@ -172,8 +168,7 @@ Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc(
    systemName["G"] = std::string("GPS");
    systemName["S"] = std::string("SBAS");

-
-    set_relative_rate(1.0/((double)d_vector_length*2));
+    set_relative_rate(1.0 / (static_cast<double>(d_vector_length) * 2.0));

    d_channel_internal_queue = 0;
    d_acquisition_gnss_synchro = 0;
@ -181,9 +176,13 @@ Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc(
    d_acq_code_phase_samples = 0.0;
    d_acq_carrier_doppler_hz = 0.0;
    d_carrier_doppler_hz = 0.0;
-    d_acc_carrier_phase_rad = 0.0;
+    d_acc_carrier_phase_cycles = 0.0;
    d_code_phase_samples = 0.0;
-    d_acc_code_phase_secs = 0.0;
+
+    d_pll_to_dll_assist_secs_Ti = 0.0;
+    d_rem_code_phase_chips = 0.0;
+    d_code_phase_step_chips = 0.0;
+    d_carrier_phase_step_rad = 0.0;
    //set_min_output_buffer((long int)300);
 }

@ -195,33 +194,34 @@ void Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::start_tracking()
     */
    d_acq_code_phase_samples = d_acquisition_gnss_synchro->Acq_delay_samples;
    d_acq_carrier_doppler_hz = d_acquisition_gnss_synchro->Acq_doppler_hz;
-    d_acq_sample_stamp =  d_acquisition_gnss_synchro->Acq_samplestamp_samples;
+    d_acq_sample_stamp = d_acquisition_gnss_synchro->Acq_samplestamp_samples;

    long int acq_trk_diff_samples;
-    float acq_trk_diff_seconds;
+    double acq_trk_diff_seconds;
    acq_trk_diff_samples = static_cast<long int>(d_sample_counter) - static_cast<long int>(d_acq_sample_stamp);//-d_vector_length;
    DLOG(INFO) << "Number of samples between Acquisition and Tracking =" << acq_trk_diff_samples;
-    acq_trk_diff_seconds = static_cast<float>(acq_trk_diff_samples) / static_cast<float>(d_fs_in);
+    acq_trk_diff_seconds = static_cast<double>(acq_trk_diff_samples) / static_cast<double>(d_fs_in);
    //doppler effect
    // Fd=(C/(C+Vr))*F
-    float radial_velocity = (GPS_L1_FREQ_HZ + d_acq_carrier_doppler_hz) / GPS_L1_FREQ_HZ;
+    double radial_velocity = (GPS_L1_FREQ_HZ + d_acq_carrier_doppler_hz) / GPS_L1_FREQ_HZ;
    // new chip and prn sequence periods based on acq Doppler
-    float T_chip_mod_seconds;
-    float T_prn_mod_seconds;
-    float T_prn_mod_samples;
+    double T_chip_mod_seconds;
+    double T_prn_mod_seconds;
+    double T_prn_mod_samples;
    d_code_freq_chips = radial_velocity * GPS_L1_CA_CODE_RATE_HZ;
+    d_code_phase_step_chips = static_cast<double>(d_code_freq_chips) / static_cast<double>(d_fs_in);
    T_chip_mod_seconds = 1/d_code_freq_chips;
    T_prn_mod_seconds = T_chip_mod_seconds * GPS_L1_CA_CODE_LENGTH_CHIPS;
-    T_prn_mod_samples = T_prn_mod_seconds * static_cast<float>(d_fs_in);
+    T_prn_mod_samples = T_prn_mod_seconds * static_cast<double>(d_fs_in);

-    d_current_prn_length_samples = round(T_prn_mod_samples);
+    d_correlation_length_samples = round(T_prn_mod_samples);

-    float T_prn_true_seconds = GPS_L1_CA_CODE_LENGTH_CHIPS / GPS_L1_CA_CODE_RATE_HZ;
-    float T_prn_true_samples = T_prn_true_seconds * static_cast<float>(d_fs_in);
-    float T_prn_diff_seconds=  T_prn_true_seconds - T_prn_mod_seconds;
-    float N_prn_diff = acq_trk_diff_seconds / T_prn_true_seconds;
-    float corrected_acq_phase_samples, delay_correction_samples;
-    corrected_acq_phase_samples = fmod((d_acq_code_phase_samples + T_prn_diff_seconds * N_prn_diff * static_cast<float>(d_fs_in)), T_prn_true_samples);
+    double T_prn_true_seconds = GPS_L1_CA_CODE_LENGTH_CHIPS / GPS_L1_CA_CODE_RATE_HZ;
+    double T_prn_true_samples = T_prn_true_seconds * static_cast<double>(d_fs_in);
+    double T_prn_diff_seconds = T_prn_true_seconds - T_prn_mod_seconds;
+    double N_prn_diff = acq_trk_diff_seconds / T_prn_true_seconds;
+    double corrected_acq_phase_samples, delay_correction_samples;
+    corrected_acq_phase_samples = fmod((d_acq_code_phase_samples + T_prn_diff_seconds * N_prn_diff * static_cast<double>(d_fs_in)), T_prn_true_samples);
    if (corrected_acq_phase_samples < 0)
        {
            corrected_acq_phase_samples = T_prn_mod_samples + corrected_acq_phase_samples;
@ -232,25 +232,28 @@ void Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::start_tracking()

    d_carrier_doppler_hz = d_acq_carrier_doppler_hz;

+    d_carrier_phase_step_rad = GPS_TWO_PI * d_carrier_doppler_hz / static_cast<double>(d_fs_in);
+
    // DLL/PLL filter initialization
-    d_carrier_loop_filter.initialize(); // initialize the carrier filter
+    d_carrier_loop_filter.initialize(d_acq_carrier_doppler_hz); //The carrier loop filter implements the Doppler accumulator
    d_code_loop_filter.initialize();    // initialize the code filter

    // generate local reference ALWAYS starting at chip 1 (1 sample per chip)
    gps_l1_ca_code_gen_complex(d_ca_code, d_acquisition_gnss_synchro->PRN, 0);

-    d_local_code_shift_chips[0] = - d_early_late_spc_chips;
-    d_local_code_shift_chips[1] = 0.0;
-    d_local_code_shift_chips[2] = d_early_late_spc_chips;
+    multicorrelator_gpu->set_local_code_and_taps(static_cast<int>(GPS_L1_CA_CODE_LENGTH_CHIPS), d_ca_code, d_local_code_shift_chips, d_n_correlator_taps);

-    multicorrelator_gpu->set_local_code_and_taps(GPS_L1_CA_CODE_LENGTH_CHIPS, d_ca_code, d_local_code_shift_chips, 3);
+    for (int n = 0; n < d_n_correlator_taps; n++)
+        {
+            d_correlator_outs[n] = gr_complex(0,0);
+        }

    d_carrier_lock_fail_counter = 0;
-    d_rem_code_phase_samples = 0;
-    d_rem_carr_phase_rad = 0;
-    d_acc_carrier_phase_rad = 0;
-    d_acc_code_phase_secs = 0;
-
+    d_rem_code_phase_samples = 0.0;
+    d_rem_carrier_phase_rad = 0.0;
+    d_rem_code_phase_chips = 0.0;
+    d_acc_carrier_phase_cycles = 0.0;
+    d_pll_to_dll_assist_secs_Ti = 0.0;
    d_code_phase_samples = d_acq_code_phase_samples;

    std::string sys_ = &d_acquisition_gnss_synchro->System;
@ -273,14 +276,15 @@ void Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::start_tracking()

 Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::~Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc()
 {
+
    d_dump_file.close();
    cudaFreeHost(in_gpu);
-    cudaFreeHost(d_corr_outs_gpu);
+    cudaFreeHost(d_correlator_outs);
    cudaFreeHost(d_local_code_shift_chips);
    cudaFreeHost(d_ca_code);
    multicorrelator_gpu->free_cuda();
-    delete(multicorrelator_gpu);
    delete[] d_Prompt_buffer;
+    delete(multicorrelator_gpu);
 }


@ -288,29 +292,34 @@ Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::~Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc()
 int Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::general_work (int noutput_items, gr_vector_int &ninput_items,
        gr_vector_const_void_star &input_items, gr_vector_void_star &output_items)
 {
-    // process vars
-    float carr_error_hz = 0.0;
-    float carr_error_filt_hz = 0.0;
-    float code_error_chips = 0.0;
-    float code_error_filt_chips = 0.0;
-
    // Block input data and block output stream pointers
-    const gr_complex* in = (gr_complex*) input_items[0];
+    const gr_complex* in = (gr_complex*) input_items[0]; //PRN start block alignment
    Gnss_Synchro **out = (Gnss_Synchro **) &output_items[0];

    // GNSS_SYNCHRO OBJECT to interchange data between tracking->telemetry_decoder
    Gnss_Synchro current_synchro_data = Gnss_Synchro();

+    // process vars
+    double code_error_chips_Ti = 0.0;
+    double code_error_filt_chips = 0.0;
+    double code_error_filt_secs_Ti = 0.0;
+    double CURRENT_INTEGRATION_TIME_S;
+    double CORRECTED_INTEGRATION_TIME_S;
+    double dll_code_error_secs_Ti = 0.0;
+    double carr_phase_error_secs_Ti = 0.0;
+    double old_d_rem_code_phase_samples;
    if (d_enable_tracking == true)
        {
            // Receiver signal alignment
            if (d_pull_in == true)
                {
                    int samples_offset;
+                    double acq_trk_shif_correction_samples;
                    int acq_to_trk_delay_samples;
                    acq_to_trk_delay_samples = d_sample_counter - d_acq_sample_stamp;
-                    samples_offset = round(d_acq_code_phase_samples)+d_current_prn_length_samples - acq_to_trk_delay_samples%d_current_prn_length_samples;
-                    d_sample_counter = d_sample_counter + samples_offset; //count for the processed samples
+                    acq_trk_shif_correction_samples = d_correlation_length_samples - fmod(static_cast<double>(acq_to_trk_delay_samples), static_cast<double>(d_correlation_length_samples));
+                    samples_offset = round(d_acq_code_phase_samples + acq_trk_shif_correction_samples);
+                    d_sample_counter += samples_offset; //count for the processed samples
                    d_pull_in = false;
                    // Fill the acquisition data
                    current_synchro_data = *d_acquisition_gnss_synchro;
@ -322,42 +331,44 @@ int Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::general_work (int noutput_items, gr_vecto
            // Fill the acquisition data
            current_synchro_data = *d_acquisition_gnss_synchro;

-            // UPDATE NCO COMMAND
-            float phase_step_rad = static_cast<float>(GPS_TWO_PI) * d_carrier_doppler_hz / static_cast<float>(d_fs_in);
+            // ################# CARRIER WIPEOFF AND CORRELATORS ##############################
+            // perform carrier wipe-off and compute Early, Prompt and Late correlation

-        	//code resampler on GPU (new)
-            float code_phase_step_chips = static_cast<float>(d_code_freq_chips) / static_cast<float>(d_fs_in);
-            float rem_code_phase_chips = d_rem_code_phase_samples * (d_code_freq_chips / d_fs_in);
-
-            memcpy(in_gpu, in, sizeof(gr_complex) * d_current_prn_length_samples);
+            memcpy(in_gpu, in, sizeof(gr_complex) * d_correlation_length_samples);
            cudaProfilerStart();
-            multicorrelator_gpu->Carrier_wipeoff_multicorrelator_resampler_cuda(d_rem_carr_phase_rad, phase_step_rad, code_phase_step_chips, rem_code_phase_chips, d_current_prn_length_samples, 3);
+            multicorrelator_gpu->Carrier_wipeoff_multicorrelator_resampler_cuda( static_cast<float>(d_rem_carrier_phase_rad),
+            		static_cast<float>(d_carrier_phase_step_rad),
+            		static_cast<float>(d_code_phase_step_chips),
+            		static_cast<float>(d_rem_code_phase_chips),
+            		d_correlation_length_samples, d_n_correlator_taps);
            cudaProfilerStop();
+            //std::cout<<"c_out[0]="<<d_correlator_outs[0]<<"c_out[1]="<<d_correlator_outs[1]<<"c_out[2]="<<d_correlator_outs[2]<<std::endl;
+
+            // UPDATE INTEGRATION TIME
+            CURRENT_INTEGRATION_TIME_S = static_cast<double>(d_correlation_length_samples) / static_cast<double>(d_fs_in);

            // ################## PLL ##########################################################
-            // PLL discriminator
-            carr_error_hz = pll_cloop_two_quadrant_atan(*d_Prompt) / static_cast<float>(GPS_TWO_PI);
+            // Update PLL discriminator [rads/Ti -> Secs/Ti]
+            carr_phase_error_secs_Ti = pll_cloop_two_quadrant_atan(d_correlator_outs[1]) / GPS_TWO_PI; //prompt output
            // Carrier discriminator filter
-            carr_error_filt_hz = d_carrier_loop_filter.get_carrier_nco(carr_error_hz);
-            // New carrier Doppler frequency estimation
-            d_carrier_doppler_hz = d_acq_carrier_doppler_hz + carr_error_filt_hz;
-            // New code Doppler frequency estimation
+            // NOTICE: The carrier loop filter includes the Carrier Doppler accumulator, as described in Kaplan
+            //d_carrier_doppler_hz = d_acq_carrier_doppler_hz + carr_phase_error_filt_secs_ti/INTEGRATION_TIME;
+            // Input [s/Ti] -> output [Hz]
+            d_carrier_doppler_hz = d_carrier_loop_filter.get_carrier_error(0.0, carr_phase_error_secs_Ti, CURRENT_INTEGRATION_TIME_S);
+            // PLL to DLL assistance [Secs/Ti]
+            d_pll_to_dll_assist_secs_Ti = (d_carrier_doppler_hz * CURRENT_INTEGRATION_TIME_S) / GPS_L1_FREQ_HZ;
+            // code Doppler frequency update
            d_code_freq_chips = GPS_L1_CA_CODE_RATE_HZ + ((d_carrier_doppler_hz * GPS_L1_CA_CODE_RATE_HZ) / GPS_L1_FREQ_HZ);
-            //carrier phase accumulator for (K) doppler estimation
-            d_acc_carrier_phase_rad -= GPS_TWO_PI * d_carrier_doppler_hz * GPS_L1_CA_CODE_PERIOD;
-            //remanent carrier phase to prevent overflow in the code NCO
-            d_rem_carr_phase_rad = d_rem_carr_phase_rad + GPS_TWO_PI * d_carrier_doppler_hz * GPS_L1_CA_CODE_PERIOD;
-            d_rem_carr_phase_rad = fmod(d_rem_carr_phase_rad, GPS_TWO_PI);

            // ################## DLL ##########################################################
            // DLL discriminator
-            code_error_chips = dll_nc_e_minus_l_normalized(*d_Early, *d_Late); //[chips/Ti]
+            code_error_chips_Ti = dll_nc_e_minus_l_normalized(d_correlator_outs[0], d_correlator_outs[2]); //[chips/Ti] //early and late
            // Code discriminator filter
-            code_error_filt_chips = d_code_loop_filter.get_code_nco(code_error_chips); //[chips/second]
-            //Code phase accumulator
-            float code_error_filt_secs;
-            code_error_filt_secs = (GPS_L1_CA_CODE_PERIOD * code_error_filt_chips) / GPS_L1_CA_CODE_RATE_HZ; //[seconds]
-            d_acc_code_phase_secs = d_acc_code_phase_secs + code_error_filt_secs;
+            code_error_filt_chips = d_code_loop_filter.get_code_nco(code_error_chips_Ti); //input [chips/Ti] -> output [chips/second]
+            code_error_filt_secs_Ti = code_error_filt_chips*CURRENT_INTEGRATION_TIME_S/d_code_freq_chips; // [s/Ti]
+            // DLL code error estimation [s/Ti]
+            // TODO: PLL carrier aid to DLL is disabled. Re-enable it and measure performance
+            dll_code_error_secs_Ti = - code_error_filt_secs_Ti + d_pll_to_dll_assist_secs_Ti;

            // ################## CARRIER AND CODE NCO BUFFER ALIGNEMENT #######################
            // keep alignment parameters for the next input buffer
@ -366,17 +377,38 @@ int Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::general_work (int noutput_items, gr_vecto
            double T_prn_samples;
            double K_blk_samples;
            // Compute the next buffer length based in the new period of the PRN sequence and the code phase error estimation
-            T_chip_seconds = 1 / static_cast<double>(d_code_freq_chips);
+            T_chip_seconds = 1 / d_code_freq_chips;
            T_prn_seconds = T_chip_seconds * GPS_L1_CA_CODE_LENGTH_CHIPS;
            T_prn_samples = T_prn_seconds * static_cast<double>(d_fs_in);
-            K_blk_samples = T_prn_samples + d_rem_code_phase_samples + static_cast<double>(code_error_filt_secs) * static_cast<double>(d_fs_in);
-            //d_rem_code_phase_samples = K_blk_samples - d_current_prn_length_samples; //rounding error < 1 sample
+            K_blk_samples = T_prn_samples + d_rem_code_phase_samples - dll_code_error_secs_Ti * static_cast<double>(d_fs_in);

-            // ####### CN0 ESTIMATION AND LOCK DETECTORS ######
+            d_correlation_length_samples = round(K_blk_samples); //round to a discrete samples
+            old_d_rem_code_phase_samples=d_rem_code_phase_samples;
+            d_rem_code_phase_samples = K_blk_samples - static_cast<double>(d_correlation_length_samples); //rounding error < 1 sample
+
+            // UPDATE REMNANT CARRIER PHASE
+            CORRECTED_INTEGRATION_TIME_S=(static_cast<double>(d_correlation_length_samples)/static_cast<double>(d_fs_in));
+            //remnant carrier phase [rad]
+            d_rem_carrier_phase_rad = fmod(d_rem_carrier_phase_rad + GPS_TWO_PI * d_carrier_doppler_hz * CORRECTED_INTEGRATION_TIME_S, GPS_TWO_PI);
+            // UPDATE CARRIER PHASE ACCUULATOR
+            //carrier phase accumulator prior to update the PLL estimators (accumulated carrier in this loop depends on the old estimations!)
+            d_acc_carrier_phase_cycles -= d_carrier_doppler_hz * CORRECTED_INTEGRATION_TIME_S;
+
+            //################### PLL COMMANDS #################################################
+            //carrier phase step (NCO phase increment per sample) [rads/sample]
+            d_carrier_phase_step_rad = GPS_TWO_PI * d_carrier_doppler_hz / static_cast<double>(d_fs_in);
+
+            //################### DLL COMMANDS #################################################
+            //code phase step (Code resampler phase increment per sample) [chips/sample]
+            d_code_phase_step_chips = d_code_freq_chips / static_cast<double>(d_fs_in);
+            //remnant code phase [chips]
+            d_rem_code_phase_chips = d_rem_code_phase_samples * (d_code_freq_chips / static_cast<double>(d_fs_in));
+
+            // ####### CN0 ESTIMATION AND LOCK DETECTORS #######################################
            if (d_cn0_estimation_counter < CN0_ESTIMATION_SAMPLES)
                {
                    // fill buffer with prompt correlator output values
-                    d_Prompt_buffer[d_cn0_estimation_counter] = *d_Prompt;
+                    d_Prompt_buffer[d_cn0_estimation_counter] = d_correlator_outs[1]; //prompt
                    d_cn0_estimation_counter++;
                }
            else
@ -408,26 +440,17 @@ int Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::general_work (int noutput_items, gr_vecto
                            d_enable_tracking = false; // TODO: check if disabling tracking is consistent with the channel state machine
                        }
                }
+
            // ########### Output the tracking data to navigation and PVT ##########
-            current_synchro_data.Prompt_I = static_cast<double>((*d_Prompt).real());
-            current_synchro_data.Prompt_Q = static_cast<double>((*d_Prompt).imag());
-
-            // Tracking_timestamp_secs is aligned with the NEXT PRN start sample (Hybridization problem!)
-            //compute remnant code phase samples BEFORE the Tracking timestamp
-            //d_rem_code_phase_samples = K_blk_samples - d_current_prn_length_samples; //rounding error < 1 sample
-            //current_synchro_data.Tracking_timestamp_secs = ((double)d_sample_counter + (double)d_current_prn_length_samples + (double)d_rem_code_phase_samples)/static_cast<double>(d_fs_in);
-
-            // Tracking_timestamp_secs is aligned with the CURRENT PRN start sample (Hybridization OK!, but some glitches??)
-            current_synchro_data.Tracking_timestamp_secs = (static_cast<double>(d_sample_counter) + static_cast<double>(d_rem_code_phase_samples)) / static_cast<double>(d_fs_in);
-            //compute remnant code phase samples AFTER the Tracking timestamp
-            d_rem_code_phase_samples = K_blk_samples - d_current_prn_length_samples; //rounding error < 1 sample
-
-            //current_synchro_data.Tracking_timestamp_secs = ((double)d_sample_counter)/static_cast<double>(d_fs_in);
+            current_synchro_data.Prompt_I = static_cast<double>((d_correlator_outs[1]).real());
+            current_synchro_data.Prompt_Q = static_cast<double>((d_correlator_outs[1]).imag());
+            // Tracking_timestamp_secs is aligned with the CURRENT PRN start sample (Hybridization OK!)
+            current_synchro_data.Tracking_timestamp_secs = (static_cast<double>(d_sample_counter) + old_d_rem_code_phase_samples) / static_cast<double>(d_fs_in);
            // This tracking block aligns the Tracking_timestamp_secs with the start sample of the PRN, thus, Code_phase_secs=0
            current_synchro_data.Code_phase_secs = 0;
-            current_synchro_data.Carrier_phase_rads = static_cast<double>(d_acc_carrier_phase_rad);
-            current_synchro_data.Carrier_Doppler_hz = static_cast<double>(d_carrier_doppler_hz);
-            current_synchro_data.CN0_dB_hz = static_cast<double>(d_CN0_SNV_dB_Hz);
+            current_synchro_data.Carrier_phase_rads = GPS_TWO_PI * d_acc_carrier_phase_cycles;
+            current_synchro_data.Carrier_Doppler_hz = d_carrier_doppler_hz;
+            current_synchro_data.CN0_dB_hz = d_CN0_SNV_dB_Hz;
            current_synchro_data.Flag_valid_pseudorange = false;
            *out[0] = current_synchro_data;

@ -443,7 +466,7 @@ int Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::general_work (int noutput_items, gr_vecto
                            d_last_seg = floor(d_sample_counter / d_fs_in);
                            std::cout << "Current input signal time = " << d_last_seg << " [s]" << std::endl;
                            DLOG(INFO) << "GPS L1 C/A Tracking CH " << d_channel <<  ": Satellite " << Gnss_Satellite(systemName[sys], d_acquisition_gnss_synchro->PRN)
-                                      << ", CN0 = " << d_CN0_SNV_dB_Hz << " [dB-Hz]" << std::endl;
+                                              << ", CN0 = " << d_CN0_SNV_dB_Hz << " [dB-Hz]" << std::endl;
                            //if (d_last_seg==5) d_carrier_lock_fail_counter=500; //DEBUG: force unlock!
                        }
                }
@ -453,7 +476,7 @@ int Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::general_work (int noutput_items, gr_vecto
                        {
                            d_last_seg = floor(d_sample_counter / d_fs_in);
                            DLOG(INFO) << "Tracking CH " << d_channel <<  ": Satellite " << Gnss_Satellite(systemName[sys], d_acquisition_gnss_synchro->PRN)
-                                       << ", CN0 = " << d_CN0_SNV_dB_Hz << " [dB-Hz]";
+                                               << ", CN0 = " << d_CN0_SNV_dB_Hz << " [dB-Hz]";
                        }
                }
        }
@ -476,9 +499,10 @@ int Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::general_work (int noutput_items, gr_vecto
                            std::cout << tmp_str_stream.rdbuf() << std::flush;
                        }
                }
-            *d_Early = gr_complex(0,0);
-            *d_Prompt = gr_complex(0,0);
-            *d_Late = gr_complex(0,0);
+            for (int n = 0; n < d_n_correlator_taps; n++)
+                {
+                    d_correlator_outs[n] = gr_complex(0,0);
+                }

            current_synchro_data.System = {'G'};
            current_synchro_data.Flag_valid_pseudorange = false;
@ -491,13 +515,12 @@ int Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::general_work (int noutput_items, gr_vecto
            float prompt_I;
            float prompt_Q;
            float tmp_E, tmp_P, tmp_L;
-            float tmp_float;
            double tmp_double;
-            prompt_I = (*d_Prompt).real();
-            prompt_Q = (*d_Prompt).imag();
-            tmp_E = std::abs<float>(*d_Early);
-            tmp_P = std::abs<float>(*d_Prompt);
-            tmp_L = std::abs<float>(*d_Late);
+            prompt_I = d_correlator_outs[1].real();
+            prompt_Q = d_correlator_outs[1].imag();
+            tmp_E = std::abs<float>(d_correlator_outs[0]);
+            tmp_P = std::abs<float>(d_correlator_outs[1]);
+            tmp_L = std::abs<float>(d_correlator_outs[2]);
            try
            {
                    // EPR
@ -511,39 +534,39 @@ int Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::general_work (int noutput_items, gr_vecto
                    //tmp_float=(float)d_sample_counter;
                    d_dump_file.write(reinterpret_cast<char*>(&d_sample_counter), sizeof(unsigned long int));
                    // accumulated carrier phase
-                    d_dump_file.write(reinterpret_cast<char*>(&d_acc_carrier_phase_rad), sizeof(float));
+                    d_dump_file.write(reinterpret_cast<char*>(&d_acc_carrier_phase_cycles), sizeof(double));

                    // carrier and code frequency
-                    d_dump_file.write(reinterpret_cast<char*>(&d_carrier_doppler_hz), sizeof(float));
-                    tmp_float=d_code_freq_chips;
-                    d_dump_file.write(reinterpret_cast<char*>(&tmp_float), sizeof(float));
+                    d_dump_file.write(reinterpret_cast<char*>(&d_carrier_doppler_hz), sizeof(double));
+                    d_dump_file.write(reinterpret_cast<char*>(&d_code_freq_chips), sizeof(double));

                    //PLL commands
-                    d_dump_file.write(reinterpret_cast<char*>(&carr_error_hz), sizeof(float));
-                    d_dump_file.write(reinterpret_cast<char*>(&carr_error_filt_hz), sizeof(float));
+                    d_dump_file.write(reinterpret_cast<char*>(&carr_phase_error_secs_Ti), sizeof(double));
+                    d_dump_file.write(reinterpret_cast<char*>(&d_carrier_doppler_hz), sizeof(double));

                    //DLL commands
-                    d_dump_file.write(reinterpret_cast<char*>(&code_error_chips), sizeof(float));
-                    d_dump_file.write(reinterpret_cast<char*>(&code_error_filt_chips), sizeof(float));
+                    d_dump_file.write(reinterpret_cast<char*>(&code_error_chips_Ti), sizeof(double));
+                    d_dump_file.write(reinterpret_cast<char*>(&code_error_filt_chips), sizeof(double));

                    // CN0 and carrier lock test
-                    d_dump_file.write(reinterpret_cast<char*>(&d_CN0_SNV_dB_Hz), sizeof(float));
-                    d_dump_file.write(reinterpret_cast<char*>(&d_carrier_lock_test), sizeof(float));
+                    d_dump_file.write(reinterpret_cast<char*>(&d_CN0_SNV_dB_Hz), sizeof(double));
+                    d_dump_file.write(reinterpret_cast<char*>(&d_carrier_lock_test), sizeof(double));

                    // AUX vars (for debug purposes)
-                    tmp_float = d_rem_code_phase_samples;
-                    d_dump_file.write(reinterpret_cast<char*>(&tmp_float), sizeof(float));
-                    tmp_double = static_cast<double>(d_sample_counter + d_current_prn_length_samples);
+                    tmp_double = d_rem_code_phase_samples;
+                    d_dump_file.write(reinterpret_cast<char*>(&tmp_double), sizeof(double));
+                    tmp_double = static_cast<double>(d_sample_counter + d_correlation_length_samples);
                    d_dump_file.write(reinterpret_cast<char*>(&tmp_double), sizeof(double));
            }
-            catch (std::ifstream::failure e)
+            catch (const std::ifstream::failure* e)
            {
-                    LOG(WARNING) << "Exception writing trk dump file " << e.what();
+                    LOG(WARNING) << "Exception writing trk dump file " << e->what();
            }
        }

-    consume_each(d_current_prn_length_samples); // this is necessary in gr::block derivates
-    d_sample_counter += d_current_prn_length_samples; //count for the processed samples
+    consume_each(d_correlation_length_samples); // this is necessary in gr::block derivates
+    d_sample_counter += d_correlation_length_samples; //count for the processed samples
+
    if((noutput_items == 0) || (ninput_items[0] == 0))
        {
            LOG(WARNING) << "noutput_items = 0";
@ -551,8 +574,6 @@ int Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::general_work (int noutput_items, gr_vecto
    return 1; //output tracking result ALWAYS even in the case of d_enable_tracking==false
 }

-
-
 void Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::set_channel(unsigned int channel)
 {
    d_channel = channel;
@ -570,22 +591,19 @@ void Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::set_channel(unsigned int channel)
                            d_dump_file.open(d_dump_filename.c_str(), std::ios::out | std::ios::binary);
                            LOG(INFO) << "Tracking dump enabled on channel " << d_channel << " Log file: " << d_dump_filename.c_str() << std::endl;
                    }
-                    catch (std::ifstream::failure e)
+                    catch (const std::ifstream::failure* e)
                    {
-                            LOG(WARNING) << "channel " << d_channel << " Exception opening trk dump file " << e.what() << std::endl;
+                            LOG(WARNING) << "channel " << d_channel << " Exception opening trk dump file " << e->what() << std::endl;
                    }
                }
        }
 }

-
-
 void Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::set_channel_queue(concurrent_queue<int> *channel_internal_queue)
 {
    d_channel_internal_queue = channel_internal_queue;
 }

-
 void Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::set_gnss_synchro(Gnss_Synchro* p_gnss_synchro)
 {
    d_acquisition_gnss_synchro = p_gnss_synchro;
--- a/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_gpu_cc.h
+++ b/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_gpu_cc.h
@ -48,7 +48,7 @@
 #include "gps_sdr_signal_processing.h"
 #include "gnss_synchro.h"
 #include "tracking_2nd_DLL_filter.h"
-#include "tracking_2nd_PLL_filter.h"
+#include "tracking_FLL_PLL_filter.h"
 #include "cuda_multicorrelator.h"

 class Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc;
@ -124,12 +124,13 @@ private:
    long d_fs_in;

    double d_early_late_spc_chips;
+    int d_n_correlator_taps;


    //GPU HOST PINNED MEMORY IN/OUT VECTORS
    gr_complex* in_gpu;
    float* d_local_code_shift_chips;
-    gr_complex* d_corr_outs_gpu;
+    gr_complex* d_correlator_outs;
    cuda_multicorrelator *multicorrelator_gpu;
    gr_complex* d_ca_code;

@ -140,25 +141,28 @@ private:

    // remaining code phase and carrier phase between tracking loops
    double d_rem_code_phase_samples;
-    float d_rem_carr_phase_rad;
+    double d_rem_code_phase_chips;
+    double d_rem_carrier_phase_rad;

    // PLL and DLL filter library
    Tracking_2nd_DLL_filter d_code_loop_filter;
-    Tracking_2nd_PLL_filter d_carrier_loop_filter;
+    Tracking_FLL_PLL_filter d_carrier_loop_filter;

    // acquisition
-    float d_acq_code_phase_samples;
-    float d_acq_carrier_doppler_hz;
+    double d_acq_code_phase_samples;
+    double d_acq_carrier_doppler_hz;

    // tracking vars
    double d_code_freq_chips;
-    float d_carrier_doppler_hz;
-    float d_acc_carrier_phase_rad;
-    float d_code_phase_samples;
-    float d_acc_code_phase_secs;
+    double d_code_phase_step_chips;
+    double d_carrier_doppler_hz;
+    double d_carrier_phase_step_rad;
+    double d_acc_carrier_phase_cycles;
+    double d_code_phase_samples;
+    double d_pll_to_dll_assist_secs_Ti;

-    //PRN period in samples
-    int d_current_prn_length_samples;
+    //Integration period in samples
+    int d_correlation_length_samples;

    //processing samples counters
    unsigned long int d_sample_counter;
@ -167,9 +171,9 @@ private:
    // CN0 estimation and lock detector
    int d_cn0_estimation_counter;
    gr_complex* d_Prompt_buffer;
-    float d_carrier_lock_test;
-    float d_CN0_SNV_dB_Hz;
-    float d_carrier_lock_threshold;
+    double d_carrier_lock_test;
+    double d_CN0_SNV_dB_Hz;
+    double d_carrier_lock_threshold;
    int d_carrier_lock_fail_counter;

    // control vars
--- a/src/algorithms/tracking/libs/cpu_multicorrelator.cc
+++ b/src/algorithms/tracking/libs/cpu_multicorrelator.cc
@ -125,11 +125,12 @@ bool cpu_multicorrelator::Carrier_wipeoff_multicorrelator_resampler(
        float code_phase_step_chips,
        int signal_length_samples)
 {
+    //update_local_carrier(signal_length_samples, rem_carrier_phase_in_rad, phase_step_rad); //replaced by VOLK phase rotator
+    //volk_32fc_x2_multiply_32fc(d_sig_doppler_wiped, d_sig_in, d_nco_in, signal_length_samples); //replaced by VOLK phase rotator

-    update_local_carrier(signal_length_samples, rem_carrier_phase_in_rad, phase_step_rad);
+    lv_32fc_t phase_offset_as_complex[1]= lv_cmake(std::cos(rem_carrier_phase_in_rad),-std::sin(rem_carrier_phase_in_rad));
+    volk_32fc_s32fc_x2_rotator_32fc(d_sig_doppler_wiped, d_sig_in, std::exp(lv_32fc_t(0, -phase_step_rad)),phase_offset_as_complex, signal_length_samples);
    update_local_code(signal_length_samples,rem_code_phase_chips, code_phase_step_chips);
-
-    volk_32fc_x2_multiply_32fc(d_sig_doppler_wiped, d_sig_in, d_nco_in, signal_length_samples);
    for (int current_correlator_tap = 0; current_correlator_tap < d_n_correlators; current_correlator_tap++)
        {
            volk_32fc_x2_dot_prod_32fc(&d_corr_out[current_correlator_tap], d_sig_doppler_wiped, d_local_codes_resampled[current_correlator_tap], signal_length_samples);
--- a/src/algorithms/tracking/libs/cuda_multicorrelator.cu
+++ b/src/algorithms/tracking/libs/cuda_multicorrelator.cu
@ -41,110 +41,13 @@

 #define ACCUM_N 128

-__global__ void scalarProdGPUCPXxN_shifts_chips(
-    GPU_Complex *d_corr_out,
-    GPU_Complex *d_sig_in,
-    GPU_Complex *d_local_code_in,
-    float *d_shifts_chips,
-    float code_length_chips,
-    float code_phase_step_chips,
-    float rem_code_phase_chips,
-    int vectorN,
-    int elementN
-)
-{
-    //Accumulators cache
-    __shared__ GPU_Complex accumResult[ACCUM_N];
-
-    ////////////////////////////////////////////////////////////////////////////
-    // Cycle through every pair of vectors,
-    // taking into account that vector counts can be different
-    // from total number of thread blocks
-    ////////////////////////////////////////////////////////////////////////////
-    for (int vec = blockIdx.x; vec < vectorN; vec += gridDim.x)
-    {
-        //int vectorBase = IMUL(elementN, vec);
-        //int vectorEnd  = elementN;
-
-        ////////////////////////////////////////////////////////////////////////
-        // Each accumulator cycles through vectors with
-        // stride equal to number of total number of accumulators ACCUM_N
-        // At this stage ACCUM_N is only preferred be a multiple of warp size
-        // to meet memory coalescing alignment constraints.
-        ////////////////////////////////////////////////////////////////////////
-        for (int iAccum = threadIdx.x; iAccum < ACCUM_N; iAccum += blockDim.x)
-        {
-        	GPU_Complex sum = GPU_Complex(0,0);
-
-            for (int pos = iAccum; pos < elementN; pos += ACCUM_N)
-            {
-            	//original sample code
-                //sum = sum + d_sig_in[pos-vectorBase] * d_nco_in[pos-vectorBase] * d_local_codes_in[pos];
-            	//sum = sum + d_sig_in[pos-vectorBase] * d_local_codes_in[pos];
-            	//sum.multiply_acc(d_sig_in[pos],d_local_codes_in[pos+d_shifts_samples[vec]]);
-
-            	//custom code for multitap correlator
-            	// 1.resample local code for the current shift
-            	float local_code_chip_index= fmod(code_phase_step_chips*(float)pos + d_shifts_chips[vec] - rem_code_phase_chips, code_length_chips);
-            	//Take into account that in multitap correlators, the shifts can be negative!
-            	if (local_code_chip_index<0.0) local_code_chip_index+=code_length_chips;
-            	//printf("vec= %i, pos %i, chip_idx=%i chip_shift=%f \r\n",vec, pos,__float2int_rd(local_code_chip_index),local_code_chip_index);
-            	// 2.correlate
-            	sum.multiply_acc(d_sig_in[pos],d_local_code_in[__float2int_rd(local_code_chip_index)]);
-
-            }
-            accumResult[iAccum] = sum;
-        }
-
-        ////////////////////////////////////////////////////////////////////////
-        // Perform tree-like reduction of accumulators' results.
-        // ACCUM_N has to be power of two at this stage
-        ////////////////////////////////////////////////////////////////////////
-        for (int stride = ACCUM_N / 2; stride > 0; stride >>= 1)
-        {
-            __syncthreads();
-
-            for (int iAccum = threadIdx.x; iAccum < stride; iAccum += blockDim.x)
-            {
-                accumResult[iAccum] += accumResult[stride + iAccum];
-            }
-        }
-
-        if (threadIdx.x == 0)
-        	{
-        		d_corr_out[vec] = accumResult[0];
-        	}
-    }
-}
-
-/**
- * CUDA Kernel Device code
- *
- * Computes the carrier Doppler wipe-off by integrating the NCO in the CUDA kernel
- */
-__global__ void
-CUDA_32fc_Doppler_wipeoff(  GPU_Complex *sig_out, GPU_Complex *sig_in, float rem_carrier_phase_in_rad, float phase_step_rad, int numElements)
-{
-	// CUDA version of floating point NCO and vector dot product integrated
-    float sin;
-    float cos;
-    for (int i = blockIdx.x * blockDim.x + threadIdx.x;
-         i < numElements;
-         i += blockDim.x * gridDim.x)
-    {
-    	__sincosf(rem_carrier_phase_in_rad + i*phase_step_rad, &sin, &cos);
-    	sig_out[i] =  sig_in[i] * GPU_Complex(cos,-sin);
-    }
-}
-
-
 __global__ void Doppler_wippe_scalarProdGPUCPXxN_shifts_chips(
    GPU_Complex *d_corr_out,
    GPU_Complex *d_sig_in,
    GPU_Complex *d_sig_wiped,
    GPU_Complex *d_local_code_in,
    float *d_shifts_chips,
-    float code_length_chips,
+    int code_length_chips,
    float code_phase_step_chips,
    float rem_code_phase_chips,
    int vectorN,
@ -187,7 +90,7 @@ __global__ void Doppler_wippe_scalarProdGPUCPXxN_shifts_chips(
        for (int iAccum = threadIdx.x; iAccum < ACCUM_N; iAccum += blockDim.x)
        {
        	GPU_Complex sum = GPU_Complex(0,0);
-            float local_code_chip_index;
+            float local_code_chip_index=0.0;;
            //float code_phase;
            for (int pos = iAccum; pos < elementN; pos += ACCUM_N)
            {
@ -202,7 +105,7 @@ __global__ void Doppler_wippe_scalarProdGPUCPXxN_shifts_chips(
            	local_code_chip_index= fmodf(code_phase_step_chips*__int2float_rd(pos)+ d_shifts_chips[vec] - rem_code_phase_chips, code_length_chips);

            	//Take into account that in multitap correlators, the shifts can be negative!
-            	if (local_code_chip_index<0.0) local_code_chip_index+=code_length_chips;
+            	if (local_code_chip_index<0.0) local_code_chip_index+=(code_length_chips-1);
            	//printf("vec= %i, pos %i, chip_idx=%i chip_shift=%f \r\n",vec, pos,__float2int_rd(local_code_chip_index),local_code_chip_index);
            	// 2.correlate
            	sum.multiply_acc(d_sig_wiped[pos],d_local_code_in[__float2int_rd(local_code_chip_index)]);
@ -240,52 +143,52 @@ bool cuda_multicorrelator::init_cuda_integrated_resampler(
 {
 	// use command-line specified CUDA device, otherwise use device with highest Gflops/s
 //	findCudaDevice(argc, (const char **)argv);
-//      cudaDeviceProp  prop;
-//    int num_devices, device;
-//    cudaGetDeviceCount(&num_devices);
-//    if (num_devices > 1) {
-//          int max_multiprocessors = 0, max_device = 0;
-//          for (device = 0; device < num_devices; device++) {
-//                  cudaDeviceProp properties;
-//                  cudaGetDeviceProperties(&properties, device);
-//                  if (max_multiprocessors < properties.multiProcessorCount) {
-//                          max_multiprocessors = properties.multiProcessorCount;
-//                          max_device = device;
-//                  }
-//                  printf("Found GPU device # %i\n",device);
-//          }
-//          //cudaSetDevice(max_device);
-//
-//          //set random device!
-//          cudaSetDevice(rand() % num_devices); //generates a random number between 0 and num_devices to split the threads between GPUs
-//
-//          cudaGetDeviceProperties( &prop, max_device );
-//          //debug code
-//          if (prop.canMapHostMemory != 1) {
-//              printf( "Device can not map memory.\n" );
-//          }
-//          printf("L2 Cache size= %u \n",prop.l2CacheSize);
-//          printf("maxThreadsPerBlock= %u \n",prop.maxThreadsPerBlock);
-//          printf("maxGridSize= %i \n",prop.maxGridSize[0]);
-//          printf("sharedMemPerBlock= %lu \n",prop.sharedMemPerBlock);
-//          printf("deviceOverlap= %i \n",prop.deviceOverlap);
-//  	    printf("multiProcessorCount= %i \n",prop.multiProcessorCount);
-//    }else{
-//    	    int whichDevice;
-//    	    cudaGetDevice( &whichDevice );
-//    	    cudaGetDeviceProperties( &prop, whichDevice );
-//    	    //debug code
-//    	    if (prop.canMapHostMemory != 1) {
-//    	        printf( "Device can not map memory.\n" );
-//    	    }
-//
-//    	    printf("L2 Cache size= %u \n",prop.l2CacheSize);
-//    	    printf("maxThreadsPerBlock= %u \n",prop.maxThreadsPerBlock);
-//    	    printf("maxGridSize= %i \n",prop.maxGridSize[0]);
-//    	    printf("sharedMemPerBlock= %lu \n",prop.sharedMemPerBlock);
-//    	    printf("deviceOverlap= %i \n",prop.deviceOverlap);
-//    	    printf("multiProcessorCount= %i \n",prop.multiProcessorCount);
-//    }
+      cudaDeviceProp  prop;
+    int num_devices, device;
+    cudaGetDeviceCount(&num_devices);
+    if (num_devices > 1) {
+          int max_multiprocessors = 0, max_device = 0;
+          for (device = 0; device < num_devices; device++) {
+                  cudaDeviceProp properties;
+                  cudaGetDeviceProperties(&properties, device);
+                  if (max_multiprocessors < properties.multiProcessorCount) {
+                          max_multiprocessors = properties.multiProcessorCount;
+                          max_device = device;
+                  }
+                  printf("Found GPU device # %i\n",device);
+          }
+          //cudaSetDevice(max_device);
+
+          //set random device!
+          cudaSetDevice(rand() % num_devices); //generates a random number between 0 and num_devices to split the threads between GPUs
+
+          cudaGetDeviceProperties( &prop, max_device );
+          //debug code
+          if (prop.canMapHostMemory != 1) {
+              printf( "Device can not map memory.\n" );
+          }
+          printf("L2 Cache size= %u \n",prop.l2CacheSize);
+          printf("maxThreadsPerBlock= %u \n",prop.maxThreadsPerBlock);
+          printf("maxGridSize= %i \n",prop.maxGridSize[0]);
+          printf("sharedMemPerBlock= %lu \n",prop.sharedMemPerBlock);
+          printf("deviceOverlap= %i \n",prop.deviceOverlap);
+  	    printf("multiProcessorCount= %i \n",prop.multiProcessorCount);
+    }else{
+    	    int whichDevice;
+    	    cudaGetDevice( &whichDevice );
+    	    cudaGetDeviceProperties( &prop, whichDevice );
+    	    //debug code
+    	    if (prop.canMapHostMemory != 1) {
+    	        printf( "Device can not map memory.\n" );
+    	    }
+
+    	    printf("L2 Cache size= %u \n",prop.l2CacheSize);
+    	    printf("maxThreadsPerBlock= %u \n",prop.maxThreadsPerBlock);
+    	    printf("maxGridSize= %i \n",prop.maxGridSize[0]);
+    	    printf("sharedMemPerBlock= %lu \n",prop.sharedMemPerBlock);
+    	    printf("deviceOverlap= %i \n",prop.deviceOverlap);
+    	    printf("multiProcessorCount= %i \n",prop.multiProcessorCount);
+    }

 	// (cudaFuncSetCacheConfig(CUDA_32fc_x2_multiply_x2_dot_prod_32fc_, cudaFuncCachePreferShared));

@ -325,7 +228,7 @@ bool cuda_multicorrelator::init_cuda_integrated_resampler(
    // Launch the Vector Add CUDA Kernel
    // TODO: write a smart load balance using device info!
 	threadsPerBlock = 64;
-    blocksPerGrid =(int)(signal_length_samples+threadsPerBlock-1)/threadsPerBlock;
+    blocksPerGrid = 128;//(int)(signal_length_samples+threadsPerBlock-1)/threadsPerBlock;

 	cudaStreamCreate (&stream1) ;
 	//cudaStreamCreate (&stream2) ;
@ -358,7 +261,7 @@ bool cuda_multicorrelator::set_local_code_and_taps(
 	//******** CudaMalloc version ***********
    //local code CPU -> GPU copy memory
    cudaMemcpyAsync(d_local_codes_in, local_codes_in, sizeof(GPU_Complex)*code_length_chips, cudaMemcpyHostToDevice,stream1);
-    d_code_length_chips=(float)code_length_chips;
+    d_code_length_chips=code_length_chips;

    //Correlator shifts vector CPU -> GPU copy memory (fractional chip shifts are allowed!)
    cudaMemcpyAsync(d_shifts_chips, shifts_chips, sizeof(float)*n_correlators,
@ -389,6 +292,17 @@ bool cuda_multicorrelator::set_input_output_vectors(
 	return true;

 }
+
+#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
+inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
+{
+   if (code != cudaSuccess)
+   {
+      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
+      if (abort) exit(code);
+   }
+}
+
 bool cuda_multicorrelator::Carrier_wipeoff_multicorrelator_resampler_cuda(
 		float rem_carrier_phase_in_rad,
 		float phase_step_rad,
@ -398,37 +312,15 @@ bool cuda_multicorrelator::Carrier_wipeoff_multicorrelator_resampler_cuda(
 		int n_correlators)
 	{

-
 	// cudaMemCpy version
 	//size_t memSize = signal_length_samples * sizeof(std::complex<float>);
 	// input signal CPU -> GPU copy memory
    //cudaMemcpyAsync(d_sig_in, d_sig_in_cpu, memSize,
    //                               cudaMemcpyHostToDevice, stream2);
-
    //***** NOTICE: NCO is computed on-the-fly, not need to copy NCO into GPU! ****
-    //Launch carrier wipe-off kernel here, while local codes are being copied to GPU!
-    //cudaStreamSynchronize(stream2);
-
-    //CUDA_32fc_Doppler_wipeoff<<<blocksPerGrid, threadsPerBlock,0, stream1>>>(d_sig_doppler_wiped, d_sig_in,rem_carrier_phase_in_rad,phase_step_rad, signal_length_samples);
-
-    //wait for Doppler wipeoff end...
-    //cudaStreamSynchronize(stream1);
-    //cudaStreamSynchronize(stream2);

    //launch the multitap correlator with integrated local code resampler!

-//    scalarProdGPUCPXxN_shifts_chips<<<blocksPerGrid, threadsPerBlock,0 ,stream1>>>(
-//			d_corr_out,
-//			d_sig_doppler_wiped,
-//			d_local_codes_in,
-//			d_shifts_chips,
-//			d_code_length_chips,
-//	        code_phase_step_chips,
-//	        rem_code_phase_chips,
-//			n_correlators,
-//			signal_length_samples
-//		);
-
    Doppler_wippe_scalarProdGPUCPXxN_shifts_chips<<<blocksPerGrid, threadsPerBlock,0 ,stream1>>>(
 			d_corr_out,
 			d_sig_in,
@ -444,25 +336,15 @@ bool cuda_multicorrelator::Carrier_wipeoff_multicorrelator_resampler_cuda(
 			phase_step_rad
 			);

-    //debug
-//	std::complex<float>* debug_signal;
-//	debug_signal=static_cast<std::complex<float>*>(malloc(memSize));
-//    cudaMemcpyAsync(debug_signal, d_sig_doppler_wiped, memSize,
-//            cudaMemcpyDeviceToHost,stream1);
-//    cudaStreamSynchronize(stream1);
-//	std::cout<<"d_sig_doppler_wiped GPU="<<debug_signal[456]<<","<<debug_signal[1]<<","<<debug_signal[2]<<","<<debug_signal[3]<<std::endl;
+    gpuErrchk( cudaPeekAtLastError() );
+    gpuErrchk( cudaStreamSynchronize(stream1));

-    //cudaGetLastError();
-    //wait for correlators end...
-    //cudaStreamSynchronize(stream1);
+	// cudaMemCpy version
    // Copy the device result vector in device memory to the host result vector
    // in host memory.
-
    //scalar products (correlators outputs)
-    //cudaMemcpyAsync(corr_out, d_corr_out, sizeof(std::complex<float>)*n_correlators,
+    //cudaMemcpyAsync(d_corr_out_cpu, d_corr_out, sizeof(std::complex<float>)*n_correlators,
    //        cudaMemcpyDeviceToHost,stream1);
-
-    cudaStreamSynchronize(stream1);
    return true;
 }

@ -490,7 +372,6 @@ bool cuda_multicorrelator::free_cuda()
 	if (d_corr_out!=NULL) cudaFree(d_corr_out);
 	if (d_shifts_samples!=NULL) cudaFree(d_shifts_samples);
 	if (d_shifts_chips!=NULL) cudaFree(d_shifts_chips);
-
    // Reset the device and exit
    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice.  It is also
--- a/src/algorithms/tracking/libs/cuda_multicorrelator.h
+++ b/src/algorithms/tracking/libs/cuda_multicorrelator.h
@ -155,7 +155,7 @@ private:

    int *d_shifts_samples;
    float *d_shifts_chips;
-    float d_code_length_chips;
+    int d_code_length_chips;

    int threadsPerBlock;
    int blocksPerGrid;