mirror of https://github.com/gnss-sdr/gnss-sdr synced 2025-10-31 07:13:03 +00:00

Updated the CUDA kernels and added several GPU tracking optimizations.

Bug fix in GPS_L1_CA_DLL_PLL binary dump
Javier Arribas
2015-08-06 17:05:15 +02:00
parent 26cf90cdd4
commit 1aa84cd1c4
18 changed files with 450 additions and 155 deletions

View File: CMakeLists.txt (tracking adapters)

@@ -18,6 +18,7 @@
if(ENABLE_CUDA)
FIND_PACKAGE(CUDA REQUIRED)
set(OPT_TRACKING_ADAPTERS ${OPT_TRACKING_ADAPTERS} gps_l1_ca_dll_pll_tracking_gpu.cc)
endif(ENABLE_CUDA)
set(TRACKING_ADAPTER_SOURCES
@@ -30,7 +31,7 @@ set(TRACKING_ADAPTER_SOURCES
gps_l1_ca_tcp_connector_tracking.cc
galileo_e5a_dll_pll_tracking.cc
gps_l2_m_dll_pll_tracking.cc
gps_l1_ca_dll_pll_tracking_gpu.cc
${OPT_TRACKING_ADAPTERS}
)
include_directories(

View File: CMakeLists.txt (tracking gnuradio blocks)

@@ -19,6 +19,7 @@
if(ENABLE_CUDA)
FIND_PACKAGE(CUDA REQUIRED)
set(OPT_TRACKING_BLOCKS ${OPT_TRACKING_BLOCKS} gps_l1_ca_dll_pll_tracking_gpu_cc.cc)
endif(ENABLE_CUDA)
set(TRACKING_GR_BLOCKS_SOURCES
@@ -31,7 +32,7 @@ set(TRACKING_GR_BLOCKS_SOURCES
gps_l1_ca_tcp_connector_tracking_cc.cc
galileo_e5a_dll_pll_tracking_cc.cc
gps_l2_m_dll_pll_tracking_cc.cc
gps_l1_ca_dll_pll_tracking_gpu_cc.cc
${OPT_TRACKING_BLOCKS}
)
include_directories(

View File: gps_l1_ca_dll_pll_tracking_cc.cc

@@ -594,7 +594,8 @@ int Gps_L1_Ca_Dll_Pll_Tracking_cc::general_work (int noutput_items, gr_vector_in
// carrier and code frequency
d_dump_file.write(reinterpret_cast<char*>(&d_carrier_doppler_hz), sizeof(float));
d_dump_file.write(reinterpret_cast<char*>(&d_code_freq_chips), sizeof(float));
tmp_float=d_code_freq_chips;
d_dump_file.write(reinterpret_cast<char*>(&tmp_float), sizeof(float));
//PLL commands
d_dump_file.write(reinterpret_cast<char*>(&carr_error_hz), sizeof(float));
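
For context, a minimal standalone sketch of the dump fix above, assuming d_code_freq_chips is a double (which the new float temporary strongly suggests): writing a double through sizeof(float) bytes dumps only half of the value, so the recorded field was garbage. Narrowing to a float first writes a well-defined 4-byte field:

#include <fstream>
// Hypothetical illustration of the dump fix; not the exact class code.
void dump_code_freq(std::ofstream& d_dump_file, double d_code_freq_chips)
{
    // old (buggy): wrote the first 4 bytes of an 8-byte double
    // d_dump_file.write(reinterpret_cast<char*>(&d_code_freq_chips), sizeof(float));
    // new: narrow to float, then write 4 valid bytes
    float tmp_float = static_cast<float>(d_code_freq_chips);
    d_dump_file.write(reinterpret_cast<char*>(&tmp_float), sizeof(float));
}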

View File: gps_l1_ca_dll_pll_tracking_gpu_cc.cc

@@ -81,7 +81,6 @@ gps_l1_ca_dll_pll_make_tracking_gpu_cc(
}
void Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::forecast (int noutput_items,
gr_vector_int &ninput_items_required)
{
@@ -120,14 +119,19 @@ Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc(
// Initialization of local code replica
// Get space for a vector with the C/A code replica sampled 1x/chip
d_ca_code = static_cast<gr_complex*>(volk_malloc((GPS_L1_CA_CODE_LENGTH_CHIPS + 2) * sizeof(gr_complex), volk_get_alignment()));
//d_ca_code = static_cast<gr_complex*>(volk_malloc((GPS_L1_CA_CODE_LENGTH_CHIPS + 2) * sizeof(gr_complex), volk_get_alignment()));
d_ca_code = static_cast<gr_complex*>(volk_malloc((GPS_L1_CA_CODE_LENGTH_CHIPS) * sizeof(gr_complex), volk_get_alignment()));
multicorrelator_gpu = new cuda_multicorrelator();
int N_CORRELATORS=3;
multicorrelator_gpu->init_cuda(0, NULL, 2 * d_vector_length , 2 * d_vector_length , N_CORRELATORS);
//local code resampler on CPU (old)
//multicorrelator_gpu->init_cuda(0, NULL, 2 * d_vector_length , 2 * d_vector_length , N_CORRELATORS);
//local code resampler on GPU (new)
multicorrelator_gpu->init_cuda_integrated_resampler(0, NULL, 2 * d_vector_length , GPS_L1_CA_CODE_LENGTH_CHIPS , N_CORRELATORS);
// Get space for the resampled early / prompt / late local replicas
checkCudaErrors(cudaHostAlloc((void**)&d_local_code_shift_samples, N_CORRELATORS * sizeof(int), cudaHostAllocMapped ));
checkCudaErrors(cudaHostAlloc((void**)&d_local_code_shift_chips, N_CORRELATORS * sizeof(float), cudaHostAllocMapped ));
//allocate host memory
@@ -138,7 +142,7 @@ Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc(
//checkCudaErrors(cudaHostAlloc((void**)&d_local_codes_gpu, (V_LEN * sizeof(gr_complex))*N_CORRELATORS, cudaHostAllocWriteCombined ));
//new integrated shifts
checkCudaErrors(cudaHostAlloc((void**)&d_local_codes_gpu, (2 * d_vector_length * sizeof(gr_complex)), cudaHostAllocWriteCombined ));
//checkCudaErrors(cudaHostAlloc((void**)&d_local_codes_gpu, (2 * d_vector_length * sizeof(gr_complex)), cudaHostAllocWriteCombined ));
// correlator outputs (scalar)
checkCudaErrors(cudaHostAlloc((void**)&d_corr_outs_gpu ,sizeof(gr_complex)*N_CORRELATORS, cudaHostAllocWriteCombined ));
@@ -242,9 +246,13 @@ void Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::start_tracking()
d_code_loop_filter.initialize(); // initialize the code filter
// generate local reference ALWAYS starting at chip 1 (1 sample per chip)
gps_l1_ca_code_gen_complex(&d_ca_code[1], d_acquisition_gnss_synchro->PRN, 0);
d_ca_code[0] = d_ca_code[static_cast<int>(GPS_L1_CA_CODE_LENGTH_CHIPS)];
d_ca_code[static_cast<int>(GPS_L1_CA_CODE_LENGTH_CHIPS) + 1] = d_ca_code[1];
gps_l1_ca_code_gen_complex(d_ca_code, d_acquisition_gnss_synchro->PRN, 0);
d_local_code_shift_chips[0]=-d_early_late_spc_chips;
d_local_code_shift_chips[1]=0.0;
d_local_code_shift_chips[2]=d_early_late_spc_chips;
multicorrelator_gpu->set_local_code_and_taps(GPS_L1_CA_CODE_LENGTH_CHIPS,d_ca_code, d_local_code_shift_chips,3);
d_carrier_lock_fail_counter = 0;
d_rem_code_phase_samples = 0;
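
The correlator taps above are now expressed in fractional chips (d_local_code_shift_chips) instead of integer sample offsets, and the earlier hunk allocates that vector as mapped pinned host memory so the GPU can read it without an explicit copy. A minimal sketch of that allocation pattern (the error handling here is illustrative):

// Hypothetical sketch of the cudaHostAllocMapped pattern used for the tap shifts.
const int N_CORRELATORS = 3;
float* d_local_code_shift_chips = nullptr;
cudaError_t err = cudaHostAlloc(reinterpret_cast<void**>(&d_local_code_shift_chips),
                                N_CORRELATORS * sizeof(float),
                                cudaHostAllocMapped);  // pinned and device-visible
if (err != cudaSuccess) { /* abort channel initialization */ }
// ... later, released with the matching call (see the destructor hunk below):
cudaFreeHost(d_local_code_shift_chips);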
@@ -272,40 +280,6 @@ void Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::start_tracking()
}
void Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::update_local_code()
{
double tcode_chips;
double rem_code_phase_chips;
int associated_chip_index;
int code_length_chips = static_cast<int>(GPS_L1_CA_CODE_LENGTH_CHIPS);
double code_phase_step_chips;
int epl_loop_length_samples;
// unified loop for E, P, L code vectors
code_phase_step_chips = static_cast<double>(d_code_freq_chips) / static_cast<double>(d_fs_in);
rem_code_phase_chips = d_rem_code_phase_samples * (d_code_freq_chips / d_fs_in);
tcode_chips = -rem_code_phase_chips;
// Alternative EPL code generation (40% speed improvement!)
d_local_code_shift_samples[0]=0;
d_local_code_shift_samples[1]=round(d_early_late_spc_chips / code_phase_step_chips);
d_local_code_shift_samples[2]=round((2*d_early_late_spc_chips) / code_phase_step_chips);
epl_loop_length_samples = d_current_prn_length_samples + d_local_code_shift_samples[2]; //maximum length
for (int i = 0; i < epl_loop_length_samples; i++)
{
associated_chip_index = 1 + round(fmod(tcode_chips - d_early_late_spc_chips, code_length_chips));
d_local_codes_gpu[i] = d_ca_code[associated_chip_index];
tcode_chips = tcode_chips + code_phase_step_chips;
}
}
Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::~Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc()
{
d_dump_file.close();
@@ -313,7 +287,7 @@ Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::~Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc()
cudaFreeHost(in_gpu);
cudaFreeHost(d_carr_sign_gpu);
cudaFreeHost(d_corr_outs_gpu);
cudaFreeHost(d_local_codes_gpu);
cudaFreeHost(d_local_code_shift_chips);
multicorrelator_gpu->free_cuda();
delete(multicorrelator_gpu);
@@ -329,10 +303,10 @@ int Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::general_work (int noutput_items, gr_vecto
gr_vector_const_void_star &input_items, gr_vector_void_star &output_items)
{
// process vars
float carr_error_hz;
float carr_error_filt_hz;
float code_error_chips;
float code_error_filt_chips;
float carr_error_hz=0.0;
float carr_error_filt_hz=0.0;
float code_error_chips=0.0;
float code_error_filt_chips=0.0;
// Block input data and block output stream pointers
const gr_complex* in = (gr_complex*) input_items[0];
@@ -341,23 +315,17 @@ int Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::general_work (int noutput_items, gr_vecto
// GNSS_SYNCHRO OBJECT to interchange data between tracking->telemetry_decoder
Gnss_Synchro current_synchro_data = Gnss_Synchro();
if (d_enable_tracking == true)
{
// Receiver signal alignment
if (d_pull_in == true)
{
int samples_offset;
float acq_trk_shif_correction_samples;
int acq_to_trk_delay_samples;
acq_to_trk_delay_samples = d_sample_counter - d_acq_sample_stamp;
acq_trk_shif_correction_samples = d_current_prn_length_samples - fmod(static_cast<float>(acq_to_trk_delay_samples), static_cast<float>(d_current_prn_length_samples));
samples_offset = round(d_acq_code_phase_samples + acq_trk_shif_correction_samples);
// /todo: Check whether the sample counter sent to the next block as a time reference should be incremented AFTER or BEFORE it is sent
//d_sample_counter_seconds = d_sample_counter_seconds + (((double)samples_offset) / static_cast<double>(d_fs_in));
samples_offset = round(d_acq_code_phase_samples)+d_current_prn_length_samples - acq_to_trk_delay_samples%d_current_prn_length_samples;
d_sample_counter = d_sample_counter + samples_offset; //count for the processed samples
d_pull_in = false;
//std::cout<<" samples_offset="<<samples_offset<<"\r\n";
// Fill the acquisition data
current_synchro_data = *d_acquisition_gnss_synchro;
*out[0] = current_synchro_data;
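
The simplified pull-in above aligns the first tracking epoch to the next PRN boundary plus the acquired code phase. A worked numeric sketch with assumed values (4 Msps front end, so one C/A PRN spans 4000 samples):

// Hypothetical numbers to check the new samples_offset expression.
int d_current_prn_length_samples = 4000;   // assumed 4 Msps sampling rate
int acq_to_trk_delay_samples = 10500;      // samples elapsed since the acquisition stamp
double d_acq_code_phase_samples = 1234.4;  // code phase reported by acquisition
int samples_offset = static_cast<int>(round(d_acq_code_phase_samples))
                   + d_current_prn_length_samples
                   - acq_to_trk_delay_samples % d_current_prn_length_samples;
// = 1234 + 4000 - 2500 = 2734 samples skipped before the first correlation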
@@ -368,46 +336,24 @@ int Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::general_work (int noutput_items, gr_vecto
// Fill the acquisition data
current_synchro_data = *d_acquisition_gnss_synchro;
// Generate local code and carrier replicas (using \hat{f}_d(k-1))
update_local_code();
// UPDATE NCO COMMAND
float phase_step_rad = static_cast<float>(GPS_TWO_PI) * d_carrier_doppler_hz / static_cast<float>(d_fs_in);
//std::cout<<"d_current_prn_length_samples="<<d_current_prn_length_samples<<std::endl;
// perform carrier wipe-off and compute Early, Prompt and Late correlation
cudaProfilerStart();
multicorrelator_gpu->Carrier_wipeoff_multicorrelator_cuda(
//code resampler on GPU (new)
float code_phase_step_chips = static_cast<float>(d_code_freq_chips) / static_cast<float>(d_fs_in);
float rem_code_phase_chips = d_rem_code_phase_samples * (d_code_freq_chips / d_fs_in);
cudaProfilerStart();
multicorrelator_gpu->Carrier_wipeoff_multicorrelator_resampler_cuda(
d_corr_outs_gpu,
in,
d_local_codes_gpu,
d_rem_carr_phase_rad,
phase_step_rad,
d_local_code_shift_samples,
code_phase_step_chips,
rem_code_phase_chips,
d_current_prn_length_samples,
3);
cudaProfilerStop();
//std::cout<<"d_Prompt="<<*d_Prompt<<"d_Early="<<*d_Early<<"d_Late="<<*d_Late<<std::endl;
// check samples for consistency (this should be done earlier in the receiver; here only if the source is a file)
if (std::isnan((*d_Prompt).real()) == true or std::isnan((*d_Prompt).imag()) == true ) // or std::isinf(in[i].real())==true or std::isinf(in[i].imag())==true)
{
const int samples_available = ninput_items[0];
d_sample_counter = d_sample_counter + samples_available;
LOG(WARNING) << "Detected NaN samples at sample number " << d_sample_counter;
consume_each(samples_available);
// make an output to not stop the rest of the processing blocks
current_synchro_data.Prompt_I = 0.0;
current_synchro_data.Prompt_Q = 0.0;
current_synchro_data.Tracking_timestamp_secs = static_cast<double>(d_sample_counter) / static_cast<double>(d_fs_in);
current_synchro_data.Carrier_phase_rads = 0.0;
current_synchro_data.Code_phase_secs = 0.0;
current_synchro_data.CN0_dB_hz = 0.0;
current_synchro_data.Flag_valid_tracking = false;
current_synchro_data.Flag_valid_pseudorange = false;
*out[0] = current_synchro_data;
return 1;
}
// ################## PLL ##########################################################
// PLL discriminator
@@ -444,8 +390,7 @@ int Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::general_work (int noutput_items, gr_vecto
T_chip_seconds = 1 / static_cast<double>(d_code_freq_chips);
T_prn_seconds = T_chip_seconds * GPS_L1_CA_CODE_LENGTH_CHIPS;
T_prn_samples = T_prn_seconds * static_cast<double>(d_fs_in);
K_blk_samples = T_prn_samples + d_rem_code_phase_samples + code_error_filt_secs * static_cast<double>(d_fs_in);
d_current_prn_length_samples = round(K_blk_samples); //round to a discrete number of samples
K_blk_samples = T_prn_samples + d_rem_code_phase_samples + static_cast<double>(code_error_filt_secs) * static_cast<double>(d_fs_in);
//d_rem_code_phase_samples = K_blk_samples - d_current_prn_length_samples; //rounding error < 1 sample
// ####### CN0 ESTIMATION AND LOCK DETECTORS ######
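
For reference, the block-length update this hunk feeds can be written out from the quantities above; this is a sketch of the relation only, since the diff leaves the fate of the rounding line ambiguous (hedged, not the exact remaining code):

// Hypothetical sketch: next integration length from the filtered DLL output.
double T_chip_seconds = 1.0 / d_code_freq_chips;                       // ~1/1.023e6 s
double T_prn_seconds  = T_chip_seconds * GPS_L1_CA_CODE_LENGTH_CHIPS;  // ~1 ms
double T_prn_samples  = T_prn_seconds * d_fs_in;                       // samples per PRN
double K_blk_samples  = T_prn_samples + d_rem_code_phase_samples
                      + static_cast<double>(code_error_filt_secs) * d_fs_in;
int d_current_prn_length_samples = static_cast<int>(round(K_blk_samples));
// the commented d_rem_code_phase_samples line would keep the <1 sample remainder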
@@ -591,7 +536,8 @@ int Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::general_work (int noutput_items, gr_vecto
// carrier and code frequency
d_dump_file.write(reinterpret_cast<char*>(&d_carrier_doppler_hz), sizeof(float));
d_dump_file.write(reinterpret_cast<char*>(&d_code_freq_chips), sizeof(float));
tmp_float=d_code_freq_chips;
d_dump_file.write(reinterpret_cast<char*>(&tmp_float), sizeof(float));
//PLL commands
d_dump_file.write(reinterpret_cast<char*>(&carr_error_hz), sizeof(float));

View File: gps_l1_ca_dll_pll_tracking_gpu_cc.h

@@ -130,7 +130,7 @@ private:
gr_complex* in_gpu;
gr_complex* d_carr_sign_gpu;
gr_complex* d_local_codes_gpu;
int* d_local_code_shift_samples;
float* d_local_code_shift_chips;
gr_complex* d_corr_outs_gpu;
cuda_multicorrelator *multicorrelator_gpu;

View File: CMakeLists.txt (tracking libs)

@@ -33,7 +33,6 @@ if(ENABLE_CUDA)
SET(LIB_TYPE STATIC) #set the lib type
CUDA_ADD_LIBRARY(CUDA_CORRELATOR_LIB ${LIB_TYPE} cuda_multicorrelator.h cuda_multicorrelator.cu)
endif(ENABLE_CUDA)

View File: cuda_multicorrelator.cu

@@ -53,7 +53,83 @@
#include <helper_cuda.h>
#include <helper_functions.h>
#define ACCUM_N 1024
#define ACCUM_N 256
__global__ void scalarProdGPUCPXxN_shifts_chips(
GPU_Complex *d_corr_out,
GPU_Complex *d_sig_in,
GPU_Complex *d_local_code_in,
float *d_shifts_chips,
float code_length_chips,
float code_phase_step_chips,
float rem_code_phase_chips,
int vectorN,
int elementN
)
{
//Accumulators cache
__shared__ GPU_Complex accumResult[ACCUM_N];
////////////////////////////////////////////////////////////////////////////
// Cycle through every pair of vectors,
// taking into account that vector counts can be different
// from the total number of thread blocks
////////////////////////////////////////////////////////////////////////////
for (int vec = blockIdx.x; vec < vectorN; vec += gridDim.x)
{
//int vectorBase = IMUL(elementN, vec);
//int vectorEnd = elementN;
////////////////////////////////////////////////////////////////////////
// Each accumulator cycles through vectors with
// stride equal to the total number of accumulators ACCUM_N.
// At this stage ACCUM_N is only preferred to be a multiple of the warp size
// to meet memory coalescing alignment constraints.
////////////////////////////////////////////////////////////////////////
for (int iAccum = threadIdx.x; iAccum < ACCUM_N; iAccum += blockDim.x)
{
GPU_Complex sum = GPU_Complex(0,0);
for (int pos = iAccum; pos < elementN; pos += ACCUM_N)
{
//sum = sum + d_sig_in[pos-vectorBase] * d_nco_in[pos-vectorBase] * d_local_codes_in[pos];
//sum = sum + d_sig_in[pos-vectorBase] * d_local_codes_in[pos];
//sum.multiply_acc(d_sig_in[pos],d_local_codes_in[pos+d_shifts_samples[vec]]);
// 1.resample local code for the current shift
float local_code_chip_index= fmod(code_phase_step_chips*(float)pos + d_shifts_chips[vec] - rem_code_phase_chips, code_length_chips);
//TODO: Take into account that in multitap correlators, the shifts can be negative!
if (local_code_chip_index<0.0) local_code_chip_index+=code_length_chips;
// 2.correlate
sum.multiply_acc(d_sig_in[pos],d_local_code_in[__float2int_rd(local_code_chip_index)]);
}
accumResult[iAccum] = sum;
}
////////////////////////////////////////////////////////////////////////
// Perform tree-like reduction of accumulators' results.
// ACCUM_N has to be power of two at this stage
////////////////////////////////////////////////////////////////////////
for (int stride = ACCUM_N / 2; stride > 0; stride >>= 1)
{
__syncthreads();
for (int iAccum = threadIdx.x; iAccum < stride; iAccum += blockDim.x)
{
accumResult[iAccum] += accumResult[stride + iAccum];
}
}
if (threadIdx.x == 0)
{
d_corr_out[vec] = accumResult[0];
}
}
}
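
As a cross-check for the integrated-resampler kernel above, the same math is easy to replicate on the CPU for unit testing; a sketch, assuming multiply_acc is a plain complex multiply-accumulate and the local code buffer is unpadded (hypothetical helper, not part of the commit):

#include <cmath>
#include <complex>
// Hypothetical host-side reference of scalarProdGPUCPXxN_shifts_chips.
void reference_multicorrelator(std::complex<float>* corr_out,
        const std::complex<float>* sig_in,      // Doppler-wiped input samples
        const std::complex<float>* local_code,  // code_length_chips entries
        const float* shifts_chips, float code_length_chips,
        float code_phase_step_chips, float rem_code_phase_chips,
        int n_correlators, int n_samples)
{
    for (int vec = 0; vec < n_correlators; vec++)
        {
            std::complex<float> sum(0.0f, 0.0f);
            for (int pos = 0; pos < n_samples; pos++)
                {
                    // same resample-on-the-fly index math as the kernel
                    float idx = std::fmod(code_phase_step_chips * pos
                            + shifts_chips[vec] - rem_code_phase_chips, code_length_chips);
                    if (idx < 0.0f) idx += code_length_chips;  // Early tap can go negative
                    sum += sig_in[pos] * local_code[static_cast<int>(idx)];  // floor, like __float2int_rd
                }
            corr_out[vec] = sum;
        }
}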
///////////////////////////////////////////////////////////////////////////////
// Calculate scalar products of VectorN vectors of ElementN elements on GPU
@@ -145,8 +221,9 @@ __global__ void scalarProdGPUCPXxN(
////////////////////////////////////////////////////////////////////////////
for (int vec = blockIdx.x; vec < vectorN; vec += gridDim.x)
{
int vectorBase = IMUL(elementN, vec);
int vectorEnd = vectorBase + elementN;
//int vectorBase = IMUL(elementN, vec);
//int vectorEnd = vectorBase + elementN;
////////////////////////////////////////////////////////////////////////
// Each accumulator cycles through vectors with
@@ -158,11 +235,13 @@ __global__ void scalarProdGPUCPXxN(
{
GPU_Complex sum = GPU_Complex(0,0);
for (int pos = vectorBase + iAccum; pos < vectorEnd; pos += ACCUM_N)
//for (int pos = vectorBase + iAccum; pos < vectorEnd; pos += ACCUM_N)
for (int pos = iAccum; pos < elementN; pos += ACCUM_N)
{
//sum = sum + d_sig_in[pos-vectorBase] * d_nco_in[pos-vectorBase] * d_local_codes_in[pos];
//sum = sum + d_sig_in[pos-vectorBase] * d_local_codes_in[pos];
sum.multiply_acc(d_sig_in[pos-vectorBase],d_local_codes_in[pos]);
//sum.multiply_acc(d_sig_in[pos-vectorBase],d_local_codes_in[pos]);
sum.multiply_acc(d_sig_in[pos],d_local_codes_in[pos]);
}
accumResult[iAccum] = sum;
}
@@ -200,9 +279,9 @@ __global__ void scalarProdGPUCPXxN(
*/
__global__ void CUDA_32fc_x2_multiply_32fc( GPU_Complex *A, GPU_Complex *B, GPU_Complex *C, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
for (int i = blockIdx.x * blockDim.x + threadIdx.x;
i < numElements;
i += blockDim.x * gridDim.x)
{
C[i] = A[i] * B[i];
}
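
The change above converts the element-wise kernels from a one-thread-per-element guard to a grid-stride loop, which stays correct even when the launch provides fewer threads than elements. The general pattern, as an illustrative sketch:

// Generic grid-stride loop sketch (not from the commit).
__global__ void saxpy_grid_stride(float a, const float* x, float* y, int n)
{
    for (int i = blockIdx.x * blockDim.x + threadIdx.x;
         i < n;
         i += blockDim.x * gridDim.x)  // each thread strides across the whole range
        {
            y[i] = a * x[i] + y[i];
        }
}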
@@ -232,10 +311,11 @@ CUDA_32fc_Doppler_wipeoff( GPU_Complex *sig_out, GPU_Complex *sig_in, float rem
// CUDA version of floating point NCO and vector dot product integrated
int i = blockDim.x * blockIdx.x + threadIdx.x;
float sin;
float cos;
if (i < numElements)
for (int i = blockIdx.x * blockDim.x + threadIdx.x;
i < numElements;
i += blockDim.x * gridDim.x)
{
__sincosf(rem_carrier_phase_in_rad + i*phase_step_rad, &sin, &cos);
sig_out[i] = sig_in[i] * GPU_Complex(cos,-sin);
@@ -252,11 +332,11 @@ CUDA_32fc_Doppler_wipeoff( GPU_Complex *sig_out, GPU_Complex *sig_in, float rem
__global__ void
CUDA_32fc_x2_add_32fc( GPU_Complex *A, GPU_Complex *B, GPU_Complex *C, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
for (int i = blockIdx.x * blockDim.x + threadIdx.x;
i < numElements;
i += blockDim.x * gridDim.x)
{
C[i] = A[i] * B[i];
C[i] = A[i] + B[i];
}
}
@@ -264,23 +344,53 @@ CUDA_32fc_x2_add_32fc( GPU_Complex *A, GPU_Complex *B, GPU_Complex *C, int
bool cuda_multicorrelator::init_cuda(const int argc, const char **argv, int signal_length_samples, int local_codes_length_samples, int n_correlators)
{
// use command-line specified CUDA device, otherwise use device with highest Gflops/s
findCudaDevice(argc, (const char **)argv);
cudaDeviceProp prop;
int whichDevice;
cudaGetDevice( &whichDevice );
cudaGetDeviceProperties( &prop, whichDevice );
//debug code
if (prop.canMapHostMemory != 1) {
printf( "Device can not map memory.\n" );
}
printf("L2 Cache size= %u \n",prop.l2CacheSize);
printf("maxThreadsPerBlock= %u \n",prop.maxThreadsPerBlock);
printf("maxGridSize= %i \n",prop.maxGridSize[0]);
printf("sharedMemPerBlock= %lu \n",prop.sharedMemPerBlock);
printf("deviceOverlap= %i \n",prop.deviceOverlap);
//end debug code
// findCudaDevice(argc, (const char **)argv);
// cudaDeviceProp prop;
// int num_devices, device;
// cudaGetDeviceCount(&num_devices);
// if (num_devices > 1) {
// int max_multiprocessors = 0, max_device = 0;
// for (device = 0; device < num_devices; device++) {
// cudaDeviceProp properties;
// cudaGetDeviceProperties(&properties, device);
// if (max_multiprocessors < properties.multiProcessorCount) {
// max_multiprocessors = properties.multiProcessorCount;
// max_device = device;
// }
// printf("Found GPU device # %i\n",device);
// }
// //cudaSetDevice(max_device);
//
// //set random device!
// cudaSetDevice(rand() % num_devices); //generates a random number between 0 and num_devices to split the threads between GPUs
//
// cudaGetDeviceProperties( &prop, max_device );
// //debug code
// if (prop.canMapHostMemory != 1) {
// printf( "Device can not map memory.\n" );
// }
// printf("L2 Cache size= %u \n",prop.l2CacheSize);
// printf("maxThreadsPerBlock= %u \n",prop.maxThreadsPerBlock);
// printf("maxGridSize= %i \n",prop.maxGridSize[0]);
// printf("sharedMemPerBlock= %lu \n",prop.sharedMemPerBlock);
// printf("deviceOverlap= %i \n",prop.deviceOverlap);
// printf("multiProcessorCount= %i \n",prop.multiProcessorCount);
// }else{
// int whichDevice;
// cudaGetDevice( &whichDevice );
// cudaGetDeviceProperties( &prop, whichDevice );
// //debug code
// if (prop.canMapHostMemory != 1) {
// printf( "Device can not map memory.\n" );
// }
//
// printf("L2 Cache size= %u \n",prop.l2CacheSize);
// printf("maxThreadsPerBlock= %u \n",prop.maxThreadsPerBlock);
// printf("maxGridSize= %i \n",prop.maxGridSize[0]);
// printf("sharedMemPerBlock= %lu \n",prop.sharedMemPerBlock);
// printf("deviceOverlap= %i \n",prop.deviceOverlap);
// printf("multiProcessorCount= %i \n",prop.multiProcessorCount);
// }
//checkCudaErrors(cudaFuncSetCacheConfig(CUDA_32fc_x2_multiply_x2_dot_prod_32fc_, cudaFuncCachePreferShared));
@@ -300,7 +410,7 @@ bool cuda_multicorrelator::init_cuda(const int argc, const char **argv, int sign
// Required: The last correlator tap in d_shifts_samples has the largest sample shift
size_t size_local_code_bytes = local_codes_length_samples * sizeof(GPU_Complex);
checkCudaErrors(cudaMalloc((void **)&d_local_codes_in, size_local_code_bytes));
checkCudaErrors(cudaMalloc((void **)&d_shifts_samples, size+sizeof(int)*n_correlators));
checkCudaErrors(cudaMalloc((void **)&d_shifts_samples, sizeof(int)*n_correlators));
//scalars
checkCudaErrors(cudaMalloc((void **)&d_corr_out, sizeof(std::complex<float>)*n_correlators));
@@ -315,6 +425,116 @@ bool cuda_multicorrelator::init_cuda(const int argc, const char **argv, int sign
}
bool cuda_multicorrelator::init_cuda_integrated_resampler(
const int argc, const char **argv,
int signal_length_samples,
int code_length_chips,
int n_correlators
)
{
// use command-line specified CUDA device, otherwise use device with highest Gflops/s
// findCudaDevice(argc, (const char **)argv);
// cudaDeviceProp prop;
// int num_devices, device;
// cudaGetDeviceCount(&num_devices);
// if (num_devices > 1) {
// int max_multiprocessors = 0, max_device = 0;
// for (device = 0; device < num_devices; device++) {
// cudaDeviceProp properties;
// cudaGetDeviceProperties(&properties, device);
// if (max_multiprocessors < properties.multiProcessorCount) {
// max_multiprocessors = properties.multiProcessorCount;
// max_device = device;
// }
// printf("Found GPU device # %i\n",device);
// }
// //cudaSetDevice(max_device);
//
// //set random device!
// cudaSetDevice(rand() % num_devices); //generates a random number between 0 and num_devices to split the threads between GPUs
//
// cudaGetDeviceProperties( &prop, max_device );
// //debug code
// if (prop.canMapHostMemory != 1) {
// printf( "Device can not map memory.\n" );
// }
// printf("L2 Cache size= %u \n",prop.l2CacheSize);
// printf("maxThreadsPerBlock= %u \n",prop.maxThreadsPerBlock);
// printf("maxGridSize= %i \n",prop.maxGridSize[0]);
// printf("sharedMemPerBlock= %lu \n",prop.sharedMemPerBlock);
// printf("deviceOverlap= %i \n",prop.deviceOverlap);
// printf("multiProcessorCount= %i \n",prop.multiProcessorCount);
// }else{
// int whichDevice;
// cudaGetDevice( &whichDevice );
// cudaGetDeviceProperties( &prop, whichDevice );
// //debug code
// if (prop.canMapHostMemory != 1) {
// printf( "Device can not map memory.\n" );
// }
//
// printf("L2 Cache size= %u \n",prop.l2CacheSize);
// printf("maxThreadsPerBlock= %u \n",prop.maxThreadsPerBlock);
// printf("maxGridSize= %i \n",prop.maxGridSize[0]);
// printf("sharedMemPerBlock= %lu \n",prop.sharedMemPerBlock);
// printf("deviceOverlap= %i \n",prop.deviceOverlap);
// printf("multiProcessorCount= %i \n",prop.multiProcessorCount);
// }
//checkCudaErrors(cudaFuncSetCacheConfig(CUDA_32fc_x2_multiply_x2_dot_prod_32fc_, cudaFuncCachePreferShared));
// ALLOCATE GPU MEMORY FOR INPUT/OUTPUT and INTERNAL vectors
size_t size = signal_length_samples * sizeof(GPU_Complex);
checkCudaErrors(cudaMalloc((void **)&d_sig_in, size));
checkCudaErrors(cudaMemset(d_sig_in,0,size));
//checkCudaErrors(cudaMalloc((void **)&d_nco_in, size));
checkCudaErrors(cudaMalloc((void **)&d_sig_doppler_wiped, size));
checkCudaErrors(cudaMemset(d_sig_doppler_wiped,0,size));
checkCudaErrors(cudaMalloc((void **)&d_local_codes_in, sizeof(std::complex<float>)*code_length_chips));
checkCudaErrors(cudaMemset(d_local_codes_in,0,sizeof(std::complex<float>)*code_length_chips));
d_code_length_chips=code_length_chips;
checkCudaErrors(cudaMalloc((void **)&d_shifts_chips, sizeof(float)*n_correlators));
checkCudaErrors(cudaMemset(d_shifts_chips,0,sizeof(float)*n_correlators));
//scalars
checkCudaErrors(cudaMalloc((void **)&d_corr_out, sizeof(std::complex<float>)*n_correlators));
checkCudaErrors(cudaMemset(d_corr_out,0,sizeof(std::complex<float>)*n_correlators));
// Kernel launch configuration: enough blocks to cover every sample
threadsPerBlock = 256;
blocksPerGrid =(int)(signal_length_samples+threadsPerBlock-1)/threadsPerBlock;
cudaStreamCreate (&stream1) ;
cudaStreamCreate (&stream2) ;
return true;
}
bool cuda_multicorrelator::set_local_code_and_taps(
int code_length_chips,
const std::complex<float>* local_codes_in,
float *shifts_chips,
int n_correlators
)
{
// local code CPU -> GPU copy memory
checkCudaErrors(cudaMemcpyAsync(d_local_codes_in, local_codes_in, sizeof(GPU_Complex)*code_length_chips, cudaMemcpyHostToDevice,stream1));
d_code_length_chips=(float)code_length_chips;
// Correlator shifts vector CPU -> GPU copy memory (fractional chip shifts are allowed!)
checkCudaErrors(cudaMemcpyAsync(d_shifts_chips, shifts_chips, sizeof(float)*n_correlators,
cudaMemcpyHostToDevice,stream1));
return true;
}
bool cuda_multicorrelator::Carrier_wipeoff_multicorrelator_cuda(
std::complex<float>* corr_out,
const std::complex<float>* sig_in,
@@ -396,13 +616,88 @@ bool cuda_multicorrelator::Carrier_wipeoff_multicorrelator_cuda(
return true;
}
bool cuda_multicorrelator::Carrier_wipeoff_multicorrelator_resampler_cuda(
std::complex<float>* corr_out,
const std::complex<float>* sig_in,
float rem_carrier_phase_in_rad,
float phase_step_rad,
float code_phase_step_chips,
float rem_code_phase_chips,
int signal_length_samples,
int n_correlators)
{
size_t memSize = signal_length_samples * sizeof(std::complex<float>);
// input signal CPU -> GPU copy memory
checkCudaErrors(cudaMemcpyAsync(d_sig_in, sig_in, memSize,
cudaMemcpyHostToDevice, stream2));
//***** NOTICE: the NCO is computed on the fly; no need to copy the NCO to the GPU! ****
//Launch carrier wipe-off kernel here, while local codes are being copied to GPU!
checkCudaErrors(cudaStreamSynchronize(stream2));
CUDA_32fc_Doppler_wipeoff<<<blocksPerGrid, threadsPerBlock,0, stream2>>>(d_sig_doppler_wiped, d_sig_in,rem_carrier_phase_in_rad,phase_step_rad, signal_length_samples);
//wait for Doppler wipeoff end...
checkCudaErrors(cudaStreamSynchronize(stream1));
checkCudaErrors(cudaStreamSynchronize(stream2));
//launch the multitap correlator with integrated local code resampler!
scalarProdGPUCPXxN_shifts_chips<<<blocksPerGrid, threadsPerBlock,0 ,stream1>>>(
d_corr_out,
d_sig_doppler_wiped,
d_local_codes_in,
d_shifts_chips,
d_code_length_chips,
code_phase_step_chips,
rem_code_phase_chips,
n_correlators,
signal_length_samples
);
checkCudaErrors(cudaGetLastError());
//wait for correlators end...
checkCudaErrors(cudaStreamSynchronize(stream1));
// Copy the device result vector in device memory to the host result vector
// in host memory.
//scalar products (correlators outputs)
checkCudaErrors(cudaMemcpyAsync(corr_out, d_corr_out, sizeof(std::complex<float>)*n_correlators,
cudaMemcpyDeviceToHost,stream1));
checkCudaErrors(cudaStreamSynchronize(stream1));
return true;
}
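
Putting the new API together, the intended per-channel call sequence appears to be: initialize once, upload the code and taps once per satellite, then run the resampler correlator every integration epoch. A hedged usage sketch (the argument values are assumptions):

// Hypothetical usage of the integrated-resampler path.
cuda_multicorrelator correlator;
correlator.init_cuda_integrated_resampler(0, NULL,
        2 * vector_length,   // maximum signal block length in samples
        1023,                // GPS L1 C/A code length in chips
        3);                  // Early, Prompt, Late
correlator.set_local_code_and_taps(1023, d_ca_code, d_local_code_shift_chips, 3);
// per integration epoch:
correlator.Carrier_wipeoff_multicorrelator_resampler_cuda(
        d_corr_outs,         // E/P/L complex outputs
        in,                  // raw baseband samples
        rem_carr_phase_rad, phase_step_rad,
        code_phase_step_chips, rem_code_phase_chips,
        current_prn_length_samples, 3);
// on channel teardown:
correlator.free_cuda();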
cuda_multicorrelator::cuda_multicorrelator()
{
d_sig_in=NULL;
d_nco_in=NULL;
d_sig_doppler_wiped=NULL;
d_local_codes_in=NULL;
d_shifts_samples=NULL;
d_shifts_chips=NULL;
d_corr_out=NULL;
threadsPerBlock=0;
blocksPerGrid=0;
d_code_length_chips=0;
}
bool cuda_multicorrelator::free_cuda()
{
// Free device global memory
cudaFree(d_sig_in);
//cudaFree(d_nco_in);
cudaFree(d_local_codes_in);
cudaFree(d_corr_out);
if (d_sig_in!=NULL) cudaFree(d_sig_in);
if (d_nco_in!=NULL) cudaFree(d_nco_in);
if (d_sig_doppler_wiped!=NULL) cudaFree(d_sig_doppler_wiped);
if (d_local_codes_in!=NULL) cudaFree(d_local_codes_in);
if (d_corr_out!=NULL) cudaFree(d_corr_out);
if (d_shifts_samples!=NULL) cudaFree(d_shifts_samples);
if (d_shifts_chips!=NULL) cudaFree(d_shifts_chips);
cudaStreamDestroy(stream1) ;
cudaStreamDestroy(stream2) ;

View File: cuda_multicorrelator.h

@@ -113,8 +113,20 @@ struct GPU_Complex_Short {
class cuda_multicorrelator
{
public:
cuda_multicorrelator();
bool init_cuda(const int argc, const char **argv, int signal_length_samples, int local_codes_length_samples, int n_correlators);
bool init_cuda_integrated_resampler(
const int argc, const char **argv,
int signal_length_samples,
int code_length_chips,
int n_correlators
);
bool set_local_code_and_taps(
int code_length_chips,
const std::complex<float>* local_codes_in,
float *shifts_chips,
int n_correlators
);
bool free_cuda();
bool Carrier_wipeoff_multicorrelator_cuda(
std::complex<float>* corr_out,
@@ -125,6 +137,15 @@ public:
const int *shifts_samples,
int signal_length_samples,
int n_correlators);
bool Carrier_wipeoff_multicorrelator_resampler_cuda(
std::complex<float>* corr_out,
const std::complex<float>* sig_in,
float rem_carrier_phase_in_rad,
float phase_step_rad,
float code_phase_step_chips,
float rem_code_phase_chips,
int signal_length_samples,
int n_correlators);
private:
// Allocate the device input vectors
GPU_Complex *d_sig_in;
@@ -133,6 +154,9 @@ private:
GPU_Complex *d_local_codes_in;
GPU_Complex *d_corr_out;
int *d_shifts_samples;
float *d_shifts_chips;
float d_code_length_chips;
int threadsPerBlock;
int blocksPerGrid;

View File: tracking_2nd_PLL_filter.cc

@@ -94,7 +94,7 @@ Tracking_2nd_PLL_filter::Tracking_2nd_PLL_filter ()
{
//--- PLL variables --------------------------------------------------------
d_pdi_carr = 0.001;// Summation interval for carrier
d_plldampingratio = 0.65;
d_plldampingratio = 0.7;
}
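
For context on the constant changed above: in a standard second-order PLL design the damping ratio feeds the natural-frequency and time-constant computation, so raising it from 0.65 to 0.7 gives a slightly more damped loop (less overshoot, marginally slower settling). A sketch of the classic relations (as in Kaplan & Hegarty; the function and parameter names here are assumptions):

// Hypothetical sketch of 2nd-order loop constants from bandwidth and damping.
void calculate_loop_coefs(float noise_bandwidth_hz, float zeta, float gain_k,
                          float* tau1, float* tau2)
{
    // natural frequency from the noise bandwidth and damping ratio
    float Wn = noise_bandwidth_hz * 8.0f * zeta / (4.0f * zeta * zeta + 1.0f);
    *tau1 = gain_k / (Wn * Wn);  // filter time constants for the discrete update
    *tau2 = 2.0f * zeta / Wn;
}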