First working version of the GPU-accelerated GPS tracking block (it requires an NVIDIA

GPU with CUDA compute capability 3.0)
2025-11-07 02:33:03 +00:00 · 2015-07-24 17:21:25 +02:00
parent f722f5f8f7
commit 26cf90cdd4
12 changed files with 631 additions and 36 deletions
--- a/src/algorithms/tracking/adapters/CMakeLists.txt
+++ b/src/algorithms/tracking/adapters/CMakeLists.txt
@@ -16,6 +16,9 @@
 # along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
 #

+if(ENABLE_CUDA)
+	FIND_PACKAGE(CUDA REQUIRED) 
+endif(ENABLE_CUDA)

 set(TRACKING_ADAPTER_SOURCES 
     galileo_e1_dll_pll_veml_tracking.cc
@@ -27,6 +30,7 @@ set(TRACKING_ADAPTER_SOURCES
     gps_l1_ca_tcp_connector_tracking.cc
     galileo_e5a_dll_pll_tracking.cc
     gps_l2_m_dll_pll_tracking.cc
+     gps_l1_ca_dll_pll_tracking_gpu.cc
 )

 include_directories(
@@ -40,6 +44,7 @@ include_directories(
     ${GLOG_INCLUDE_DIRS}
     ${GFlags_INCLUDE_DIRS}
     ${GNURADIO_RUNTIME_INCLUDE_DIRS}
+     ${CUDA_INCLUDE_DIRS}
 )

 file(GLOB TRACKING_ADAPTER_HEADERS "*.h")
--- a/src/algorithms/tracking/adapters/gps_l1_ca_dll_pll_tracking_gpu.cc
+++ b/src/algorithms/tracking/adapters/gps_l1_ca_dll_pll_tracking_gpu.cc
@@ -0,0 +1,158 @@
+/*!
+ * \file gps_l1_ca_dll_pll_tracking_gpu.cc
+ * \brief Implementation of an adapter of a DLL+PLL tracking loop block using GPU accelerated functions
+ * for GPS L1 C/A to a TrackingInterface
+ * \author Javier Arribas, 2015. jarribas(at)cttc.es
+ *
+ * Code DLL + carrier PLL according to the algorithms described in:
+ * K.Borre, D.M.Akos, N.Bertelsen, P.Rinder, and S.H.Jensen,
+ * A Software-Defined GPS and Galileo Receiver. A Single-Frequency
+ * Approach, Birkhauser, 2007
+ *
+ * -------------------------------------------------------------------------
+ *
+ * Copyright (C) 2010-2015  (see AUTHORS file for a list of contributors)
+ *
+ * GNSS-SDR is a software defined Global Navigation
+ *          Satellite Systems receiver
+ *
+ * This file is part of GNSS-SDR.
+ *
+ * GNSS-SDR is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * GNSS-SDR is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * -------------------------------------------------------------------------
+ */
+
+
+#include "gps_l1_ca_dll_pll_tracking_gpu.h"
+#include <glog/logging.h>
+#include "GPS_L1_CA.h"
+#include "configuration_interface.h"
+
+
+using google::LogMessage;
+
+/*
+ * Constructor: reads the tracking parameters for the given role from the
+ * configuration and creates the GPU tracking block.
+ *
+ * configuration  source of the properties read below
+ * role           prefix of the per-block property names (e.g. role + ".dump")
+ * in_streams     stored in in_streams_
+ * out_streams    stored in out_streams_
+ * queue          stored in queue_ and handed to the tracking block factory
+ *
+ * Only the "gr_complex" item type is handled. For any other value a warning is
+ * logged and no tracking block is created, so tracking_ keeps its
+ * default-constructed value.
+ */
+GpsL1CaDllPllTrackingGPU::GpsL1CaDllPllTrackingGPU(
+        ConfigurationInterface* configuration, std::string role,
+        unsigned int in_streams, unsigned int out_streams,
+        boost::shared_ptr<gr::msg_queue> queue) :
+                role_(role), in_streams_(in_streams), out_streams_(out_streams),
+                queue_(queue)
+{
+    DLOG(INFO) << "role " << role;
+    //################# CONFIGURATION PARAMETERS ########################
+    int fs_in;
+    int vector_length;
+    int f_if;
+    bool dump;
+    std::string dump_filename;
+    std::string item_type;
+    std::string default_item_type = "gr_complex";
+    float pll_bw_hz;
+    float dll_bw_hz;
+    float early_late_space_chips;
+    item_type = configuration->property(role + ".item_type", default_item_type);
+    fs_in = configuration->property("GNSS-SDR.internal_fs_hz", 2048000);
+    f_if = configuration->property(role + ".if", 0);
+    dump = configuration->property(role + ".dump", false);
+    pll_bw_hz = configuration->property(role + ".pll_bw_hz", 50.0);
+    dll_bw_hz = configuration->property(role + ".dll_bw_hz", 2.0);
+    early_late_space_chips = configuration->property(role + ".early_late_space_chips", 0.5);
+    std::string default_dump_filename = "./track_ch";
+    dump_filename = configuration->property(role + ".dump_filename",
+            default_dump_filename); //unused!
+    // vector_length is not read from the configuration: it is derived from the
+    // sampling frequency and the code rate / code length ratio
+    vector_length = std::round(fs_in / (GPS_L1_CA_CODE_RATE_HZ / GPS_L1_CA_CODE_LENGTH_CHIPS));
+
+    //################# MAKE TRACKING GNURadio object ###################
+    if (item_type.compare("gr_complex") == 0)
+        {
+            item_size_ = sizeof(gr_complex);
+            tracking_ = gps_l1_ca_dll_pll_make_tracking_gpu_cc(
+                    f_if,
+                    fs_in,
+                    vector_length,
+                    queue_,
+                    dump,
+                    dump_filename,
+                    pll_bw_hz,
+                    dll_bw_hz,
+                    early_late_space_chips);
+        }
+    else
+        {
+            // keep a defined item size even though no tracking block is created
+            item_size_ = sizeof(gr_complex);
+            LOG(WARNING) << item_type << " unknown tracking item type.";
+        }
+    channel_ = 0;
+    channel_internal_queue_ = 0;
+    // tracking_ is only assigned in the "gr_complex" branch above; dereferencing
+    // it unconditionally crashed debug builds for any other item type
+    if (tracking_)
+        {
+            DLOG(INFO) << "tracking(" << tracking_->unique_id() << ")";
+        }
+}
+
+
+/*
+ * Destructor: no explicit clean-up is performed here
+ */
+GpsL1CaDllPllTrackingGPU::~GpsL1CaDllPllTrackingGPU()
+{
+}
+
+
+/*
+ * Forward the start-tracking request to the wrapped tracking block
+ */
+void GpsL1CaDllPllTrackingGPU::start_tracking()
+{
+    this->tracking_->start_tracking();
+}
+
+/*
+ * Store the tracking channel unique ID and pass it on to the wrapped tracking block
+ */
+void GpsL1CaDllPllTrackingGPU::set_channel(unsigned int channel)
+{
+    this->channel_ = channel;
+    // the member now holds the same value as the argument
+    this->tracking_->set_channel(this->channel_);
+}
+
+/*
+ * Store the tracking channel internal queue and pass it on to the wrapped tracking block
+ */
+void GpsL1CaDllPllTrackingGPU::set_channel_queue(
+        concurrent_queue<int> *channel_internal_queue)
+{
+    this->channel_internal_queue_ = channel_internal_queue;
+    // same pointer value as the one just stored in the member
+    this->tracking_->set_channel_queue(channel_internal_queue);
+}
+
+/*
+ * Pass the Gnss_Synchro object pointer on to the wrapped tracking block
+ * (the adapter itself does not keep a copy)
+ */
+void GpsL1CaDllPllTrackingGPU::set_gnss_synchro(Gnss_Synchro* p_gnss_synchro)
+{
+    this->tracking_->set_gnss_synchro(p_gnss_synchro);
+}
+
+/*
+ * Nothing to connect, now the tracking uses gr_sync_decimator
+ */
+void GpsL1CaDllPllTrackingGPU::connect(gr::top_block_sptr top_block)
+{
+    // the argument is intentionally unused; reference it to avoid an unused-parameter warning
+    (void)top_block;
+}
+
+/*
+ * Nothing to disconnect, now the tracking uses gr_sync_decimator
+ */
+void GpsL1CaDllPllTrackingGPU::disconnect(gr::top_block_sptr top_block)
+{
+    // the argument is intentionally unused; reference it to avoid an unused-parameter warning
+    (void)top_block;
+}
+
+/*
+ * The wrapped tracking block is returned as the left block of this adapter
+ */
+gr::basic_block_sptr GpsL1CaDllPllTrackingGPU::get_left_block()
+{
+    gr::basic_block_sptr left_block = tracking_;
+    return left_block;
+}
+
+/*
+ * The wrapped tracking block is returned as the right block of this adapter
+ */
+gr::basic_block_sptr GpsL1CaDllPllTrackingGPU::get_right_block()
+{
+    gr::basic_block_sptr right_block = tracking_;
+    return right_block;
+}
+
--- a/src/algorithms/tracking/adapters/gps_l1_ca_dll_pll_tracking_gpu.h
+++ b/src/algorithms/tracking/adapters/gps_l1_ca_dll_pll_tracking_gpu.h
@@ -0,0 +1,113 @@
+/*!
+ * \file gps_l1_ca_dll_pll_tracking_gpu.h
+ * \brief Interface of an adapter of a DLL+PLL tracking loop block using GPU accelerated functions
+ * for GPS L1 C/A to a TrackingInterface
+ * \author Javier Arribas, 2015. jarribas(at)cttc.es
+ *
+ * Code DLL + carrier PLL according to the algorithms described in:
+ * K.Borre, D.M.Akos, N.Bertelsen, P.Rinder, and S.H.Jensen,
+ * A Software-Defined GPS and Galileo Receiver. A Single-Frequency
+ * Approach, Birkhauser, 2007
+ *
+ * -------------------------------------------------------------------------
+ *
+ * Copyright (C) 2010-2015  (see AUTHORS file for a list of contributors)
+ *
+ * GNSS-SDR is a software defined Global Navigation
+ *          Satellite Systems receiver
+ *
+ * This file is part of GNSS-SDR.
+ *
+ * GNSS-SDR is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * GNSS-SDR is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * -------------------------------------------------------------------------
+ */
+
+#ifndef GNSS_SDR_GPS_L1_CA_DLL_PLL_TRACKING_GPU_H_
+#define GNSS_SDR_GPS_L1_CA_DLL_PLL_TRACKING_GPU_H_
+
+#include <string>
+#include <gnuradio/msg_queue.h>
+#include "tracking_interface.h"
+#include "gps_l1_ca_dll_pll_tracking_gpu_cc.h"
+
+
+class ConfigurationInterface;
+
+/*!
+ * \brief This class implements a code DLL + carrier PLL tracking loop using GPU accelerated functions
+ *
+ * It is an adapter: it derives from TrackingInterface and holds a
+ * gps_l1_ca_dll_pll_tracking_gpu_cc_sptr (tracking_) that the setters below
+ * are declared to configure.
+ */
+class GpsL1CaDllPllTrackingGPU : public TrackingInterface
+{
+public:
+
+    /*!
+     * \brief Builds the adapter from the configuration entries of the given role
+     * \param configuration Configuration object the parameters are read from
+     * \param role Name of this block instance, returned by role()
+     * \param in_streams Number of input streams (stored in in_streams_)
+     * \param out_streams Number of output streams (stored in out_streams_)
+     * \param queue Message queue (stored in queue_)
+     */
+  GpsL1CaDllPllTrackingGPU(ConfigurationInterface* configuration,
+            std::string role,
+            unsigned int in_streams,
+            unsigned int out_streams,
+            boost::shared_ptr<gr::msg_queue> queue);
+
+    virtual ~GpsL1CaDllPllTrackingGPU();
+
+    //! Returns the role name given at construction
+    std::string role()
+    {
+        return role_;
+    }
+
+    //! Returns "GPS_L1_CA_DLL_PLL_Tracking_GPU"
+    std::string implementation()
+    {
+        return "GPS_L1_CA_DLL_PLL_Tracking_GPU";
+    }
+    //! Returns the stored item size (item_size_)
+    size_t item_size()
+    {
+        return item_size_;
+    }
+
+    //! Takes the flowgraph top block
+    void connect(gr::top_block_sptr top_block);
+    //! Takes the flowgraph top block
+    void disconnect(gr::top_block_sptr top_block);
+    //! Returns the block exposed on the input side of this adapter
+    gr::basic_block_sptr get_left_block();
+    //! Returns the block exposed on the output side of this adapter
+    gr::basic_block_sptr get_right_block();
+
+
+    /*!
+     * \brief Set tracking channel unique ID
+     */
+    void set_channel(unsigned int channel);
+
+    /*!
+     * \brief Set acquisition/tracking common Gnss_Synchro object pointer
+     * to efficiently exchange synchronization data between acquisition and tracking blocks
+     */
+    void set_gnss_synchro(Gnss_Synchro* p_gnss_synchro);
+
+    /*!
+     * \brief Set tracking channel internal queue
+     */
+    void set_channel_queue(concurrent_queue<int> *channel_internal_queue);
+
+    //! Starts the tracking process
+    void start_tracking();
+
+private:
+    gps_l1_ca_dll_pll_tracking_gpu_cc_sptr tracking_;  //!< Wrapped GPU tracking block
+    size_t item_size_;                                 //!< Value returned by item_size()
+    unsigned int channel_;                             //!< Tracking channel unique ID
+    std::string role_;                                 //!< Value returned by role()
+    unsigned int in_streams_;                          //!< Number of input streams given at construction
+    unsigned int out_streams_;                         //!< Number of output streams given at construction
+    boost::shared_ptr<gr::msg_queue> queue_;           //!< Message queue given at construction
+    concurrent_queue<int> *channel_internal_queue_;    //!< Tracking channel internal queue (non-owning raw pointer)
+};
+
+#endif // GNSS_SDR_GPS_L1_CA_DLL_PLL_TRACKING_GPU_H_
--- a/src/algorithms/tracking/gnuradio_blocks/CMakeLists.txt
+++ b/src/algorithms/tracking/gnuradio_blocks/CMakeLists.txt
@@ -57,7 +57,8 @@ endif(ENABLE_GENERIC_ARCH)
 file(GLOB TRACKING_GR_BLOCKS_HEADERS "*.h")
 add_library(tracking_gr_blocks ${TRACKING_GR_BLOCKS_SOURCES} ${TRACKING_GR_BLOCKS_HEADERS})
 source_group(Headers FILES ${TRACKING_GR_BLOCKS_HEADERS})
-target_link_libraries(tracking_gr_blocks tracking_lib ${GNURADIO_RUNTIME_LIBRARIES} gnss_sp_libs ${Boost_LIBRARIES} ${VOLK_GNSSSDR_LIBRARIES} ${ORC_LIBRARIES} )
+
+target_link_libraries(tracking_gr_blocks tracking_lib ${GNURADIO_RUNTIME_LIBRARIES} gnss_sp_libs ${Boost_LIBRARIES} ${VOLK_GNSSSDR_LIBRARIES} ${ORC_LIBRARIES} ${CUDA_LIBRARIES})
 if(NOT VOLK_GNSSSDR_FOUND)
    add_dependencies(tracking_gr_blocks volk_gnsssdr_module)
 endif(NOT VOLK_GNSSSDR_FOUND)
--- a/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_gpu_cc.cc
+++ b/src/algorithms/tracking/gnuradio_blocks/gps_l1_ca_dll_pll_tracking_gpu_cc.cc
@@ -48,17 +48,11 @@
 #include "GPS_L1_CA.h"
 #include "control_message_factory.h"
 #include <volk/volk.h> //volk_alignement
-
-#include <cuda.h>
-// CUDA runtime
-#include <cuda_runtime.h>
 // includes
 #include <cuda_profiler_api.h>
 #include <helper_functions.h>  // helper for shared functions common to CUDA Samples
 #include <helper_cuda.h>       // helper functions for CUDA error checking and initialization

-
-
 /*!
 * \todo Include in definition header file
 */
@@ -130,8 +124,9 @@ Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc(


    multicorrelator_gpu = new cuda_multicorrelator();
-    // Get space for the resampled early / prompt / late local replicas
    int N_CORRELATORS=3;
+    multicorrelator_gpu->init_cuda(0, NULL, 2 * d_vector_length , 2 * d_vector_length , N_CORRELATORS);
+    // Get space for the resampled early / prompt / late local replicas
 	checkCudaErrors(cudaHostAlloc((void**)&d_local_code_shift_samples, N_CORRELATORS * sizeof(int),  cudaHostAllocMapped ));


@@ -323,9 +318,6 @@ Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::~Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc()
 	multicorrelator_gpu->free_cuda();
 	delete(multicorrelator_gpu);

-    volk_free(d_Early);
-    volk_free(d_Prompt);
-    volk_free(d_Late);
    volk_free(d_ca_code);

    delete[] d_Prompt_buffer;
@@ -381,18 +373,20 @@ int Gps_L1_Ca_Dll_Pll_Tracking_GPU_cc::general_work (int noutput_items, gr_vecto

            // UPDATE NCO COMMAND
            float phase_step_rad = static_cast<float>(GPS_TWO_PI) * d_carrier_doppler_hz / static_cast<float>(d_fs_in);
-
+            //std::cout<<"d_current_prn_length_samples="<<d_current_prn_length_samples<<std::endl;
            // perform carrier wipe-off and compute Early, Prompt and Late correlation
+        	cudaProfilerStart();
            multicorrelator_gpu->Carrier_wipeoff_multicorrelator_cuda(
    				d_corr_outs_gpu,
-    				in,//in_gpu,
+    				in,
    				d_local_codes_gpu,
    				d_rem_carr_phase_rad,
    				phase_step_rad,
    				d_local_code_shift_samples,
    				d_current_prn_length_samples,
    				3);
-
+        	cudaProfilerStop();
+            //std::cout<<"d_Prompt="<<*d_Prompt<<"d_Early="<<*d_Early<<"d_Late="<<*d_Late<<std::endl;
            // check for samples consistency (this should be done before in the receiver / here only if the source is a file)
            if (std::isnan((*d_Prompt).real()) == true or std::isnan((*d_Prompt).imag()) == true ) // or std::isinf(in[i].real())==true or std::isinf(in[i].imag())==true)
                {
--- a/src/algorithms/tracking/libs/CMakeLists.txt
+++ b/src/algorithms/tracking/libs/CMakeLists.txt
@@ -21,9 +21,9 @@ if(ENABLE_CUDA)
 	FIND_PACKAGE(CUDA REQUIRED)
 	
 	# Append current NVCC flags by something, eg comput capability
-	# set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} --gpu-architecture sm_30 --default-stream-per-thread)
+	# set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} --gpu-architecture sm_30)
 	
-	list(APPEND CUDA_NVCC_FLAGS "-gencode arch=compute_30,code=sm_30; -std=c++11;-O3; -use_fast_math")
+	list(APPEND CUDA_NVCC_FLAGS "-gencode arch=compute_30,code=sm_30; -std=c++11;-O3; -use_fast_math -default-stream per-thread")
 	SET(CUDA_PROPAGATE_HOST_FLAGS OFF)
 	
 	CUDA_INCLUDE_DIRECTORIES(
@@ -33,8 +33,6 @@ if(ENABLE_CUDA)

 	SET(LIB_TYPE STATIC) #set the lib type
 	CUDA_ADD_LIBRARY(CUDA_CORRELATOR_LIB ${LIB_TYPE} cuda_multicorrelator.h cuda_multicorrelator.cu)
-
-	set(OPT_LIBRARIES ${OPT_LIBRARIES} ${CUDA_CORRELATOR_LIB})
    
 endif(ENABLE_CUDA)

@@ -71,4 +69,4 @@ endif(SSE3_AVAILABLE)
 file(GLOB TRACKING_LIB_HEADERS "*.h")
 add_library(tracking_lib ${TRACKING_LIB_SOURCES} ${TRACKING_LIB_HEADERS})
 source_group(Headers FILES ${TRACKING_LIB_HEADERS})
-target_link_libraries(tracking_lib ${VOLK_LIBRARIES} ${GNURADIO_RUNTIME_LIBRARIES} ${OPT_LIBRARIES})
+target_link_libraries(tracking_lib CUDA_CORRELATOR_LIB ${VOLK_LIBRARIES} ${GNURADIO_RUNTIME_LIBRARIES})
--- a/src/algorithms/tracking/libs/cuda_multicorrelator.cu
+++ b/src/algorithms/tracking/libs/cuda_multicorrelator.cu
@@ -261,7 +261,7 @@ CUDA_32fc_x2_add_32fc(  GPU_Complex *A,   GPU_Complex  *B, GPU_Complex  *C, int
 }


-bool cuda_multicorrelator::init_cuda(const int argc, const char **argv, int signal_length_samples, int *shifts_samples, int n_correlators)
+bool cuda_multicorrelator::init_cuda(const int argc, const char **argv, int signal_length_samples, int local_codes_length_samples, int n_correlators)
 {
 	// use command-line specified CUDA device, otherwise use device with highest Gflops/s
 	findCudaDevice(argc, (const char **)argv);
@@ -298,7 +298,8 @@ bool cuda_multicorrelator::init_cuda(const int argc, const char **argv, int sign

 	// new version: only one vector with extra samples to shift the local code for the correlator set
 	// Required: The last correlator tap in d_shifts_samples has the largest sample shift
-	checkCudaErrors(cudaMalloc((void **)&d_local_codes_in, size+sizeof(GPU_Complex)*shifts_samples[n_correlators-1]));
+    size_t size_local_code_bytes = local_codes_length_samples * sizeof(GPU_Complex);
+	checkCudaErrors(cudaMalloc((void **)&d_local_codes_in, size_local_code_bytes));
 	checkCudaErrors(cudaMalloc((void **)&d_shifts_samples, size+sizeof(int)*n_correlators));

 	//scalars
@@ -308,6 +309,8 @@ bool cuda_multicorrelator::init_cuda(const int argc, const char **argv, int sign
 	threadsPerBlock = 256;
    blocksPerGrid =(int)(signal_length_samples+threadsPerBlock-1)/threadsPerBlock;

+	cudaStreamCreate (&stream1) ;
+	cudaStreamCreate (&stream2) ;
 	return true;
 }

@@ -323,14 +326,10 @@ bool cuda_multicorrelator::Carrier_wipeoff_multicorrelator_cuda(
 		int n_correlators)
 	{

-	cudaStream_t stream1;
-	cudaStream_t stream2;
-	cudaStreamCreate ( &stream1) ;
-	cudaStreamCreate ( &stream2) ;
-
 	size_t memSize = signal_length_samples * sizeof(std::complex<float>);

 	// input signal CPU -> GPU copy memory
+
    checkCudaErrors(cudaMemcpyAsync(d_sig_in, sig_in, memSize,
                                    cudaMemcpyHostToDevice, stream1));

@@ -362,7 +361,9 @@ bool cuda_multicorrelator::Carrier_wipeoff_multicorrelator_cuda(
    //printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);

    //wait for Doppler wipeoff end...
-    checkCudaErrors(cudaDeviceSynchronize());
+    checkCudaErrors(cudaStreamSynchronize(stream1));
+    checkCudaErrors(cudaStreamSynchronize(stream2));
+    //checkCudaErrors(cudaDeviceSynchronize());

    //old
 //    scalarProdGPUCPXxN<<<blocksPerGrid, threadsPerBlock,0 ,stream2>>>(
@@ -385,16 +386,13 @@ bool cuda_multicorrelator::Carrier_wipeoff_multicorrelator_cuda(
 		);
    checkCudaErrors(cudaGetLastError());
    //wait for correlators end...
-    checkCudaErrors(cudaDeviceSynchronize());
+    checkCudaErrors(cudaStreamSynchronize(stream2));
    // Copy the device result vector in device memory to the host result vector
    // in host memory.

    //scalar products (correlators outputs)
-    checkCudaErrors(cudaMemcpyAsync(corr_out, d_corr_out, sizeof(std::complex<float>)*n_correlators,
-            cudaMemcpyDeviceToHost, 0));
-
-	cudaStreamDestroy(stream1) ;
-	cudaStreamDestroy(stream2) ;
+    checkCudaErrors(cudaMemcpy(corr_out, d_corr_out, sizeof(std::complex<float>)*n_correlators,
+            cudaMemcpyDeviceToHost));
    return true;
 }

@@ -406,13 +404,16 @@ bool cuda_multicorrelator::free_cuda()
 	cudaFree(d_local_codes_in);
 	cudaFree(d_corr_out);

+	cudaStreamDestroy(stream1) ;
+	cudaStreamDestroy(stream2) ;
+
    // Reset the device and exit
    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice.  It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits
-	checkCudaErrors(cudaDeviceReset());
+	//checkCudaErrors(cudaDeviceReset());
 	return true;
 }

--- a/src/algorithms/tracking/libs/cuda_multicorrelator.h
+++ b/src/algorithms/tracking/libs/cuda_multicorrelator.h
@@ -46,6 +46,10 @@

 #include <complex>

+#include <cuda.h>
+// CUDA runtime
+#include <cuda_runtime.h>
+
 // GPU new internal data types for complex numbers

 struct GPU_Complex {
@@ -109,7 +113,7 @@ struct GPU_Complex_Short {
 class cuda_multicorrelator
 {
 public:
-	bool init_cuda(const int argc, const char **argv, int signal_length_samples, int *shifts_samples, int n_correlators);
+	bool init_cuda(const int argc, const char **argv, int signal_length_samples, int local_codes_length_samples, int n_correlators);

 	bool free_cuda();
 	bool Carrier_wipeoff_multicorrelator_cuda(
@@ -132,6 +136,9 @@ private:
 	int threadsPerBlock;
 	int blocksPerGrid;

+	cudaStream_t stream1;
+	cudaStream_t stream2;
+
 };