diff --git a/CMakeLists.txt b/CMakeLists.txt
index a3207b2e3..6040211a9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -71,6 +71,11 @@ if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
endif(${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
+########################################################################
+# Find OpenCL installation
+########################################################################
+find_package(OpenCL)
+
################################################################################
# Googletest - http://code.google.com/p/googletest/
@@ -520,3 +525,4 @@ add_custom_target(uninstall
########################################################################
add_subdirectory(src)
+
diff --git a/cmake/Modules/FindOpenCL.cmake b/cmake/Modules/FindOpenCL.cmake
new file mode 100644
index 000000000..1229090a9
--- /dev/null
+++ b/cmake/Modules/FindOpenCL.cmake
@@ -0,0 +1,99 @@
+#
+# This file taken from FindOpenCL project @ http://gitorious.com/findopencl
+#
+# - Try to find OpenCL
+# This module tries to find an OpenCL implementation on your system. It supports
+# AMD / ATI, Apple and NVIDIA implementations, but shoudl work, too.
+#
+# Once done this will define
+# OPENCL_FOUND - system has OpenCL
+# OPENCL_INCLUDE_DIRS - the OpenCL include directory
+# OPENCL_LIBRARIES - link these to use OpenCL
+#
+# WIN32 should work, but is untested
+
+FIND_PACKAGE( PackageHandleStandardArgs )
+
+SET (OPENCL_VERSION_STRING "0.1.0")
+SET (OPENCL_VERSION_MAJOR 0)
+SET (OPENCL_VERSION_MINOR 1)
+SET (OPENCL_VERSION_PATCH 0)
+
+IF (APPLE)
+
+ FIND_LIBRARY(OPENCL_LIBRARIES OpenCL DOC "OpenCL lib for OSX")
+ FIND_PATH(OPENCL_INCLUDE_DIRS OpenCL/cl.h DOC "Include for OpenCL on OSX")
+ FIND_PATH(_OPENCL_CPP_INCLUDE_DIRS OpenCL/cl.hpp DOC "Include for OpenCL CPP bindings on OSX")
+
+ELSE (APPLE)
+
+ IF (WIN32)
+
+ FIND_PATH(OPENCL_INCLUDE_DIRS CL/cl.h)
+ FIND_PATH(_OPENCL_CPP_INCLUDE_DIRS CL/cl.hpp)
+
+ # The AMD SDK currently installs both x86 and x86_64 libraries
+ # This is only a hack to find out architecture
+ IF( ${CMAKE_SYSTEM_PROCESSOR} STREQUAL "AMD64" )
+ SET(OPENCL_LIB_DIR "$ENV{ATISTREAMSDKROOT}/lib/x86_64")
+ SET(OPENCL_LIB_DIR "$ENV{ATIINTERNALSTREAMSDKROOT}/lib/x86_64")
+ ELSE (${CMAKE_SYSTEM_PROCESSOR} STREQUAL "AMD64")
+ SET(OPENCL_LIB_DIR "$ENV{ATISTREAMSDKROOT}/lib/x86")
+ SET(OPENCL_LIB_DIR "$ENV{ATIINTERNALSTREAMSDKROOT}/lib/x86")
+ ENDIF( ${CMAKE_SYSTEM_PROCESSOR} STREQUAL "AMD64" )
+
+ # find out if the user asked for a 64-bit build, and use the corresponding
+ # 64 or 32 bit NVIDIA library paths to the search:
+ STRING(REGEX MATCH "Win64" ISWIN64 ${CMAKE_GENERATOR})
+ IF("${ISWIN64}" STREQUAL "Win64")
+ FIND_LIBRARY(OPENCL_LIBRARIES OpenCL.lib ${OPENCL_LIB_DIR} $ENV{CUDA_LIB_PATH} $ENV{CUDA_PATH}/lib/x64)
+ ELSE("${ISWIN64}" STREQUAL "Win64")
+ FIND_LIBRARY(OPENCL_LIBRARIES OpenCL.lib ${OPENCL_LIB_DIR} $ENV{CUDA_LIB_PATH} $ENV{CUDA_PATH}/lib/Win32)
+ ENDIF("${ISWIN64}" STREQUAL "Win64")
+
+ GET_FILENAME_COMPONENT(_OPENCL_INC_CAND ${OPENCL_LIB_DIR}/../../include ABSOLUTE)
+
+ # On Win32 search relative to the library
+ FIND_PATH(OPENCL_INCLUDE_DIRS CL/cl.h PATHS "${_OPENCL_INC_CAND}" $ENV{CUDA_INC_PATH} $ENV{CUDA_PATH}/include)
+ FIND_PATH(_OPENCL_CPP_INCLUDE_DIRS CL/cl.hpp PATHS "${_OPENCL_INC_CAND}" $ENV{CUDA_INC_PATH} $ENV{CUDA_PATH}/include)
+
+ ELSE (WIN32)
+
+ # Unix style platforms
+ FIND_LIBRARY(OPENCL_LIBRARIES OpenCL
+ ENV LD_LIBRARY_PATH
+ )
+
+ GET_FILENAME_COMPONENT(OPENCL_LIB_DIR ${OPENCL_LIBRARIES} PATH)
+ GET_FILENAME_COMPONENT(_OPENCL_INC_CAND ${OPENCL_LIB_DIR}/../../include ABSOLUTE)
+
+ # The AMD SDK currently does not place its headers
+ # in /usr/include, therefore also search relative
+ # to the library
+ FIND_PATH(OPENCL_INCLUDE_DIRS CL/cl.h PATHS ${_OPENCL_INC_CAND} "/usr/local/cuda/include")
+ FIND_PATH(_OPENCL_CPP_INCLUDE_DIRS CL/cl.hpp PATHS ${_OPENCL_INC_CAND} "/usr/local/cuda/include")
+
+ ENDIF (WIN32)
+
+ENDIF (APPLE)
+
+FIND_PACKAGE_HANDLE_STANDARD_ARGS( OpenCL DEFAULT_MSG OPENCL_LIBRARIES OPENCL_INCLUDE_DIRS )
+
+IF( _OPENCL_CPP_INCLUDE_DIRS )
+ SET( OPENCL_HAS_CPP_BINDINGS TRUE )
+ LIST( APPEND OPENCL_INCLUDE_DIRS ${_OPENCL_CPP_INCLUDE_DIRS} )
+ # This is often the same, so clean up
+ LIST( REMOVE_DUPLICATES OPENCL_INCLUDE_DIRS )
+ENDIF( _OPENCL_CPP_INCLUDE_DIRS )
+
+MARK_AS_ADVANCED(
+ OPENCL_INCLUDE_DIRS
+)
+
+IF( OPENCL_INCLUDE_DIRS AND OPENCL_LIBRARIES )
+ SET( OPENCL_FOUND TRUE )
+ add_definitions( -DOPENCL=1 )
+ELSE( OPENCL_INCLUDE_DIRS AND OPENCL_LIBRARIES )
+ SET( OPENCL_FOUND FALSE )
+ add_definitions( -DOPENCL=0 )
+ENDIF( OPENCL_INCLUDE_DIRS AND OPENCL_LIBRARIES )
diff --git a/install/math_kernel.cl b/install/math_kernel.cl
new file mode 100644
index 000000000..3b1601877
--- /dev/null
+++ b/install/math_kernel.cl
@@ -0,0 +1,29 @@
+#define MUL_RE(a,b) (a.x*b.x - a.y*b.y)
+#define MUL_IM(a,b) (a.x*b.y + a.y*b.x)
+#define SUM_RE(a,b) (a.x + b.x)
+#define SUM_IM(a,b) (a.y + b.y)
+
+__kernel void add_vectors(__global const float2* src1, __global const float2* src2, __global float2* dest)
+{
+ int gid = get_global_id(0);
+ dest[gid] = (float2)(SUM_RE(src1[gid],src2[gid]),SUM_IM(src1[gid],src2[gid]));
+}
+
+__kernel void mult_vectors(__global const float2* src1, __global const float2* src2, __global float2* dest)
+{
+ int gid = get_global_id(0);
+ dest[gid] = (float2)(MUL_RE(src1[gid],src2[gid]),MUL_IM(src1[gid],src2[gid]));
+}
+
+__kernel void conj_vector(__global const float2* src, __global float2* dest)
+{
+ int gid = get_global_id(0);
+ dest[gid] = ((float2)(1,-1)) * src[gid];
+}
+
+__kernel void magnitude_squared(__global const float2* src, __global float* dest)
+{
+ int gid = get_global_id(0);
+ dest[gid] = src[gid].x*src[gid].x + src[gid].y*src[gid].y;
+}
+
diff --git a/src/algorithms/acquisition/adapters/CMakeLists.txt b/src/algorithms/acquisition/adapters/CMakeLists.txt
index 631ad5f78..097f608c4 100644
--- a/src/algorithms/acquisition/adapters/CMakeLists.txt
+++ b/src/algorithms/acquisition/adapters/CMakeLists.txt
@@ -16,17 +16,32 @@
# along with GNSS-SDR. If not, see .
#
-set(ACQ_ADAPTER_SOURCES
- gps_l1_ca_pcps_acquisition.cc
- gps_l1_ca_pcps_multithread_acquisition.cc
- gps_l1_ca_pcps_assisted_acquisition.cc
- gps_l1_ca_pcps_acquisition_fine_doppler.cc
- gps_l1_ca_pcps_tong_acquisition.cc
- galileo_e1_pcps_ambiguous_acquisition.cc
- galileo_e1_pcps_cccwsr_ambiguous_acquisition.cc
- galileo_e1_pcps_tong_ambiguous_acquisition.cc
- galileo_e1_pcps_8ms_ambiguous_acquisition.cc
-)
+if(OPENCL_FOUND)
+ set(ACQ_ADAPTER_SOURCES
+ gps_l1_ca_pcps_acquisition.cc
+ gps_l1_ca_pcps_multithread_acquisition.cc
+ gps_l1_ca_pcps_assisted_acquisition.cc
+ gps_l1_ca_pcps_acquisition_fine_doppler.cc
+ gps_l1_ca_pcps_tong_acquisition.cc
+ gps_l1_ca_pcps_opencl_acquisition.cc
+ galileo_e1_pcps_ambiguous_acquisition.cc
+ galileo_e1_pcps_cccwsr_ambiguous_acquisition.cc
+ galileo_e1_pcps_tong_ambiguous_acquisition.cc
+ galileo_e1_pcps_8ms_ambiguous_acquisition.cc
+ )
+else(OPENCL_FOUND)
+ set(ACQ_ADAPTER_SOURCES
+ gps_l1_ca_pcps_acquisition.cc
+ gps_l1_ca_pcps_multithread_acquisition.cc
+ gps_l1_ca_pcps_assisted_acquisition.cc
+ gps_l1_ca_pcps_acquisition_fine_doppler.cc
+ gps_l1_ca_pcps_tong_acquisition.cc
+ galileo_e1_pcps_ambiguous_acquisition.cc
+ galileo_e1_pcps_cccwsr_ambiguous_acquisition.cc
+ galileo_e1_pcps_tong_ambiguous_acquisition.cc
+ galileo_e1_pcps_8ms_ambiguous_acquisition.cc
+ )
+endif(OPENCL_FOUND)
include_directories(
$(CMAKE_CURRENT_SOURCE_DIR)
diff --git a/src/algorithms/acquisition/adapters/gps_l1_ca_pcps_multithread_acquisition.h b/src/algorithms/acquisition/adapters/gps_l1_ca_pcps_multithread_acquisition.h
index cafc3c693..1738d28dc 100644
--- a/src/algorithms/acquisition/adapters/gps_l1_ca_pcps_multithread_acquisition.h
+++ b/src/algorithms/acquisition/adapters/gps_l1_ca_pcps_multithread_acquisition.h
@@ -29,8 +29,8 @@
* -------------------------------------------------------------------------
*/
-#ifndef GNSS_SDR_GPS_L1_CA_PCPS_MULTITHREAD_CQUISITION_H_
-#define GNSS_SDR_GPS_L1_CA_PCPS_MULTITHREAD_CQUISITION_H_
+#ifndef GNSS_SDR_GPS_L1_CA_PCPS_MULTITHREAD_ACQUISITION_H_
+#define GNSS_SDR_GPS_L1_CA_PCPS_MULTITHREAD_ACQUISITION_H_
#include "gnss_synchro.h"
#include "acquisition_interface.h"
@@ -159,4 +159,4 @@ private:
float calculate_threshold(float pfa);
};
-#endif /* GNSS_SDR_GPS_L1_CA_PCPS_MULTITHREAD_CQUISITION_H_ */
+#endif /* GNSS_SDR_GPS_L1_CA_PCPS_MULTITHREAD_ACQUISITION_H_ */
diff --git a/src/algorithms/acquisition/adapters/gps_l1_ca_pcps_opencl_acquisition.cc b/src/algorithms/acquisition/adapters/gps_l1_ca_pcps_opencl_acquisition.cc
new file mode 100644
index 000000000..a39b68f35
--- /dev/null
+++ b/src/algorithms/acquisition/adapters/gps_l1_ca_pcps_opencl_acquisition.cc
@@ -0,0 +1,296 @@
+/*!
+ * \file gps_l1_ca_pcps_opencl_acquisition.cc
+ * \brief Adapts an OpenCL PCPS acquisition block to an
+ * AcquisitionInterface for GPS L1 C/A signals
+ * \author Marc Molina, 2013. marc.molina.pena(at)gmail.com
+ *
+ * -------------------------------------------------------------------------
+ *
+ * Copyright (C) 2010-2012 (see AUTHORS file for a list of contributors)
+ *
+ * GNSS-SDR is a software defined Global Navigation
+ * Satellite Systems receiver
+ *
+ * This file is part of GNSS-SDR.
+ *
+ * GNSS-SDR is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * at your option) any later version.
+ *
+ * GNSS-SDR is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNSS-SDR. If not, see .
+ *
+ * -------------------------------------------------------------------------
+ */
+
+#include "gps_l1_ca_pcps_opencl_acquisition.h"
+#include "gps_sdr_signal_processing.h"
+#include "GPS_L1_CA.h"
+#include "configuration_interface.h"
+#include
+#include
+#include
+#include
+#include
+#include
+
+using google::LogMessage;
+
+GpsL1CaPcpsOpenClAcquisition::GpsL1CaPcpsOpenClAcquisition(
+ ConfigurationInterface* configuration, std::string role,
+ unsigned int in_streams, unsigned int out_streams,
+ gr::msg_queue::sptr queue) :
+ role_(role), in_streams_(in_streams), out_streams_(out_streams), queue_(queue)
+{
+ configuration_ = configuration;
+ std::string default_item_type = "gr_complex";
+ std::string default_dump_filename = "./data/acquisition.dat";
+
+ DLOG(INFO) << "role " << role;
+
+ item_type_ = configuration_->property(role + ".item_type",
+ default_item_type);
+
+ fs_in_ = configuration_->property("GNSS-SDR.internal_fs_hz", 2048000);
+ if_ = configuration_->property(role + ".ifreq", 0);
+ dump_ = configuration_->property(role + ".dump", false);
+ shift_resolution_ = configuration_->property(role + ".doppler_max", 15);
+ sampled_ms_ = configuration_->property(role + ".coherent_integration_time_ms", 1);
+
+ bit_transition_flag_ = configuration_->property("Acquisition.bit_transition_flag", false);
+
+ if (!bit_transition_flag_)
+ {
+ max_dwells_ = configuration_->property(role + ".max_dwells", 1);
+ }
+ else
+ {
+ max_dwells_ = 2;
+ }
+
+ dump_filename_ = configuration_->property(role + ".dump_filename",
+ default_dump_filename);
+
+ //--- Find number of samples per spreading code -------------------------
+ code_length_ = round(fs_in_
+ / (GPS_L1_CA_CODE_RATE_HZ / GPS_L1_CA_CODE_LENGTH_CHIPS));
+
+ vector_length_ = code_length_ * sampled_ms_;
+
+ code_= new gr_complex[vector_length_];
+
+ if (item_type_.compare("gr_complex") == 0)
+ {
+ item_size_ = sizeof(gr_complex);
+ acquisition_cc_ = pcps_make_opencl_acquisition_cc(sampled_ms_, max_dwells_,
+ shift_resolution_, if_, fs_in_, code_length_, code_length_,
+ bit_transition_flag_, queue_, dump_, dump_filename_);
+
+ stream_to_vector_ = gr::blocks::stream_to_vector::make(item_size_, vector_length_);
+
+ DLOG(INFO) << "stream_to_vector(" << stream_to_vector_->unique_id()
+ << ")";
+ DLOG(INFO) << "acquisition(" << acquisition_cc_->unique_id()
+ << ")";
+ }
+ else
+ {
+ LOG_AT_LEVEL(WARNING) << item_type_
+ << " unknown acquisition item type";
+ }
+}
+
+
+GpsL1CaPcpsOpenClAcquisition::~GpsL1CaPcpsOpenClAcquisition()
+{
+ delete[] code_;
+}
+
+
+void GpsL1CaPcpsOpenClAcquisition::set_channel(unsigned int channel)
+{
+ channel_ = channel;
+ if (item_type_.compare("gr_complex") == 0)
+ {
+ acquisition_cc_->set_channel(channel_);
+ }
+}
+
+
+void GpsL1CaPcpsOpenClAcquisition::set_threshold(float threshold)
+{
+ float pfa = configuration_->property(role_ + boost::lexical_cast(channel_) + ".pfa", 0.0);
+
+ if(pfa==0.0)
+ {
+ pfa = configuration_->property(role_+".pfa", 0.0);
+ }
+ if(pfa==0.0)
+ {
+ threshold_ = threshold;
+ }
+ else
+ {
+ threshold_ = calculate_threshold(pfa);
+ }
+
+ DLOG(INFO) <<"Channel "<set_threshold(threshold_);
+ }
+}
+
+
+void GpsL1CaPcpsOpenClAcquisition::set_doppler_max(unsigned int doppler_max)
+{
+ doppler_max_ = doppler_max;
+ if (item_type_.compare("gr_complex") == 0)
+ {
+ acquisition_cc_->set_doppler_max(doppler_max_);
+ }
+}
+
+
+void GpsL1CaPcpsOpenClAcquisition::set_doppler_step(unsigned int doppler_step)
+{
+ doppler_step_ = doppler_step;
+ if (item_type_.compare("gr_complex") == 0)
+ {
+ acquisition_cc_->set_doppler_step(doppler_step_);
+ }
+
+}
+
+
+void GpsL1CaPcpsOpenClAcquisition::set_channel_queue(
+ concurrent_queue *channel_internal_queue)
+{
+ channel_internal_queue_ = channel_internal_queue;
+ if (item_type_.compare("gr_complex") == 0)
+ {
+ acquisition_cc_->set_channel_queue(channel_internal_queue_);
+ }
+}
+
+
+void GpsL1CaPcpsOpenClAcquisition::set_gnss_synchro(Gnss_Synchro* gnss_synchro)
+{
+ gnss_synchro_ = gnss_synchro;
+ if (item_type_.compare("gr_complex") == 0)
+ {
+ acquisition_cc_->set_gnss_synchro(gnss_synchro_);
+ }
+}
+
+
+signed int GpsL1CaPcpsOpenClAcquisition::mag()
+{
+ if (item_type_.compare("gr_complex") == 0)
+ {
+ return acquisition_cc_->mag();
+ }
+ else
+ {
+ return 0;
+ }
+}
+
+
+void GpsL1CaPcpsOpenClAcquisition::init()
+{
+ acquisition_cc_->init();
+ set_local_code();
+}
+
+
+void GpsL1CaPcpsOpenClAcquisition::set_local_code()
+{
+ if (item_type_.compare("gr_complex") == 0)
+ {
+ std::complex* code = new std::complex[code_length_];
+
+ gps_l1_ca_code_gen_complex_sampled(code, gnss_synchro_->PRN, fs_in_, 0);
+
+ for (unsigned int i = 0; i < sampled_ms_; i++)
+ {
+ memcpy(&(code_[i*code_length_]), code,
+ sizeof(gr_complex)*code_length_);
+ }
+
+ acquisition_cc_->set_local_code(code_);
+
+ delete[] code;
+ }
+}
+
+
+void GpsL1CaPcpsOpenClAcquisition::reset()
+{
+ if (item_type_.compare("gr_complex") == 0)
+ {
+ acquisition_cc_->set_active(true);
+ }
+}
+
+
+float GpsL1CaPcpsOpenClAcquisition::calculate_threshold(float pfa)
+{
+ //Calculate the threshold
+
+ unsigned int frequency_bins = 0;
+ for (int doppler = (int)(-doppler_max_); doppler <= (int)doppler_max_; doppler += doppler_step_)
+ {
+ frequency_bins++;
+ }
+
+ DLOG(INFO) <<"Channel "<connect(stream_to_vector_, 0, acquisition_cc_, 0);
+ }
+
+}
+
+
+void GpsL1CaPcpsOpenClAcquisition::disconnect(gr::top_block_sptr top_block)
+{
+ if (item_type_.compare("gr_complex") == 0)
+ {
+ top_block->disconnect(stream_to_vector_, 0, acquisition_cc_, 0);
+ }
+}
+
+
+gr::basic_block_sptr GpsL1CaPcpsOpenClAcquisition::get_left_block()
+{
+ return stream_to_vector_;
+}
+
+
+gr::basic_block_sptr GpsL1CaPcpsOpenClAcquisition::get_right_block()
+{
+ return acquisition_cc_;
+}
+
diff --git a/src/algorithms/acquisition/adapters/gps_l1_ca_pcps_opencl_acquisition.h b/src/algorithms/acquisition/adapters/gps_l1_ca_pcps_opencl_acquisition.h
new file mode 100644
index 000000000..c76813edd
--- /dev/null
+++ b/src/algorithms/acquisition/adapters/gps_l1_ca_pcps_opencl_acquisition.h
@@ -0,0 +1,162 @@
+/*!
+ * \file gps_l1_ca_pcps_opencl_acquisition.h
+ * \brief Adapts an OpenCL PCPS acquisition block to an
+ * AcquisitionInterface for GPS L1 C/A signals
+ * \author Marc Molina, 2013. marc.molina.pena(at)gmail.com
+ *
+ * -------------------------------------------------------------------------
+ *
+ * Copyright (C) 2010-2012 (see AUTHORS file for a list of contributors)
+ *
+ * GNSS-SDR is a software defined Global Navigation
+ * Satellite Systems receiver
+ *
+ * This file is part of GNSS-SDR.
+ *
+ * GNSS-SDR is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * at your option) any later version.
+ *
+ * GNSS-SDR is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNSS-SDR. If not, see .
+ *
+ * -------------------------------------------------------------------------
+ */
+
+#ifndef GNSS_SDR_GPS_L1_CA_PCPS_OPENCL_ACQUISITION_H_
+#define GNSS_SDR_GPS_L1_CA_PCPS_OPENCL_ACQUISITION_H_
+
+#include "gnss_synchro.h"
+#include "acquisition_interface.h"
+#include "pcps_opencl_acquisition_cc.h"
+#include
+#include
+
+
+class ConfigurationInterface;
+
+/*!
+ * \brief This class adapts an OpenCL PCPS acquisition block to an
+ * AcquisitionInterface for GPS L1 C/A signals
+ */
+class GpsL1CaPcpsOpenClAcquisition: public AcquisitionInterface
+{
+public:
+ GpsL1CaPcpsOpenClAcquisition(ConfigurationInterface* configuration,
+ std::string role, unsigned int in_streams,
+ unsigned int out_streams, boost::shared_ptr queue);
+
+ virtual ~GpsL1CaPcpsOpenClAcquisition();
+
+ std::string role()
+ {
+ return role_;
+ }
+
+ /*!
+ * \brief Returns "GPS_L1_CA_PCPS_OpenCl_Acquisition"
+ */
+ std::string implementation()
+ {
+ return "GPS_L1_CA_PCPS_OpenCl_Acquisition";
+ }
+ size_t item_size()
+ {
+ return item_size_;
+ }
+
+ void connect(gr::top_block_sptr top_block);
+ void disconnect(gr::top_block_sptr top_block);
+ gr::basic_block_sptr get_left_block();
+ gr::basic_block_sptr get_right_block();
+
+ /*!
+ * \brief Set acquisition/tracking common Gnss_Synchro object pointer
+ * to efficiently exchange synchronization data between acquisition and
+ * tracking blocks
+ */
+ void set_gnss_synchro(Gnss_Synchro* p_gnss_synchro);
+
+ /*!
+ * \brief Set acquisition channel unique ID
+ */
+ void set_channel(unsigned int channel);
+
+ /*!
+ * \brief Set statistics threshold of PCPS algorithm
+ */
+ void set_threshold(float threshold);
+
+ /*!
+ * \brief Set maximum Doppler off grid search
+ */
+ void set_doppler_max(unsigned int doppler_max);
+
+ /*!
+ * \brief Set Doppler steps for the grid search
+ */
+ void set_doppler_step(unsigned int doppler_step);
+
+ /*!
+ * \brief Set tracking channel internal queue
+ */
+ void set_channel_queue(concurrent_queue *channel_internal_queue);
+
+ /*!
+ * \brief Initializes acquisition algorithm.
+ */
+ void init();
+
+ /*!
+ * \brief Sets local code for GPS L1/CA PCPS acquisition algorithm.
+ */
+ void set_local_code();
+
+ /*!
+ * \brief Returns the maximum peak of grid search
+ */
+ signed int mag();
+
+ /*!
+ * \brief Restart acquisition algorithm
+ */
+ void reset();
+
+private:
+ ConfigurationInterface* configuration_;
+ pcps_opencl_acquisition_cc_sptr acquisition_cc_;
+ gr::blocks::stream_to_vector::sptr stream_to_vector_;
+ size_t item_size_;
+ std::string item_type_;
+ unsigned int vector_length_;
+ unsigned int code_length_;
+ bool bit_transition_flag_;
+ unsigned int channel_;
+ float threshold_;
+ unsigned int doppler_max_;
+ unsigned int doppler_step_;
+ unsigned int shift_resolution_;
+ unsigned int sampled_ms_;
+ unsigned int max_dwells_;
+ long fs_in_;
+ long if_;
+ bool dump_;
+ std::string dump_filename_;
+ std::complex * code_;
+ Gnss_Synchro * gnss_synchro_;
+ std::string role_;
+ unsigned int in_streams_;
+ unsigned int out_streams_;
+ boost::shared_ptr queue_;
+ concurrent_queue *channel_internal_queue_;
+
+ float calculate_threshold(float pfa);
+};
+
+#endif /* GNSS_SDR_GPS_L1_CA_PCPS_OPENCL_ACQUISITION_H_ */
diff --git a/src/algorithms/acquisition/gnuradio_blocks/CMakeLists.txt b/src/algorithms/acquisition/gnuradio_blocks/CMakeLists.txt
index 6ea1114ee..113f72e7b 100644
--- a/src/algorithms/acquisition/gnuradio_blocks/CMakeLists.txt
+++ b/src/algorithms/acquisition/gnuradio_blocks/CMakeLists.txt
@@ -16,15 +16,28 @@
# along with GNSS-SDR. If not, see .
#
-set(ACQ_GR_BLOCKS_SOURCES
- pcps_acquisition_cc.cc
- pcps_multithread_acquisition_cc.cc
- pcps_assisted_acquisition_cc.cc
- pcps_acquisition_fine_doppler_cc.cc
- pcps_tong_acquisition_cc.cc
- pcps_cccwsr_acquisition_cc.cc
- galileo_pcps_8ms_acquisition_cc.cc
-)
+if(OPENCL_FOUND)
+ set(ACQ_GR_BLOCKS_SOURCES
+ pcps_acquisition_cc.cc
+ pcps_multithread_acquisition_cc.cc
+ pcps_assisted_acquisition_cc.cc
+ pcps_acquisition_fine_doppler_cc.cc
+ pcps_tong_acquisition_cc.cc
+ pcps_cccwsr_acquisition_cc.cc
+ galileo_pcps_8ms_acquisition_cc.cc
+ pcps_opencl_acquisition_cc.cc # Needs OpenCL
+ )
+else(OPENCL_FOUND)
+ set(ACQ_GR_BLOCKS_SOURCES
+ pcps_acquisition_cc.cc
+ pcps_multithread_acquisition_cc.cc
+ pcps_assisted_acquisition_cc.cc
+ pcps_acquisition_fine_doppler_cc.cc
+ pcps_tong_acquisition_cc.cc
+ pcps_cccwsr_acquisition_cc.cc
+ galileo_pcps_8ms_acquisition_cc.cc
+ )
+endif(OPENCL_FOUND)
include_directories(
$(CMAKE_CURRENT_SOURCE_DIR)
@@ -37,6 +50,11 @@ include_directories(
${GNURADIO_RUNTIME_INCLUDE_DIRS}
)
-add_library(acq_gr_blocks ${ACQ_GR_BLOCKS_SOURCES})
-target_link_libraries(acq_gr_blocks gnss_system_parameters ${GNURADIO_RUNTIME_LIBRARIES} ${GNURADIO_FFT_LIBRARIES} ${VOLK_LIBRARIES})
+if(OPENCL_FOUND)
+ include_directories( ${OPENCL_INCLUDE_DIRS} )
+ set(OPT_LIBRARIES ${OPT_LIBRARIES} ${OPENCL_LIBRARIES})
+endif(OPENCL_FOUND)
+
+add_library(acq_gr_blocks ${ACQ_GR_BLOCKS_SOURCES})
+target_link_libraries(acq_gr_blocks gnss_sp_libs gnss_system_parameters ${GNURADIO_RUNTIME_LIBRARIES} ${GNURADIO_FFT_LIBRARIES} ${VOLK_LIBRARIES} ${OPT_LIBRARIES})
diff --git a/src/algorithms/acquisition/gnuradio_blocks/galileo_pcps_8ms_acquisition_cc.cc b/src/algorithms/acquisition/gnuradio_blocks/galileo_pcps_8ms_acquisition_cc.cc
index d43abe9ff..2ac066a96 100644
--- a/src/algorithms/acquisition/gnuradio_blocks/galileo_pcps_8ms_acquisition_cc.cc
+++ b/src/algorithms/acquisition/gnuradio_blocks/galileo_pcps_8ms_acquisition_cc.cc
@@ -53,7 +53,6 @@ galileo_pcps_8ms_acquisition_cc_sptr galileo_pcps_8ms_make_acquisition_cc(
samples_per_code, queue, dump, dump_filename));
}
-
galileo_pcps_8ms_acquisition_cc::galileo_pcps_8ms_acquisition_cc(
unsigned int sampled_ms, unsigned int max_dwells,
unsigned int doppler_max, long freq, long fs_in,
@@ -84,7 +83,7 @@ galileo_pcps_8ms_acquisition_cc::galileo_pcps_8ms_acquisition_cc(
//todo: do something if posix_memalign fails
if (posix_memalign((void**)&d_fft_code_A, 16, d_fft_size * sizeof(gr_complex)) == 0){};
if (posix_memalign((void**)&d_fft_code_B, 16, d_fft_size * sizeof(gr_complex)) == 0){};
- if (posix_memalign((void**)&d_magnitude, 16, d_fft_size * sizeof(gr_complex)) == 0){};
+ if (posix_memalign((void**)&d_magnitude, 16, d_fft_size * sizeof(float)) == 0){};
// Direct FFT
d_fft_if = new gr::fft::fft_complex(d_fft_size, true);
@@ -97,18 +96,14 @@ galileo_pcps_8ms_acquisition_cc::galileo_pcps_8ms_acquisition_cc(
d_dump_filename = dump_filename;
}
-
galileo_pcps_8ms_acquisition_cc::~galileo_pcps_8ms_acquisition_cc()
{
-
- for (unsigned int doppler_index = 0; doppler_index < d_num_doppler_bins; doppler_index++)
- {
- free(d_grid_doppler_wipeoffs[doppler_index]);
- }
-
-
if (d_num_doppler_bins > 0)
{
+ for (unsigned int i = 0; i < d_num_doppler_bins; i++)
+ {
+ free(d_grid_doppler_wipeoffs[i]);
+ }
delete[] d_grid_doppler_wipeoffs;
}
@@ -125,10 +120,10 @@ galileo_pcps_8ms_acquisition_cc::~galileo_pcps_8ms_acquisition_cc()
}
}
-
void galileo_pcps_8ms_acquisition_cc::set_local_code(std::complex * code)
{
- memcpy(d_fft_if->get_inbuf(), code, sizeof(gr_complex)*d_fft_size);
+ // code A: two replicas of a primary code
+ memcpy(d_fft_if->get_inbuf(), code, sizeof(gr_complex)*d_fft_size);
d_fft_if->execute(); // We need the FFT of local code
@@ -142,7 +137,7 @@ void galileo_pcps_8ms_acquisition_cc::set_local_code(std::complex * code)
volk_32fc_conjugate_32fc_a(d_fft_code_A,d_fft_if->get_outbuf(),d_fft_size);
}
-
+ // code B: two replicas of a primary code; the second replica is inverted.
volk_32fc_s32fc_multiply_32fc_a(&(d_fft_if->get_inbuf())[d_samples_per_code],
&code[d_samples_per_code], gr_complex(-1,0),
d_samples_per_code);
@@ -160,7 +155,6 @@ void galileo_pcps_8ms_acquisition_cc::set_local_code(std::complex * code)
}
}
-
void galileo_pcps_8ms_acquisition_cc::init()
{
d_gnss_synchro->Acq_delay_samples = 0.0;
@@ -169,12 +163,16 @@ void galileo_pcps_8ms_acquisition_cc::init()
d_mag = 0.0;
d_input_power = 0.0;
- // Create the carrier Doppler wipeoff signals
- d_num_doppler_bins = 0;//floor(2*std::abs((int)d_doppler_max)/d_doppler_step);
- for (int doppler = (int)(-d_doppler_max); doppler <= (int)d_doppler_max; doppler += d_doppler_step)
+ // Count the number of bins
+ d_num_doppler_bins = 0;
+ for (int doppler = (int)(-d_doppler_max);
+ doppler <= (int)d_doppler_max;
+ doppler += d_doppler_step)
{
d_num_doppler_bins++;
}
+
+ // Create the carrier Doppler wipeoff signals
d_grid_doppler_wipeoffs = new gr_complex*[d_num_doppler_bins];
for (unsigned int doppler_index=0;doppler_indexexecute();
// Multiply carrier wiped--off, Fourier transformed incoming signal
- // with the local FFT'd code reference using SIMD operations with VOLK library
+ // with the local FFT'd code A reference using SIMD operations with
+ // VOLK library
volk_32fc_x2_multiply_32fc_a(d_ifft->get_inbuf(),
d_fft_if->get_outbuf(), d_fft_code_A, d_fft_size);
@@ -279,7 +277,8 @@ int galileo_pcps_8ms_acquisition_cc::general_work(int noutput_items,
magt_A = d_magnitude[indext_A] / (fft_normalization_factor * fft_normalization_factor);
// Multiply carrier wiped--off, Fourier transformed incoming signal
- // with the local FFT'd code reference using SIMD operations with VOLK library
+ // with the local FFT'd code B reference using SIMD operations with
+ // VOLK library
volk_32fc_x2_multiply_32fc_a(d_ifft->get_inbuf(),
d_fft_if->get_outbuf(), d_fft_code_B, d_fft_size);
@@ -293,6 +292,7 @@ int galileo_pcps_8ms_acquisition_cc::general_work(int noutput_items,
// Normalize the maximum value to correct the scale factor introduced by FFTW
magt_B = d_magnitude[indext_B] / (fft_normalization_factor * fft_normalization_factor);
+ // Take the greater magnitude
if (magt_A >= magt_B)
{
magt = magt_A;
@@ -336,12 +336,9 @@ int galileo_pcps_8ms_acquisition_cc::general_work(int noutput_items,
{
d_state = 2; // Positive acquisition
}
- else
+ else if (d_well_count == d_max_dwells)
{
- if (d_well_count == d_max_dwells)
- {
- d_state = 3; // Negative acquisition
- }
+ d_state = 3; // Negative acquisition
}
consume_each(1);
diff --git a/src/algorithms/acquisition/gnuradio_blocks/pcps_acquisition_cc.cc b/src/algorithms/acquisition/gnuradio_blocks/pcps_acquisition_cc.cc
index 596e89884..745673807 100644
--- a/src/algorithms/acquisition/gnuradio_blocks/pcps_acquisition_cc.cc
+++ b/src/algorithms/acquisition/gnuradio_blocks/pcps_acquisition_cc.cc
@@ -40,6 +40,7 @@
#include
#include
#include
+#include
using google::LogMessage;
@@ -57,7 +58,6 @@ pcps_acquisition_cc_sptr pcps_make_acquisition_cc(
samples_per_code, bit_transition_flag, queue, dump, dump_filename));
}
-
pcps_acquisition_cc::pcps_acquisition_cc(
unsigned int sampled_ms, unsigned int max_dwells,
unsigned int doppler_max, long freq, long fs_in,
@@ -102,18 +102,14 @@ pcps_acquisition_cc::pcps_acquisition_cc(
d_dump_filename = dump_filename;
}
-
pcps_acquisition_cc::~pcps_acquisition_cc()
{
-
- for (unsigned int doppler_index = 0; doppler_index < d_num_doppler_bins; doppler_index++)
- {
- free(d_grid_doppler_wipeoffs[doppler_index]);
- }
-
-
if (d_num_doppler_bins > 0)
{
+ for (unsigned int i = 0; i < d_num_doppler_bins; i++)
+ {
+ free(d_grid_doppler_wipeoffs[i]);
+ }
delete[] d_grid_doppler_wipeoffs;
}
@@ -129,7 +125,6 @@ pcps_acquisition_cc::~pcps_acquisition_cc()
}
}
-
void pcps_acquisition_cc::set_local_code(std::complex * code)
{
memcpy(d_fft_if->get_inbuf(), code, sizeof(gr_complex)*d_fft_size);
@@ -147,7 +142,6 @@ void pcps_acquisition_cc::set_local_code(std::complex * code)
}
}
-
void pcps_acquisition_cc::init()
{
d_gnss_synchro->Acq_delay_samples = 0.0;
@@ -156,12 +150,16 @@ void pcps_acquisition_cc::init()
d_mag = 0.0;
d_input_power = 0.0;
- // Create the carrier Doppler wipeoff signals
- d_num_doppler_bins = 0;//floor(2*std::abs((int)d_doppler_max)/d_doppler_step);
- for (int doppler = (int)(-d_doppler_max); doppler <= (int)d_doppler_max; doppler += d_doppler_step)
+ // Count the number of bins
+ d_num_doppler_bins = 0;
+ for (int doppler = (int)(-d_doppler_max);
+ doppler <= (int)d_doppler_max;
+ doppler += d_doppler_step)
{
d_num_doppler_bins++;
}
+
+ // Create the carrier Doppler wipeoff signals
d_grid_doppler_wipeoffs = new gr_complex*[d_num_doppler_bins];
for (unsigned int doppler_index=0;doppler_indexAcq_delay_samples = (double)(indext % d_samples_per_code);
d_gnss_synchro->Acq_doppler_hz = (double)doppler;
@@ -321,17 +313,14 @@ int pcps_acquisition_cc::general_work(int noutput_items,
{
d_state = 2; // Positive acquisition
}
- else
+ else if (d_well_count == d_max_dwells)
{
- if (d_well_count == d_max_dwells)
- {
- d_state = 3; // Negative acquisition
- }
+ d_state = 3; // Negative acquisition
}
}
else
{
- if (d_well_count == d_max_dwells)
+ if (d_well_count == d_max_dwells) // d_max_dwells = 2
{
if (d_test_statistics > d_threshold)
{
diff --git a/src/algorithms/acquisition/gnuradio_blocks/pcps_cccwsr_acquisition_cc.cc b/src/algorithms/acquisition/gnuradio_blocks/pcps_cccwsr_acquisition_cc.cc
index dd4062a79..8e9747308 100644
--- a/src/algorithms/acquisition/gnuradio_blocks/pcps_cccwsr_acquisition_cc.cc
+++ b/src/algorithms/acquisition/gnuradio_blocks/pcps_cccwsr_acquisition_cc.cc
@@ -60,7 +60,6 @@ pcps_cccwsr_acquisition_cc_sptr pcps_cccwsr_make_acquisition_cc(
samples_per_ms, samples_per_code, queue, dump, dump_filename));
}
-
pcps_cccwsr_acquisition_cc::pcps_cccwsr_acquisition_cc(
unsigned int sampled_ms, unsigned int max_dwells,
unsigned int doppler_max, long freq, long fs_in,
@@ -108,18 +107,14 @@ pcps_cccwsr_acquisition_cc::pcps_cccwsr_acquisition_cc(
d_dump_filename = dump_filename;
}
-
pcps_cccwsr_acquisition_cc::~pcps_cccwsr_acquisition_cc()
{
-
- for (unsigned int doppler_index = 0; doppler_index < d_num_doppler_bins; doppler_index++)
- {
- free(d_grid_doppler_wipeoffs[doppler_index]);
- }
-
-
if (d_num_doppler_bins > 0)
{
+ for (unsigned int i = 0; i < d_num_doppler_bins; i++)
+ {
+ free(d_grid_doppler_wipeoffs[i]);
+ }
delete[] d_grid_doppler_wipeoffs;
}
@@ -140,10 +135,10 @@ pcps_cccwsr_acquisition_cc::~pcps_cccwsr_acquisition_cc()
}
}
-
void pcps_cccwsr_acquisition_cc::set_local_code(std::complex * code_data,
std::complex * code_pilot)
{
+ // Data code (E1B)
memcpy(d_fft_if->get_inbuf(), code_data, sizeof(gr_complex)*d_fft_size);
d_fft_if->execute(); // We need the FFT of local code
@@ -158,6 +153,7 @@ void pcps_cccwsr_acquisition_cc::set_local_code(std::complex * code_data,
volk_32fc_conjugate_32fc_a(d_fft_code_data,d_fft_if->get_outbuf(),d_fft_size);
}
+ // Pilot code (E1C)
memcpy(d_fft_if->get_inbuf(), code_pilot, sizeof(gr_complex)*d_fft_size);
d_fft_if->execute(); // We need the FFT of local code
@@ -173,7 +169,6 @@ void pcps_cccwsr_acquisition_cc::set_local_code(std::complex * code_data,
}
}
-
void pcps_cccwsr_acquisition_cc::init()
{
d_gnss_synchro->Acq_delay_samples = 0.0;
@@ -182,12 +177,16 @@ void pcps_cccwsr_acquisition_cc::init()
d_mag = 0.0;
d_input_power = 0.0;
- // Create the carrier Doppler wipeoff signals
- d_num_doppler_bins = 0;//floor(2*std::abs((int)d_doppler_max)/d_doppler_step);
- for (int doppler = (int)(-d_doppler_max); doppler <= (int)d_doppler_max; doppler += d_doppler_step)
+ // Count the number of bins
+ d_num_doppler_bins = 0;
+ for (int doppler = (int)(-d_doppler_max);
+ doppler <= (int)d_doppler_max;
+ doppler += d_doppler_step)
{
d_num_doppler_bins++;
}
+
+ // Create the carrier Doppler wipeoff signals
d_grid_doppler_wipeoffs = new gr_complex*[d_num_doppler_bins];
for (unsigned int doppler_index=0;doppler_indexexecute();
// Multiply carrier wiped--off, Fourier transformed incoming signal
- // with the local FFT'd code reference {data+j*pilot} using SIMD operations with VOLK library
+ // with the local FFT'd data code reference (E1B) using SIMD operations
+ // with VOLK library
volk_32fc_x2_multiply_32fc_a(d_ifft->get_inbuf(),
d_fft_if->get_outbuf(), d_fft_code_data, d_fft_size);
// compute the inverse FFT
d_ifft->execute();
+ // Copy the result of the correlation between wiped--off signal and data code in
+ // d_data_correlation.
memcpy(d_data_correlation, d_ifft->get_outbuf(), sizeof(gr_complex)*d_fft_size);
-
+ // Multiply carrier wiped--off, Fourier transformed incoming signal
+ // with the local FFT'd pilot code reference (E1C) using SIMD operations
+ // with VOLK library
volk_32fc_x2_multiply_32fc_a(d_ifft->get_inbuf(),
d_fft_if->get_outbuf(), d_fft_code_pilot, d_fft_size);
+ // Compute the inverse FFT
d_ifft->execute();
+ // Copy the result of the correlation between wiped--off signal and pilot code in
+ // d_data_correlation.
memcpy(d_pilot_correlation, d_ifft->get_outbuf(), sizeof(gr_complex)*d_fft_size);
for (unsigned int i = 0; i < d_fft_size; i++)
@@ -354,14 +360,13 @@ int pcps_cccwsr_acquisition_cc::general_work(int noutput_items,
{
d_state = 2; // Positive acquisition
}
- else
+ else if (d_well_count == d_max_dwells)
{
- if (d_well_count == d_max_dwells)
- {
- d_state = 3; // Negative acquisition
- }
+ d_state = 3; // Negative acquisition
}
+ consume_each(1);
+
break;
}
diff --git a/src/algorithms/acquisition/gnuradio_blocks/pcps_multithread_acquisition_cc.cc b/src/algorithms/acquisition/gnuradio_blocks/pcps_multithread_acquisition_cc.cc
index 79f746b55..430bca07b 100644
--- a/src/algorithms/acquisition/gnuradio_blocks/pcps_multithread_acquisition_cc.cc
+++ b/src/algorithms/acquisition/gnuradio_blocks/pcps_multithread_acquisition_cc.cc
@@ -57,7 +57,6 @@ pcps_multithread_acquisition_cc_sptr pcps_make_multithread_acquisition_cc(
samples_per_code, bit_transition_flag, queue, dump, dump_filename));
}
-
pcps_multithread_acquisition_cc::pcps_multithread_acquisition_cc(
unsigned int sampled_ms, unsigned int max_dwells,
unsigned int doppler_max, long freq, long fs_in,
@@ -72,6 +71,7 @@ pcps_multithread_acquisition_cc::pcps_multithread_acquisition_cc(
d_sample_counter = 0; // SAMPLE COUNTER
d_active = false;
d_state = 0;
+ d_core_working = false;
d_queue = queue;
d_freq = freq;
d_fs_in = fs_in;
@@ -86,10 +86,19 @@ pcps_multithread_acquisition_cc::pcps_multithread_acquisition_cc(
d_input_power = 0.0;
d_num_doppler_bins = 0;
d_bit_transition_flag = bit_transition_flag;
+ d_in_dwell_count = 0;
+
+ d_in_buffer = new gr_complex*[d_max_dwells];
//todo: do something if posix_memalign fails
+ for (unsigned int i = 0; i < d_max_dwells; i++)
+ {
+ if (posix_memalign((void**)&d_in_buffer[i], 16,
+ d_fft_size * sizeof(gr_complex)) == 0){};
+
+ }
if (posix_memalign((void**)&d_fft_codes, 16, d_fft_size * sizeof(gr_complex)) == 0){};
- if (posix_memalign((void**)&d_magnitude, 16, d_fft_size * sizeof(gr_complex)) == 0){};
+ if (posix_memalign((void**)&d_magnitude, 16, d_fft_size * sizeof(float)) == 0){};
// Direct FFT
d_fft_if = new gr::fft::fft_complex(d_fft_size, true);
@@ -102,21 +111,23 @@ pcps_multithread_acquisition_cc::pcps_multithread_acquisition_cc(
d_dump_filename = dump_filename;
}
-
pcps_multithread_acquisition_cc::~pcps_multithread_acquisition_cc()
{
-
- for (unsigned int doppler_index = 0; doppler_index < d_num_doppler_bins; doppler_index++)
- {
- free(d_grid_doppler_wipeoffs[doppler_index]);
- }
-
-
if (d_num_doppler_bins > 0)
{
+ for (unsigned int i = 0; i < d_num_doppler_bins; i++)
+ {
+ free(d_grid_doppler_wipeoffs[i]);
+ }
delete[] d_grid_doppler_wipeoffs;
}
+ for (unsigned int i = 0; i < d_max_dwells; i++)
+ {
+ free(d_in_buffer[i]);
+ }
+ delete[] d_in_buffer;
+
free(d_fft_codes);
free(d_magnitude);
@@ -129,6 +140,35 @@ pcps_multithread_acquisition_cc::~pcps_multithread_acquisition_cc()
}
}
+void pcps_multithread_acquisition_cc::init()
+{
+ d_gnss_synchro->Acq_delay_samples = 0.0;
+ d_gnss_synchro->Acq_doppler_hz = 0.0;
+ d_gnss_synchro->Acq_samplestamp_samples = 0;
+ d_mag = 0.0;
+ d_input_power = 0.0;
+
+ // Count the number of bins
+ d_num_doppler_bins = 0;
+ for (int doppler = (int)(-d_doppler_max);
+ doppler <= (int)d_doppler_max;
+ doppler += d_doppler_step)
+ {
+ d_num_doppler_bins++;
+ }
+
+ // Create the carrier Doppler wipeoff signals
+ d_grid_doppler_wipeoffs = new gr_complex*[d_num_doppler_bins];
+ for (unsigned int doppler_index=0;doppler_index * code)
{
@@ -147,13 +187,16 @@ void pcps_multithread_acquisition_cc::set_local_code(std::complex * code)
}
}
-void pcps_multithread_acquisition_cc::perform_acquisition(const gr_complex* in, unsigned int samplestamp)
+void pcps_multithread_acquisition_cc::acquisition_core()
{
// initialize acquisition algorithm
int doppler;
unsigned int indext = 0;
float magt = 0.0;
float fft_normalization_factor = (float)d_fft_size * (float)d_fft_size;
+ gr_complex* in = d_in_buffer[d_well_count];
+ unsigned long int samplestamp = d_sample_counter_buffer[d_well_count];
+
d_input_power = 0.0;
d_mag = 0.0;
@@ -204,7 +247,14 @@ void pcps_multithread_acquisition_cc::perform_acquisition(const gr_complex* in,
{
d_mag = magt;
- if (d_test_statistics < (magt / d_input_power) || !d_bit_transition_flag)
+ // In case that d_bit_transition_flag = true, we compare the potentially
+ // new maximum test statistics (d_mag/d_input_power) with the value in
+ // d_test_statistics. When the second dwell is being processed, the value
+ // of d_mag/d_input_power could be lower than d_test_statistics (i.e,
+ // the maximum test statistics in the previous dwell is greater than
+ // current d_mag/d_input_power). Note that d_test_statistics is not
+ // restarted between consecutive dwells in multidwell operation.
+ if (d_test_statistics < (d_mag / d_input_power) || !d_bit_transition_flag)
{
d_gnss_synchro->Acq_delay_samples = (double)(indext % d_samples_per_code);
d_gnss_synchro->Acq_doppler_hz = (double)doppler;
@@ -235,68 +285,31 @@ void pcps_multithread_acquisition_cc::perform_acquisition(const gr_complex* in,
{
if (d_test_statistics > d_threshold)
{
- d_state = 3; // Positive acquisition
+ d_state = 2; // Positive acquisition
}
- else
+ else if (d_well_count == d_max_dwells)
{
- if (d_well_count == d_max_dwells)
- {
- d_state = 4; // Negative acquisition
- }
- else
- {
- d_state = 1; // Process next block
- }
+ d_state = 3; // Negative acquisition
}
}
else
{
- if (d_well_count == d_max_dwells)
+ if (d_well_count == d_max_dwells) // d_max_dwells = 2
{
if (d_test_statistics > d_threshold)
{
- d_state = 3; // Positive acquisition
+ d_state = 2; // Positive acquisition
}
else
{
- d_state = 4; // Negative acquisition
+ d_state = 3; // Negative acquisition
}
}
- else
- {
- d_state = 1; // Process next block
- }
}
+
+ d_core_working = false;
}
-
-void pcps_multithread_acquisition_cc::init()
-{
- d_gnss_synchro->Acq_delay_samples = 0.0;
- d_gnss_synchro->Acq_doppler_hz = 0.0;
- d_gnss_synchro->Acq_samplestamp_samples = 0;
- d_mag = 0.0;
- d_input_power = 0.0;
-
- // Create the carrier Doppler wipeoff signals
- d_num_doppler_bins = 0;//floor(2*std::abs((int)d_doppler_max)/d_doppler_step);
- for (int doppler = (int)(-d_doppler_max); doppler <= (int)d_doppler_max; doppler += d_doppler_step)
- {
- d_num_doppler_bins++;
- }
- d_grid_doppler_wipeoffs = new gr_complex*[d_num_doppler_bins];
- for (unsigned int doppler_index=0;doppler_index (int)num_dwells)
+ {
+ d_sample_counter += d_fft_size * (ninput_items[0]-num_dwells);
+ }
+ }
+ else
+ {
+ // We already have d_max_dwells consecutive blocks in the internal buffer,
+ // just skip input blocks.
+ d_sample_counter += d_fft_size * ninput_items[0];
+ }
+
+ // We create a new thread to process next block if the following
+ // conditions are fulfilled:
+ // 1. There are new blocks in d_in_buffer that have not been processed yet
+ // (d_well_count < d_in_dwell_count).
+ // 2. No other acquisition_core thead is working (!d_core_working).
+ // 3. d_state==1. We need to check again d_state because it can be modified at any
+ // moment by the external thread (may have changed since checked in the switch()).
+ // If the external thread has already declared positive (d_state=2) or negative
+ // (d_state=3) acquisition, we don't have to process next block!!
+ if ((d_well_count < d_in_dwell_count) && !d_core_working && d_state==1)
+ {
+ d_core_working = true;
+ boost::thread(&pcps_multithread_acquisition_cc::acquisition_core, this);
+ }
break;
}
case 2:
{
- d_sample_counter += d_fft_size * ninput_items[0]; // sample counter
- consume_each(ninput_items[0]);
- break;
- }
- case 3:
- {
-
// Declare positive acquisition using a message queue
DLOG(INFO) << "positive acquisition";
DLOG(INFO) << "satellite " << d_gnss_synchro->System << " " << d_gnss_synchro->PRN;
@@ -363,7 +405,6 @@ int pcps_multithread_acquisition_cc::general_work(int noutput_items,
d_state = 0;
d_sample_counter += d_fft_size * ninput_items[0]; // sample counter
- consume_each(ninput_items[0]);
acquisition_message = 1;
d_channel_internal_queue->push(acquisition_message);
@@ -371,7 +412,7 @@ int pcps_multithread_acquisition_cc::general_work(int noutput_items,
break;
}
- case 4:
+ case 3:
{
// Declare negative acquisition using a message queue
DLOG(INFO) << "negative acquisition";
@@ -388,7 +429,6 @@ int pcps_multithread_acquisition_cc::general_work(int noutput_items,
d_state = 0;
d_sample_counter += d_fft_size * ninput_items[0]; // sample counter
- consume_each(ninput_items[0]);
acquisition_message = 2;
d_channel_internal_queue->push(acquisition_message);
@@ -397,5 +437,7 @@ int pcps_multithread_acquisition_cc::general_work(int noutput_items,
}
}
+ consume_each(ninput_items[0]);
+
return 0;
}
diff --git a/src/algorithms/acquisition/gnuradio_blocks/pcps_multithread_acquisition_cc.h b/src/algorithms/acquisition/gnuradio_blocks/pcps_multithread_acquisition_cc.h
index d9a5ba8be..65df7ab0e 100644
--- a/src/algorithms/acquisition/gnuradio_blocks/pcps_multithread_acquisition_cc.h
+++ b/src/algorithms/acquisition/gnuradio_blocks/pcps_multithread_acquisition_cc.h
@@ -134,9 +134,13 @@ private:
std::ofstream d_dump_file;
bool d_active;
int d_state;
+ bool d_core_working;
bool d_dump;
unsigned int d_channel;
std::string d_dump_filename;
+ gr_complex** d_in_buffer;
+ std::vector d_sample_counter_buffer;
+ unsigned int d_in_dwell_count;
public:
/*!
@@ -237,7 +241,7 @@ public:
gr_vector_const_void_star &input_items,
gr_vector_void_star &output_items);
- void perform_acquisition(const gr_complex* in, const unsigned int samplestamp);
+ void acquisition_core();
};
#endif /* GNSS_SDR_PCPS_MULTITHREAD_ACQUISITION_CC_H_*/
diff --git a/src/algorithms/acquisition/gnuradio_blocks/pcps_opencl_acquisition_cc.cc b/src/algorithms/acquisition/gnuradio_blocks/pcps_opencl_acquisition_cc.cc
new file mode 100644
index 000000000..aa74ce5c0
--- /dev/null
+++ b/src/algorithms/acquisition/gnuradio_blocks/pcps_opencl_acquisition_cc.cc
@@ -0,0 +1,809 @@
+/*!
+ * \file pcps_opencl_acquisition_cc.cc
+ * \brief This class implements a Parallel Code Phase Search Acquisition
+ * using OpenCL to offload some functions to the GPU.
+ *
+ * Acquisition strategy (Kay Borre book + CFAR threshold).
+ *
+ * - Compute the input signal power estimation
+ *
- Doppler serial search loop
+ *
- Perform the FFT-based circular convolution (parallel time search)
+ *
- Record the maximum peak and the associated synchronization parameters
+ *
- Compute the test statistics and compare to the threshold
+ *
- Declare positive or negative acquisition using a message queue
+ *
+ *
+ * Kay Borre book: K.Borre, D.M.Akos, N.Bertelsen, P.Rinder, and S.H.Jensen,
+ * "A Software-Defined GPS and Galileo Receiver. A Single-Frequency
+ * Approach", Birkha user, 2007. pp 81-84
+ *
+ * \authors
+ * - Javier Arribas, 2011. jarribas(at)cttc.es
+ *
- Luis Esteve, 2012. luis(at)epsilon-formacion.com
+ *
- Marc Molina, 2013. marc.molina.pena@gmail.com
+ *
+ *
+ * -------------------------------------------------------------------------
+ *
+ * Copyright (C) 2010-2012 (see AUTHORS file for a list of contributors)
+ *
+ * GNSS-SDR is a software defined Global Navigation
+ * Satellite Systems receiver
+ *
+ * This file is part of GNSS-SDR.
+ *
+ * GNSS-SDR is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * at your option) any later version.
+ *
+ * GNSS-SDR is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNSS-SDR. If not, see .
+ *
+ * -------------------------------------------------------------------------
+ */
+
+#include "pcps_opencl_acquisition_cc.h"
+#include "gnss_signal_processing.h"
+#include "control_message_factory.h"
+#include "fft_base_kernels.h"
+#include "fft_internal.h"
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+using google::LogMessage;
+
+pcps_opencl_acquisition_cc_sptr pcps_make_opencl_acquisition_cc(
+ unsigned int sampled_ms, unsigned int max_dwells,
+ unsigned int doppler_max, long freq, long fs_in,
+ int samples_per_ms, int samples_per_code,
+ bool bit_transition_flag,
+ gr::msg_queue::sptr queue, bool dump,
+ std::string dump_filename)
+{
+
+ return pcps_opencl_acquisition_cc_sptr(
+ new pcps_opencl_acquisition_cc(sampled_ms, max_dwells, doppler_max, freq, fs_in, samples_per_ms,
+ samples_per_code, bit_transition_flag, queue, dump, dump_filename));
+}
+
+pcps_opencl_acquisition_cc::pcps_opencl_acquisition_cc(
+ unsigned int sampled_ms, unsigned int max_dwells,
+ unsigned int doppler_max, long freq, long fs_in,
+ int samples_per_ms, int samples_per_code,
+ bool bit_transition_flag,
+ gr::msg_queue::sptr queue, bool dump,
+ std::string dump_filename) :
+ gr::block("pcps_opencl_acquisition_cc",
+ gr::io_signature::make(1, 1, sizeof(gr_complex) * sampled_ms * samples_per_ms),
+ gr::io_signature::make(0, 0, sizeof(gr_complex) * sampled_ms * samples_per_ms))
+{
+ d_sample_counter = 0; // SAMPLE COUNTER
+ d_active = false;
+ d_state = 0;
+ d_core_working = false;
+ d_queue = queue;
+ d_freq = freq;
+ d_fs_in = fs_in;
+ d_samples_per_ms = samples_per_ms;
+ d_samples_per_code = samples_per_code;
+ d_sampled_ms = sampled_ms;
+ d_max_dwells = max_dwells;
+ d_well_count = 0;
+ d_doppler_max = doppler_max;
+ d_fft_size = d_sampled_ms * d_samples_per_ms;
+ d_fft_size_pow2 = pow(2,ceil(log2(2*d_fft_size)));
+ d_mag = 0;
+ d_input_power = 0.0;
+ d_num_doppler_bins = 0;
+ d_bit_transition_flag = bit_transition_flag;
+ d_in_dwell_count = 0;
+ d_cl_fft_batch_size = 1;
+
+ d_in_buffer = new gr_complex*[d_max_dwells];
+
+ //todo: do something if posix_memalign fails
+ for (unsigned int i = 0; i < d_max_dwells; i++)
+ {
+ if (posix_memalign((void**)&d_in_buffer[i], 16,
+ d_fft_size * sizeof(gr_complex)) == 0){};
+
+ }
+ if (posix_memalign((void**)&d_magnitude, 16, d_fft_size * sizeof(float)) == 0){};
+ if (posix_memalign((void**)&d_fft_codes, 16, d_fft_size_pow2 * sizeof(gr_complex)) == 0){};
+ if (posix_memalign((void**)&d_zero_vector, 16, (d_fft_size_pow2-d_fft_size) * sizeof(gr_complex)) == 0){};
+
+ for (unsigned int i = 0; i < (d_fft_size_pow2-d_fft_size); i++)
+ {
+ d_zero_vector[i] = gr_complex(0.0,0.0);
+ }
+
+ d_opencl = init_opencl_environment("math_kernel.cl");
+
+ if (d_opencl != 0)
+ {
+ // Direct FFT
+ d_fft_if = new gr::fft::fft_complex(d_fft_size, true);
+
+ // Inverse FFT
+ d_ifft = new gr::fft::fft_complex(d_fft_size, false);
+ }
+
+ // For dumping samples into a file
+ d_dump = dump;
+ d_dump_filename = dump_filename;
+
+}
+
+pcps_opencl_acquisition_cc::~pcps_opencl_acquisition_cc()
+{
+ if (d_num_doppler_bins > 0)
+ {
+ for (unsigned int i = 0; i < d_num_doppler_bins; i++)
+ {
+ free(d_grid_doppler_wipeoffs[i]);
+ }
+ delete[] d_grid_doppler_wipeoffs;
+ }
+
+ for (unsigned int i = 0; i < d_max_dwells; i++)
+ {
+ free(d_in_buffer[i]);
+ }
+ delete[] d_in_buffer;
+
+ free(d_fft_codes);
+ free(d_magnitude);
+ free(d_zero_vector);
+
+ if (d_opencl == 0)
+ {
+ delete d_cl_queue;
+ delete d_cl_buffer_in;
+ delete d_cl_buffer_1;
+ delete d_cl_buffer_2;
+ delete d_cl_buffer_magnitude;
+ delete d_cl_buffer_fft_codes;
+ if(d_num_doppler_bins > 0)
+ {
+ delete[] d_cl_buffer_grid_doppler_wipeoffs;
+ }
+
+ clFFT_DestroyPlan(d_cl_fft_plan);
+ }
+ else
+ {
+ delete d_ifft;
+ delete d_fft_if;
+ }
+
+ if (d_dump)
+ {
+ d_dump_file.close();
+ }
+}
+
+int pcps_opencl_acquisition_cc::init_opencl_environment(std::string kernel_filename)
+{
+ //get all platforms (drivers)
+ std::vector all_platforms;
+ cl::Platform::get(&all_platforms);
+
+ if(all_platforms.size()==0)
+ {
+ std::cout << "No OpenCL platforms found. Check OpenCL installation!" << std::endl;
+ return 1;
+ }
+
+ d_cl_platform = all_platforms[0]; //get default platform
+ std::cout << "Using platform: " << d_cl_platform.getInfo()
+ << std::endl;
+
+ //get default GPU device of the default platform
+ std::vector gpu_devices;
+ d_cl_platform.getDevices(CL_DEVICE_TYPE_GPU, &gpu_devices);
+
+ if(gpu_devices.size()==0)
+ {
+ std::cout << "No GPU devices found. Check OpenCL installation!" << std::endl;
+ return 2;
+ }
+
+ d_cl_device = gpu_devices[0];
+
+ std::vector device;
+ device.push_back(d_cl_device);
+ std::cout << "Using device: " << d_cl_device.getInfo() << std::endl;
+
+ cl::Context context(device);
+ d_cl_context = context;
+
+ // build the program from the source in the file
+ std::ifstream kernel_file(kernel_filename, std::ifstream::in);
+ std::string kernel_code(std::istreambuf_iterator(kernel_file),
+ (std::istreambuf_iterator()));
+ kernel_file.close();
+
+// std::cout << "Kernel code: \n" << kernel_code << std::endl;
+
+ cl::Program::Sources sources;
+
+ sources.push_back({kernel_code.c_str(),kernel_code.length()});
+
+ cl::Program program(context,sources);
+ if(program.build(device)!=CL_SUCCESS)
+ {
+ std::cout << " Error building: "
+ << program.getBuildInfo(device[0])
+ << std::endl;
+ return 3;
+ }
+ d_cl_program = program;
+
+ // create buffers on the device
+ d_cl_buffer_in = new cl::Buffer(d_cl_context,CL_MEM_READ_WRITE,sizeof(gr_complex)*d_fft_size);
+ d_cl_buffer_fft_codes = new cl::Buffer(d_cl_context,CL_MEM_READ_WRITE,sizeof(gr_complex)*d_fft_size_pow2);
+ d_cl_buffer_1 = new cl::Buffer(d_cl_context,CL_MEM_READ_WRITE,sizeof(gr_complex)*d_fft_size_pow2);
+ d_cl_buffer_2 = new cl::Buffer(d_cl_context,CL_MEM_READ_WRITE,sizeof(gr_complex)*d_fft_size_pow2);
+ d_cl_buffer_magnitude = new cl::Buffer(d_cl_context,CL_MEM_READ_WRITE,sizeof(float)*d_fft_size);
+
+ //create queue to which we will push commands for the device.
+ d_cl_queue = new cl::CommandQueue(d_cl_context,d_cl_device);
+
+ //create FFT plan
+ cl_int err;
+ clFFT_Dim3 dim = {d_fft_size_pow2, 1, 1};
+
+ d_cl_fft_plan = clFFT_CreatePlan(d_cl_context(), dim, clFFT_1D,
+ clFFT_InterleavedComplexFormat, &err);
+
+ if (err != 0)
+ {
+ delete d_cl_queue;
+ delete d_cl_buffer_in;
+ delete d_cl_buffer_1;
+ delete d_cl_buffer_2;
+ delete d_cl_buffer_magnitude;
+ delete d_cl_buffer_fft_codes;
+
+ std::cout << "Error creating OpenCL FFT plan." << std::endl;
+ return 4;
+ }
+
+ return 0;
+}
+
+void pcps_opencl_acquisition_cc::init()
+{
+ d_gnss_synchro->Acq_delay_samples = 0.0;
+ d_gnss_synchro->Acq_doppler_hz = 0.0;
+ d_gnss_synchro->Acq_samplestamp_samples = 0;
+ d_mag = 0.0;
+ d_input_power = 0.0;
+
+ // Count the number of bins
+ d_num_doppler_bins = 0;
+ for (int doppler = (int)(-d_doppler_max);
+ doppler <= (int)d_doppler_max;
+ doppler += d_doppler_step)
+ {
+ d_num_doppler_bins++;
+ }
+
+ // Create the carrier Doppler wipeoff signals
+ d_grid_doppler_wipeoffs = new gr_complex*[d_num_doppler_bins];
+ if (d_opencl == 0)
+ {
+ d_cl_buffer_grid_doppler_wipeoffs = new cl::Buffer*[d_num_doppler_bins];
+ }
+
+ for (unsigned int doppler_index=0;doppler_indexenqueueWriteBuffer(*(d_cl_buffer_grid_doppler_wipeoffs[doppler_index]),
+ CL_TRUE,0,sizeof(gr_complex)*d_fft_size,
+ d_grid_doppler_wipeoffs[doppler_index]);
+ }
+ }
+
+ // zero padding in buffer_1 (FFT input)
+ if (d_opencl == 0)
+ {
+ d_cl_queue->enqueueWriteBuffer(*d_cl_buffer_1,CL_TRUE,sizeof(gr_complex)*d_fft_size,
+ sizeof(gr_complex)*(d_fft_size_pow2-d_fft_size),d_zero_vector);
+ }
+}
+
+void pcps_opencl_acquisition_cc::set_local_code(std::complex * code)
+{
+ if(d_opencl == 0)
+ {
+ d_cl_queue->enqueueWriteBuffer(*d_cl_buffer_2,CL_TRUE,0,
+ sizeof(gr_complex)*d_fft_size, code);
+
+ d_cl_queue->enqueueWriteBuffer(*d_cl_buffer_2,CL_TRUE,sizeof(gr_complex)*d_fft_size,
+ sizeof(gr_complex)*(d_fft_size_pow2 - 2*d_fft_size),
+ d_zero_vector);
+
+ d_cl_queue->enqueueWriteBuffer(*d_cl_buffer_2,CL_TRUE,sizeof(gr_complex)
+ *(d_fft_size_pow2 - d_fft_size),
+ sizeof(gr_complex)*d_fft_size, code);
+
+ clFFT_ExecuteInterleaved((*d_cl_queue)(), d_cl_fft_plan, d_cl_fft_batch_size,
+ clFFT_Forward, (*d_cl_buffer_2)(), (*d_cl_buffer_2)(),
+ 0, NULL, NULL);
+
+ //Conjucate the local code
+ cl::Kernel kernel=cl::Kernel(d_cl_program,"conj_vector");
+ kernel.setArg(0,*d_cl_buffer_2); //input
+ kernel.setArg(1,*d_cl_buffer_fft_codes); //output
+ d_cl_queue->enqueueNDRangeKernel(kernel,cl::NullRange,cl::NDRange(d_fft_size_pow2),cl::NullRange);
+ }
+ else
+ {
+ memcpy(d_fft_if->get_inbuf(), code, sizeof(gr_complex)*d_fft_size);
+
+ d_fft_if->execute(); // We need the FFT of local code
+
+ //Conjugate the local code
+ if (is_unaligned())
+ {
+ volk_32fc_conjugate_32fc_u(d_fft_codes,d_fft_if->get_outbuf(),d_fft_size);
+ }
+ else
+ {
+ volk_32fc_conjugate_32fc_a(d_fft_codes,d_fft_if->get_outbuf(),d_fft_size);
+ }
+ }
+}
+
+void pcps_opencl_acquisition_cc::acquisition_core_volk()
+{
+ // initialize acquisition algorithm
+ int doppler;
+ unsigned int indext = 0;
+ float magt = 0.0;
+ float fft_normalization_factor = (float)d_fft_size * (float)d_fft_size;
+ gr_complex* in = d_in_buffer[d_well_count];
+ unsigned long int samplestamp = d_sample_counter_buffer[d_well_count];
+
+ d_input_power = 0.0;
+ d_mag = 0.0;
+
+ d_well_count++;
+
+ DLOG(INFO) << "Channel: " << d_channel
+ << " , doing acquisition of satellite: " << d_gnss_synchro->System << " "<< d_gnss_synchro->PRN
+ << " ,sample stamp: " << d_sample_counter << ", threshold: "
+ << d_threshold << ", doppler_max: " << d_doppler_max
+ << ", doppler_step: " << d_doppler_step;
+
+ // 1- Compute the input signal power estimation
+ volk_32fc_magnitude_squared_32f_a(d_magnitude, in, d_fft_size);
+ volk_32f_accumulator_s32f_a(&d_input_power, d_magnitude, d_fft_size);
+ d_input_power /= (float)d_fft_size;
+
+ // 2- Doppler frequency search loop
+ for (unsigned int doppler_index=0;doppler_indexget_inbuf(), in,
+ d_grid_doppler_wipeoffs[doppler_index], d_fft_size);
+
+ // 3- Perform the FFT-based convolution (parallel time search)
+ // Compute the FFT of the carrier wiped--off incoming signal
+ d_fft_if->execute();
+
+ // Multiply carrier wiped--off, Fourier transformed incoming signal
+ // with the local FFT'd code reference using SIMD operations with VOLK library
+ volk_32fc_x2_multiply_32fc_a(d_ifft->get_inbuf(),
+ d_fft_if->get_outbuf(), d_fft_codes, d_fft_size);
+
+ // compute the inverse FFT
+ d_ifft->execute();
+
+ // Search maximum
+ volk_32fc_magnitude_squared_32f_a(d_magnitude, d_ifft->get_outbuf(), d_fft_size);
+ volk_32f_index_max_16u_a(&indext, d_magnitude, d_fft_size);
+
+ // Normalize the maximum value to correct the scale factor introduced by FFTW
+ magt = d_magnitude[indext] / (fft_normalization_factor * fft_normalization_factor);
+
+ // 4- record the maximum peak and the associated synchronization parameters
+ if (d_mag < magt)
+ {
+ d_mag = magt;
+
+ // In case that d_bit_transition_flag = true, we compare the potentially
+ // new maximum test statistics (d_mag/d_input_power) with the value in
+ // d_test_statistics. When the second dwell is being processed, the value
+ // of d_mag/d_input_power could be lower than d_test_statistics (i.e,
+ // the maximum test statistics in the previous dwell is greater than
+ // current d_mag/d_input_power). Note that d_test_statistics is not
+ // restarted between consecutive dwells in multidwell operation.
+ if (d_test_statistics < (d_mag / d_input_power) || !d_bit_transition_flag)
+ {
+ d_gnss_synchro->Acq_delay_samples = (double)(indext % d_samples_per_code);
+ d_gnss_synchro->Acq_doppler_hz = (double)doppler;
+ d_gnss_synchro->Acq_samplestamp_samples = samplestamp;
+
+ // 5- Compute the test statistics and compare to the threshold
+ //d_test_statistics = 2 * d_fft_size * d_mag / d_input_power;
+ d_test_statistics = d_mag / d_input_power;
+ }
+ }
+
+ // Record results to file if required
+ if (d_dump)
+ {
+ std::stringstream filename;
+ std::streamsize n = 2 * sizeof(float) * (d_fft_size); // complex file write
+ filename.str("");
+ filename << "../data/test_statistics_" << d_gnss_synchro->System
+ <<"_" << d_gnss_synchro->Signal << "_sat_"
+ << d_gnss_synchro->PRN << "_doppler_" << doppler << ".dat";
+ d_dump_file.open(filename.str().c_str(), std::ios::out | std::ios::binary);
+ d_dump_file.write((char*)d_ifft->get_outbuf(), n); //write directly |abs(x)|^2 in this Doppler bin?
+ d_dump_file.close();
+ }
+ }
+
+ if (!d_bit_transition_flag)
+ {
+ if (d_test_statistics > d_threshold)
+ {
+ d_state = 2; // Positive acquisition
+ }
+ else if (d_well_count == d_max_dwells)
+ {
+ d_state = 3; // Negative acquisition
+ }
+ }
+ else
+ {
+ if (d_well_count == d_max_dwells) // d_max_dwells = 2
+ {
+ if (d_test_statistics > d_threshold)
+ {
+ d_state = 2; // Positive acquisition
+ }
+ else
+ {
+ d_state = 3; // Negative acquisition
+ }
+ }
+ }
+
+ d_core_working = false;
+}
+
+void pcps_opencl_acquisition_cc::acquisition_core_opencl()
+{
+ // initialize acquisition algorithm
+ int doppler;
+ unsigned int indext = 0;
+ float magt = 0.0;
+ float fft_normalization_factor = ((float)d_fft_size_pow2 * (float)d_fft_size); //This works, but I am not sure why.
+ gr_complex* in = d_in_buffer[d_well_count];
+ unsigned long int samplestamp = d_sample_counter_buffer[d_well_count];
+
+ d_input_power = 0.0;
+ d_mag = 0.0;
+
+ // write input vector in buffer of OpenCL device
+ d_cl_queue->enqueueWriteBuffer(*d_cl_buffer_in,CL_TRUE,0,sizeof(gr_complex)*d_fft_size,in);
+
+ d_well_count++;
+
+// struct timeval tv;
+// long long int begin = 0;
+// long long int end = 0;
+
+// gettimeofday(&tv, NULL);
+// begin = tv.tv_sec *1e6 + tv.tv_usec;
+
+ DLOG(INFO) << "Channel: " << d_channel
+ << " , doing acquisition of satellite: " << d_gnss_synchro->System << " "<< d_gnss_synchro->PRN
+ << " ,sample stamp: " << d_sample_counter << ", threshold: "
+ << d_threshold << ", doppler_max: " << d_doppler_max
+ << ", doppler_step: " << d_doppler_step;
+
+ // 1- Compute the input signal power estimation
+ volk_32fc_magnitude_squared_32f_a(d_magnitude, in, d_fft_size);
+ volk_32f_accumulator_s32f_a(&d_input_power, d_magnitude, d_fft_size);
+ d_input_power /= (float)d_fft_size;
+
+ cl::Kernel kernel;
+
+ // 2- Doppler frequency search loop
+ for (unsigned int doppler_index=0;doppler_indexenqueueNDRangeKernel(kernel,cl::NullRange, cl::NDRange(d_fft_size),
+ cl::NullRange);
+
+ // In the previous operation, we store the result in the first d_fft_size positions
+ // of d_cl_buffer_1. The rest d_fft_size_pow2-d_fft_size already have zeros
+ // (zero-padding is made in init() for optimization purposes).
+
+ clFFT_ExecuteInterleaved((*d_cl_queue)(), d_cl_fft_plan, d_cl_fft_batch_size,
+ clFFT_Forward,(*d_cl_buffer_1)(), (*d_cl_buffer_2)(),
+ 0, NULL, NULL);
+
+ // Multiply carrier wiped--off, Fourier transformed incoming signal
+ // with the local FFT'd code reference
+ kernel = cl::Kernel(d_cl_program,"mult_vectors");
+ kernel.setArg(0,*d_cl_buffer_2); //input 1
+ kernel.setArg(1,*d_cl_buffer_fft_codes); //input 2
+ kernel.setArg(2,*d_cl_buffer_2); //output
+ d_cl_queue->enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(d_fft_size_pow2),
+ cl::NullRange);
+
+ // compute the inverse FFT
+ clFFT_ExecuteInterleaved((*d_cl_queue)(), d_cl_fft_plan, d_cl_fft_batch_size,
+ clFFT_Inverse, (*d_cl_buffer_2)(), (*d_cl_buffer_2)(),
+ 0, NULL, NULL);
+
+ // Compute magnitude
+ kernel = cl::Kernel(d_cl_program,"magnitude_squared");
+ kernel.setArg(0,*d_cl_buffer_2); //input 1
+ kernel.setArg(1,*d_cl_buffer_magnitude); //output
+ d_cl_queue->enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(d_fft_size),
+ cl::NullRange);
+
+ // This is the only function that blocks this thread until all previously enqueued
+ // OpenCL commands are completed.
+ d_cl_queue->enqueueReadBuffer(*d_cl_buffer_magnitude, CL_TRUE, 0,
+ sizeof(float)*d_fft_size,d_magnitude);
+
+ // Search maximum
+ // @TODO: find an efficient way to search the maximum with OpenCL in the GPU.
+ volk_32f_index_max_16u_a(&indext, d_magnitude, d_fft_size);
+
+ // Normalize the maximum value to correct the scale factor introduced by FFTW
+ magt = d_magnitude[indext] / (fft_normalization_factor * fft_normalization_factor);
+
+ // 4- record the maximum peak and the associated synchronization parameters
+ if (d_mag < magt)
+ {
+ d_mag = magt;
+
+ // In case that d_bit_transition_flag = true, we compare the potentially
+ // new maximum test statistics (d_mag/d_input_power) with the value in
+ // d_test_statistics. When the second dwell is being processed, the value
+ // of d_mag/d_input_power could be lower than d_test_statistics (i.e,
+ // the maximum test statistics in the previous dwell is greater than
+ // current d_mag/d_input_power). Note that d_test_statistics is not
+ // restarted between consecutive dwells in multidwell operation.
+ if (d_test_statistics < (d_mag / d_input_power) || !d_bit_transition_flag)
+ {
+ d_gnss_synchro->Acq_delay_samples = (double)(indext % d_samples_per_code);
+ d_gnss_synchro->Acq_doppler_hz = (double)doppler;
+ d_gnss_synchro->Acq_samplestamp_samples = samplestamp;
+
+ // 5- Compute the test statistics and compare to the threshold
+ //d_test_statistics = 2 * d_fft_size * d_mag / d_input_power;
+ d_test_statistics = d_mag / d_input_power;
+ }
+ }
+
+ // Record results to file if required
+ if (d_dump)
+ {
+ std::stringstream filename;
+ std::streamsize n = 2 * sizeof(float) * (d_fft_size); // complex file write
+ filename.str("");
+ filename << "../data/test_statistics_" << d_gnss_synchro->System
+ <<"_" << d_gnss_synchro->Signal << "_sat_"
+ << d_gnss_synchro->PRN << "_doppler_" << doppler << ".dat";
+ d_dump_file.open(filename.str().c_str(), std::ios::out | std::ios::binary);
+ d_dump_file.write((char*)d_ifft->get_outbuf(), n); //write directly |abs(x)|^2 in this Doppler bin?
+ d_dump_file.close();
+ }
+ }
+
+// gettimeofday(&tv, NULL);
+// end = tv.tv_sec *1e6 + tv.tv_usec;
+// std::cout << "Acq time = " << (end-begin) << " us" << std::endl;
+
+ if (!d_bit_transition_flag)
+ {
+ if (d_test_statistics > d_threshold)
+ {
+ d_state = 2; // Positive acquisition
+ }
+ else if (d_well_count == d_max_dwells)
+ {
+ d_state = 3; // Negative acquisition
+ }
+ }
+ else
+ {
+ if (d_well_count == d_max_dwells) // d_max_dwells = 2
+ {
+ if (d_test_statistics > d_threshold)
+ {
+ d_state = 2; // Positive acquisition
+ }
+ else
+ {
+ d_state = 3; // Negative acquisition
+ }
+ }
+ }
+
+ d_core_working = false;
+}
+
+int pcps_opencl_acquisition_cc::general_work(int noutput_items,
+ gr_vector_int &ninput_items, gr_vector_const_void_star &input_items,
+ gr_vector_void_star &output_items)
+{
+
+ int acquisition_message = -1; //0=STOP_CHANNEL 1=ACQ_SUCCEES 2=ACQ_FAIL
+
+ switch (d_state)
+ {
+ case 0:
+ {
+ if (d_active)
+ {
+ //restart acquisition variables
+ d_gnss_synchro->Acq_delay_samples = 0.0;
+ d_gnss_synchro->Acq_doppler_hz = 0.0;
+ d_gnss_synchro->Acq_samplestamp_samples = 0;
+ d_well_count = 0;
+ d_mag = 0.0;
+ d_input_power = 0.0;
+ d_test_statistics = 0.0;
+ d_in_dwell_count = 0;
+ d_sample_counter_buffer.clear();
+
+ d_state = 1;
+ }
+
+ d_sample_counter += d_fft_size * ninput_items[0]; // sample counter
+
+ break;
+ }
+
+ case 1:
+ {
+ if (d_in_dwell_count < d_max_dwells)
+ {
+ // Fill internal buffer with d_max_dwells signal blocks. This step ensures that
+ // consecutive signal blocks will be processed in multi-dwell operation. This is
+ // essential when d_bit_transition_flag = true.
+ unsigned int num_dwells = std::min((int)(d_max_dwells-d_in_dwell_count),ninput_items[0]);
+ for (unsigned int i = 0; i < num_dwells; i++)
+ {
+ memcpy(d_in_buffer[d_in_dwell_count++], (gr_complex*)input_items[i],
+ sizeof(gr_complex)*d_fft_size);
+ d_sample_counter += d_fft_size;
+ d_sample_counter_buffer.push_back(d_sample_counter);
+ }
+
+ if (ninput_items[0] > (int)num_dwells)
+ {
+ d_sample_counter += d_fft_size * (ninput_items[0]-num_dwells);
+ }
+ }
+ else
+ {
+ // We already have d_max_dwells consecutive blocks in the internal buffer,
+ // just skip input blocks.
+ d_sample_counter += d_fft_size * ninput_items[0];
+ }
+
+ // We create a new thread to process next block if the following
+ // conditions are fulfilled:
+ // 1. There are new blocks in d_in_buffer that have not been processed yet
+ // (d_well_count < d_in_dwell_count).
+ // 2. No other acquisition_core thead is working (!d_core_working).
+ // 3. d_state==1. We need to check again d_state because it can be modified at any
+ // moment by the external thread (may have changed since checked in the switch()).
+ // If the external thread has already declared positive (d_state=2) or negative
+ // (d_state=3) acquisition, we don't have to process next block!!
+ if ((d_well_count < d_in_dwell_count) && !d_core_working && d_state==1)
+ {
+ d_core_working = true;
+ if (d_opencl == 0)
+ { // Use OpenCL implementation
+ boost::thread(&pcps_opencl_acquisition_cc::acquisition_core_opencl, this);
+ }
+ else
+ { // Use Volk implementation
+ boost::thread(&pcps_opencl_acquisition_cc::acquisition_core_volk, this);
+ }
+ }
+
+ break;
+ }
+
+ case 2:
+ {
+ // Declare positive acquisition using a message queue
+ DLOG(INFO) << "positive acquisition";
+ DLOG(INFO) << "satellite " << d_gnss_synchro->System << " " << d_gnss_synchro->PRN;
+ DLOG(INFO) << "sample_stamp " << d_sample_counter;
+ DLOG(INFO) << "test statistics value " << d_test_statistics;
+ DLOG(INFO) << "test statistics threshold " << d_threshold;
+ DLOG(INFO) << "code phase " << d_gnss_synchro->Acq_delay_samples;
+ DLOG(INFO) << "doppler " << d_gnss_synchro->Acq_doppler_hz;
+ DLOG(INFO) << "magnitude " << d_mag;
+ DLOG(INFO) << "input signal power " << d_input_power;
+
+ d_active = false;
+ d_state = 0;
+
+ d_sample_counter += d_fft_size * ninput_items[0]; // sample counter
+
+ acquisition_message = 1;
+ d_channel_internal_queue->push(acquisition_message);
+
+ break;
+ }
+
+ case 3:
+ {
+ // Declare negative acquisition using a message queue
+ DLOG(INFO) << "negative acquisition";
+ DLOG(INFO) << "satellite " << d_gnss_synchro->System << " " << d_gnss_synchro->PRN;
+ DLOG(INFO) << "sample_stamp " << d_sample_counter;
+ DLOG(INFO) << "test statistics value " << d_test_statistics;
+ DLOG(INFO) << "test statistics threshold " << d_threshold;
+ DLOG(INFO) << "code phase " << d_gnss_synchro->Acq_delay_samples;
+ DLOG(INFO) << "doppler " << d_gnss_synchro->Acq_doppler_hz;
+ DLOG(INFO) << "magnitude " << d_mag;
+ DLOG(INFO) << "input signal power " << d_input_power;
+
+ d_active = false;
+ d_state = 0;
+
+ d_sample_counter += d_fft_size * ninput_items[0]; // sample counter
+
+ acquisition_message = 2;
+ d_channel_internal_queue->push(acquisition_message);
+
+ break;
+ }
+ }
+
+ consume_each(ninput_items[0]);
+
+ return 0;
+}
diff --git a/src/algorithms/acquisition/gnuradio_blocks/pcps_opencl_acquisition_cc.h b/src/algorithms/acquisition/gnuradio_blocks/pcps_opencl_acquisition_cc.h
new file mode 100644
index 000000000..b2648c1db
--- /dev/null
+++ b/src/algorithms/acquisition/gnuradio_blocks/pcps_opencl_acquisition_cc.h
@@ -0,0 +1,277 @@
+/*!
+ * \file pcps_opencl_acquisition_cc.h
+ * \brief This class implements a Parallel Code Phase Search Acquisition
+ * using OpenCL to offload some functions to the GPU.
+ *
+ * Acquisition strategy (Kay Borre book + CFAR threshold).
+ *
+ * - Compute the input signal power estimation
+ *
- Doppler serial search loop
+ *
- Perform the FFT-based circular convolution (parallel time search)
+ *
- Record the maximum peak and the associated synchronization parameters
+ *
- Compute the test statistics and compare to the threshold
+ *
- Declare positive or negative acquisition using a message queue
+ *
+ *
+ * Kay Borre book: K.Borre, D.M.Akos, N.Bertelsen, P.Rinder, and S.H.Jensen,
+ * "A Software-Defined GPS and Galileo Receiver. A Single-Frequency
+ * Approach", Birkha user, 2007. pp 81-84
+ *
+ * \authors
+ * - Javier Arribas, 2011. jarribas(at)cttc.es
+ *
- Luis Esteve, 2012. luis(at)epsilon-formacion.com
+ *
- Marc Molina, 2013. marc.molina.pena@gmail.com
+ *
+ *
+ * -------------------------------------------------------------------------
+ *
+ * Copyright (C) 2010-2012 (see AUTHORS file for a list of contributors)
+ *
+ * GNSS-SDR is a software defined Global Navigation
+ * Satellite Systems receiver
+ *
+ * This file is part of GNSS-SDR.
+ *
+ * GNSS-SDR is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * at your option) any later version.
+ *
+ * GNSS-SDR is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNSS-SDR. If not, see .
+ *
+ * -------------------------------------------------------------------------
+ */
+
+#ifndef GNSS_SDR_PCPS_OPENCL_ACQUISITION_CC_H_
+#define GNSS_SDR_PCPS_OPENCL_ACQUISITION_CC_H_
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "concurrent_queue.h"
+#include "fft_internal.h"
+#include "gnss_synchro.h"
+
+#ifdef APPLE
+ #include
+#else
+ #include
+#endif
+
+class pcps_opencl_acquisition_cc;
+
+typedef boost::shared_ptr pcps_opencl_acquisition_cc_sptr;
+
+pcps_opencl_acquisition_cc_sptr
+pcps_make_opencl_acquisition_cc(unsigned int sampled_ms, unsigned int max_dwells,
+ unsigned int doppler_max, long freq, long fs_in,
+ int samples_per_ms, int samples_per_code,
+ bool bit_transition_flag,
+ gr::msg_queue::sptr queue, bool dump,
+ std::string dump_filename);
+
+/*!
+ * \brief This class implements a Parallel Code Phase Search Acquisition.
+ *
+ * Check \ref Navitec2012 "An Open Source Galileo E1 Software Receiver",
+ * Algorithm 1, for a pseudocode description of this implementation.
+ */
+class pcps_opencl_acquisition_cc: public gr::block
+{
+private:
+ friend pcps_opencl_acquisition_cc_sptr
+ pcps_make_opencl_acquisition_cc(unsigned int sampled_ms, unsigned int max_dwells,
+ unsigned int doppler_max, long freq, long fs_in,
+ int samples_per_ms, int samples_per_code,
+ bool bit_transition_flag,
+ gr::msg_queue::sptr queue, bool dump,
+ std::string dump_filename);
+
+
+ pcps_opencl_acquisition_cc(unsigned int sampled_ms, unsigned int max_dwells,
+ unsigned int doppler_max, long freq, long fs_in,
+ int samples_per_ms, int samples_per_code,
+ bool bit_transition_flag,
+ gr::msg_queue::sptr queue, bool dump,
+ std::string dump_filename);
+
+ void calculate_magnitudes(gr_complex* fft_begin, int doppler_shift,
+ int doppler_offset);
+
+ int init_opencl_environment(std::string kernel_filename);
+
+ long d_fs_in;
+ long d_freq;
+ int d_samples_per_ms;
+ int d_samples_per_code;
+ unsigned int d_doppler_resolution;
+ float d_threshold;
+ std::string d_satellite_str;
+ unsigned int d_doppler_max;
+ unsigned int d_doppler_step;
+ unsigned int d_sampled_ms;
+ unsigned int d_max_dwells;
+ unsigned int d_well_count;
+ unsigned int d_fft_size;
+ unsigned int d_fft_size_pow2;
+ int* d_max_doppler_indexs;
+ unsigned long int d_sample_counter;
+ gr_complex** d_grid_doppler_wipeoffs;
+ unsigned int d_num_doppler_bins;
+ gr_complex* d_fft_codes;
+ gr::fft::fft_complex* d_fft_if;
+ gr::fft::fft_complex* d_ifft;
+ Gnss_Synchro *d_gnss_synchro;
+ unsigned int d_code_phase;
+ float d_doppler_freq;
+ float d_mag;
+ float* d_magnitude;
+ float d_input_power;
+ float d_test_statistics;
+ bool d_bit_transition_flag;
+ gr::msg_queue::sptr d_queue;
+ concurrent_queue *d_channel_internal_queue;
+ std::ofstream d_dump_file;
+ bool d_active;
+ int d_state;
+ bool d_core_working;
+ bool d_dump;
+ unsigned int d_channel;
+ std::string d_dump_filename;
+ gr_complex* d_zero_vector;
+ gr_complex** d_in_buffer;
+ std::vector d_sample_counter_buffer;
+ unsigned int d_in_dwell_count;
+
+ cl::Platform d_cl_platform;
+ cl::Device d_cl_device;
+ cl::Context d_cl_context;
+ cl::Program d_cl_program;
+ cl::Buffer* d_cl_buffer_in;
+ cl::Buffer* d_cl_buffer_fft_codes;
+ cl::Buffer* d_cl_buffer_1;
+ cl::Buffer* d_cl_buffer_2;
+ cl::Buffer* d_cl_buffer_magnitude;
+ cl::Buffer** d_cl_buffer_grid_doppler_wipeoffs;
+ cl::CommandQueue* d_cl_queue;
+ clFFT_Plan d_cl_fft_plan;
+ cl_int d_cl_fft_batch_size;
+
+ int d_opencl;
+
+public:
+ /*!
+ * \brief Default destructor.
+ */
+ ~pcps_opencl_acquisition_cc();
+
+ /*!
+ * \brief Set acquisition/tracking common Gnss_Synchro object pointer
+ * to exchange synchronization data between acquisition and tracking blocks.
+ * \param p_gnss_synchro Satellite information shared by the processing blocks.
+ */
+ void set_gnss_synchro(Gnss_Synchro* p_gnss_synchro)
+ {
+ d_gnss_synchro = p_gnss_synchro;
+ }
+
+ /*!
+ * \brief Returns the maximum peak of grid search.
+ */
+ unsigned int mag()
+ {
+ return d_mag;
+ }
+
+ /*!
+ * \brief Initializes acquisition algorithm.
+ */
+ void init();
+
+ /*!
+ * \brief Sets local code for PCPS acquisition algorithm.
+ * \param code - Pointer to the PRN code.
+ */
+ void set_local_code(std::complex * code);
+
+ /*!
+ * \brief Starts acquisition algorithm, turning from standby mode to
+ * active mode
+ * \param active - bool that activates/deactivates the block.
+ */
+ void set_active(bool active)
+ {
+ d_active = active;
+ }
+
+ /*!
+ * \brief Set acquisition channel unique ID
+ * \param channel - receiver channel.
+ */
+ void set_channel(unsigned int channel)
+ {
+ d_channel = channel;
+ }
+
+ /*!
+ * \brief Set statistics threshold of PCPS algorithm.
+ * \param threshold - Threshold for signal detection (check \ref Navitec2012,
+ * Algorithm 1, for a definition of this threshold).
+ */
+ void set_threshold(float threshold)
+ {
+ d_threshold = threshold;
+ }
+
+ /*!
+ * \brief Set maximum Doppler grid search
+ * \param doppler_max - Maximum Doppler shift considered in the grid search [Hz].
+ */
+ void set_doppler_max(unsigned int doppler_max)
+ {
+ d_doppler_max = doppler_max;
+ }
+
+ /*!
+ * \brief Set Doppler steps for the grid search
+ * \param doppler_step - Frequency bin of the search grid [Hz].
+ */
+ void set_doppler_step(unsigned int doppler_step)
+ {
+ d_doppler_step = doppler_step;
+ }
+
+
+ /*!
+ * \brief Set tracking channel internal queue.
+ * \param channel_internal_queue - Channel's internal blocks information queue.
+ */
+ void set_channel_queue(concurrent_queue *channel_internal_queue)
+ {
+ d_channel_internal_queue = channel_internal_queue;
+ }
+
+ /*!
+ * \brief Parallel Code Phase Search Acquisition signal processing.
+ */
+ int general_work(int noutput_items, gr_vector_int &ninput_items,
+ gr_vector_const_void_star &input_items,
+ gr_vector_void_star &output_items);
+
+ void acquisition_core_volk();
+
+ void acquisition_core_opencl();
+};
+
+#endif /* GNSS_SDR_pcps_opencl_acquisition_cc_H_*/
diff --git a/src/algorithms/acquisition/gnuradio_blocks/pcps_tong_acquisition_cc.cc b/src/algorithms/acquisition/gnuradio_blocks/pcps_tong_acquisition_cc.cc
index 5adc1aaec..1eb3b3f09 100644
--- a/src/algorithms/acquisition/gnuradio_blocks/pcps_tong_acquisition_cc.cc
+++ b/src/algorithms/acquisition/gnuradio_blocks/pcps_tong_acquisition_cc.cc
@@ -71,7 +71,6 @@ pcps_tong_acquisition_cc_sptr pcps_tong_make_acquisition_cc(
tong_init_val, tong_max_val, queue, dump, dump_filename));
}
-
pcps_tong_acquisition_cc::pcps_tong_acquisition_cc(
unsigned int sampled_ms, unsigned int doppler_max,
long freq, long fs_in, int samples_per_ms,
@@ -103,7 +102,7 @@ pcps_tong_acquisition_cc::pcps_tong_acquisition_cc(
//todo: do something if posix_memalign fails
if (posix_memalign((void**)&d_fft_codes, 16, d_fft_size * sizeof(gr_complex)) == 0){};
- if (posix_memalign((void**)&d_magnitude, 16, d_fft_size * sizeof(gr_complex)) == 0){};
+ if (posix_memalign((void**)&d_magnitude, 16, d_fft_size * sizeof(float)) == 0){};
// Direct FFT
d_fft_if = new gr::fft::fft_complex(d_fft_size, true);
@@ -116,19 +115,15 @@ pcps_tong_acquisition_cc::pcps_tong_acquisition_cc(
d_dump_filename = dump_filename;
}
-
pcps_tong_acquisition_cc::~pcps_tong_acquisition_cc()
{
-
- for (unsigned int doppler_index = 0; doppler_index < d_num_doppler_bins; doppler_index++)
- {
- free(d_grid_doppler_wipeoffs[doppler_index]);
- free(d_grid_data[doppler_index]);
- }
-
-
if (d_num_doppler_bins > 0)
{
+ for (unsigned int i = 0; i < d_num_doppler_bins; i++)
+ {
+ free(d_grid_doppler_wipeoffs[i]);
+ free(d_grid_data[i]);
+ }
delete[] d_grid_doppler_wipeoffs;
delete[] d_grid_data;
}
@@ -145,7 +140,6 @@ pcps_tong_acquisition_cc::~pcps_tong_acquisition_cc()
}
}
-
void pcps_tong_acquisition_cc::set_local_code(std::complex * code)
{
memcpy(d_fft_if->get_inbuf(), code, sizeof(gr_complex)*d_fft_size);
@@ -163,7 +157,6 @@ void pcps_tong_acquisition_cc::set_local_code(std::complex * code)
}
}
-
void pcps_tong_acquisition_cc::init()
{
d_gnss_synchro->Acq_delay_samples = 0.0;
@@ -172,12 +165,16 @@ void pcps_tong_acquisition_cc::init()
d_mag = 0.0;
d_input_power = 0.0;
- // Create the carrier Doppler wipeoff signals
- d_num_doppler_bins = 0;//floor(2*std::abs((int)d_doppler_max)/d_doppler_step);
- for (int doppler = (int)(-d_doppler_max); doppler <= (int)d_doppler_max; doppler += d_doppler_step)
+ // Count the number of bins
+ d_num_doppler_bins = 0;
+ for (int doppler = (int)(-d_doppler_max);
+ doppler <= (int)d_doppler_max;
+ doppler += d_doppler_step)
{
d_num_doppler_bins++;
}
+
+ // Create the carrier Doppler wipeoff signals and allocate data grid.
d_grid_doppler_wipeoffs = new gr_complex*[d_num_doppler_bins];
d_grid_data = new float*[d_num_doppler_bins];
for (unsigned int doppler_index=0;doppler_indexexecute();
- // Search maximum
+ // Compute magnitude
volk_32fc_magnitude_squared_32f_a(d_magnitude, d_ifft->get_outbuf(), d_fft_size);
+ // Compute vector of test statistics corresponding to current doppler index.
volk_32f_s32f_multiply_32f_a(d_magnitude, d_magnitude,
1/(fft_normalization_factor*fft_normalization_factor*d_input_power),
d_fft_size);
+ // Accumulate test statistics in d_grid_data.
volk_32f_x2_add_32f_a(d_grid_data[doppler_index], d_magnitude, d_grid_data[doppler_index], d_fft_size);
+ // Search maximum
volk_32f_index_max_16u_a(&indext, d_grid_data[doppler_index], d_fft_size);
- // Normalize the maximum value to correct the scale factor introduced by FFTW
magt = d_grid_data[doppler_index][indext];
// 4- record the maximum peak and the associated synchronization parameters
@@ -328,7 +326,6 @@ int pcps_tong_acquisition_cc::general_work(int noutput_items,
}
// 5- Compute the test statistics and compare to the threshold
- //d_test_statistics = 2 * d_fft_size * d_mag / d_input_power;
d_test_statistics = d_mag;
if (d_test_statistics > d_threshold*d_well_count)
diff --git a/src/algorithms/libs/CMakeLists.txt b/src/algorithms/libs/CMakeLists.txt
index 58617d726..25d3b314f 100644
--- a/src/algorithms/libs/CMakeLists.txt
+++ b/src/algorithms/libs/CMakeLists.txt
@@ -25,16 +25,31 @@
# pass_through.cc
#)
#else(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
-set(GNSS_SPLIBS_SOURCES
- galileo_e1_signal_processing.cc
- gnss_sdr_valve.cc
- gnss_signal_processing.cc
- gps_sdr_signal_processing.cc
- nco_lib.cc
- pass_through.cc
-)
#endif(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+if(OPENCL_FOUND)
+ set(GNSS_SPLIBS_SOURCES
+ galileo_e1_signal_processing.cc
+ gnss_sdr_valve.cc
+ gnss_signal_processing.cc
+ gps_sdr_signal_processing.cc
+ nco_lib.cc
+ pass_through.cc
+ fft_execute.cc # Needs OpenCL
+ fft_setup.cc # Needs OpenCL
+ fft_kernelstring.cc # Needs OpenCL
+ )
+else(OPENCL_FOUND)
+ set(GNSS_SPLIBS_SOURCES
+ galileo_e1_signal_processing.cc
+ gnss_sdr_valve.cc
+ gnss_signal_processing.cc
+ gps_sdr_signal_processing.cc
+ nco_lib.cc
+ pass_through.cc
+ )
+endif(OPENCL_FOUND)
+
include_directories(
$(CMAKE_CURRENT_SOURCE_DIR)
${CMAKE_SOURCE_DIR}/src/core/system_parameters
@@ -45,5 +60,10 @@ include_directories(
${GFlags_INCLUDE_DIRS}
)
+if(OPENCL_FOUND)
+ include_directories( ${OPENCL_INCLUDE_DIRS} )
+ set(OPT_LIBRARIES ${OPT_LIBRARIES} ${OPENCL_LIBRARIES})
+endif(OPENCL_FOUND)
+
add_library(gnss_sp_libs ${GNSS_SPLIBS_SOURCES})
-target_link_libraries(gnss_sp_libs ${GNURADIO_RUNTIME_LIBRARIES} ${GNURADIO_BLOCKS_LIBRARIES} ${GNURADIO_FFT_LIBRARIES} ${GNURADIO_FILTER_LIBRARIES} gnss_rx)
+target_link_libraries(gnss_sp_libs ${GNURADIO_RUNTIME_LIBRARIES} ${GNURADIO_BLOCKS_LIBRARIES} ${GNURADIO_FFT_LIBRARIES} ${GNURADIO_FILTER_LIBRARIES} ${OPT_LIBRARIES} gnss_rx)
diff --git a/src/algorithms/libs/clFFT.h b/src/algorithms/libs/clFFT.h
new file mode 100644
index 000000000..994d67613
--- /dev/null
+++ b/src/algorithms/libs/clFFT.h
@@ -0,0 +1,134 @@
+
+//
+// File: clFFT.h
+//
+// Version: <1.0>
+//
+// Disclaimer: IMPORTANT: This Apple software is supplied to you by Apple Inc. ("Apple")
+// in consideration of your agreement to the following terms, and your use,
+// installation, modification or redistribution of this Apple software
+// constitutes acceptance of these terms. If you do not agree with these
+// terms, please do not use, install, modify or redistribute this Apple
+// software.
+//
+// In consideration of your agreement to abide by the following terms, and
+// subject to these terms, Apple grants you a personal, non - exclusive
+// license, under Apple's copyrights in this original Apple software ( the
+// "Apple Software" ), to use, reproduce, modify and redistribute the Apple
+// Software, with or without modifications, in source and / or binary forms;
+// provided that if you redistribute the Apple Software in its entirety and
+// without modifications, you must retain this notice and the following text
+// and disclaimers in all such redistributions of the Apple Software. Neither
+// the name, trademarks, service marks or logos of Apple Inc. may be used to
+// endorse or promote products derived from the Apple Software without specific
+// prior written permission from Apple. Except as expressly stated in this
+// notice, no other rights or licenses, express or implied, are granted by
+// Apple herein, including but not limited to any patent rights that may be
+// infringed by your derivative works or by other works in which the Apple
+// Software may be incorporated.
+//
+// The Apple Software is provided by Apple on an "AS IS" basis. APPLE MAKES NO
+// WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED
+// WARRANTIES OF NON - INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A
+// PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION
+// ALONE OR IN COMBINATION WITH YOUR PRODUCTS.
+//
+// IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR
+// CONSEQUENTIAL DAMAGES ( INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION ) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION
+// AND / OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER
+// UNDER THEORY OF CONTRACT, TORT ( INCLUDING NEGLIGENCE ), STRICT LIABILITY OR
+// OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Copyright ( C ) 2008 Apple Inc. All Rights Reserved.
+//
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+#ifndef __CLFFT_H
+#define __CLFFT_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include
+
+#ifdef APPLE
+ #include
+#else
+ #include
+#endif
+
+// XForm type
+typedef enum
+{
+ clFFT_Forward = -1,
+ clFFT_Inverse = 1
+
+}clFFT_Direction;
+
+// XForm dimension
+typedef enum
+{
+ clFFT_1D = 0,
+ clFFT_2D = 1,
+ clFFT_3D = 3
+
+}clFFT_Dimension;
+
+// XForm Data type
+typedef enum
+{
+ clFFT_SplitComplexFormat = 0,
+ clFFT_InterleavedComplexFormat = 1
+}clFFT_DataFormat;
+
+typedef struct
+{
+ unsigned int x;
+ unsigned int y;
+ unsigned int z;
+}clFFT_Dim3;
+
+typedef struct
+{
+ float *real;
+ float *imag;
+} clFFT_SplitComplex;
+
+typedef struct
+{
+ float real;
+ float imag;
+}clFFT_Complex;
+
+typedef void* clFFT_Plan;
+
+clFFT_Plan clFFT_CreatePlan( cl_context context, clFFT_Dim3 n, clFFT_Dimension dim, clFFT_DataFormat dataFormat, cl_int *error_code );
+
+void clFFT_DestroyPlan( clFFT_Plan plan );
+
+cl_int clFFT_ExecuteInterleaved( cl_command_queue queue, clFFT_Plan plan, cl_int batchSize, clFFT_Direction dir,
+ cl_mem data_in, cl_mem data_out,
+ cl_int num_events, cl_event *event_list, cl_event *event );
+
+cl_int clFFT_ExecutePlannar( cl_command_queue queue, clFFT_Plan plan, cl_int batchSize, clFFT_Direction dir,
+ cl_mem data_in_real, cl_mem data_in_imag, cl_mem data_out_real, cl_mem data_out_imag,
+ cl_int num_events, cl_event *event_list, cl_event *event );
+
+cl_int clFFT_1DTwistInterleaved(clFFT_Plan Plan, cl_command_queue queue, cl_mem array,
+ size_t numRows, size_t numCols, size_t startRow, size_t rowsToProcess, clFFT_Direction dir);
+
+
+cl_int clFFT_1DTwistPlannar(clFFT_Plan Plan, cl_command_queue queue, cl_mem array_real, cl_mem array_imag,
+ size_t numRows, size_t numCols, size_t startRow, size_t rowsToProcess, clFFT_Direction dir);
+
+void clFFT_DumpPlan( clFFT_Plan plan, FILE *file);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/algorithms/libs/fft_base_kernels.h b/src/algorithms/libs/fft_base_kernels.h
new file mode 100644
index 000000000..101795697
--- /dev/null
+++ b/src/algorithms/libs/fft_base_kernels.h
@@ -0,0 +1,277 @@
+
+//
+// File: fft_base_kernels.h
+//
+// Version: <1.0>
+//
+// Disclaimer: IMPORTANT: This Apple software is supplied to you by Apple Inc. ("Apple")
+// in consideration of your agreement to the following terms, and your use,
+// installation, modification or redistribution of this Apple software
+// constitutes acceptance of these terms. If you do not agree with these
+// terms, please do not use, install, modify or redistribute this Apple
+// software.
+//
+// In consideration of your agreement to abide by the following terms, and
+// subject to these terms, Apple grants you a personal, non - exclusive
+// license, under Apple's copyrights in this original Apple software ( the
+// "Apple Software" ), to use, reproduce, modify and redistribute the Apple
+// Software, with or without modifications, in source and / or binary forms;
+// provided that if you redistribute the Apple Software in its entirety and
+// without modifications, you must retain this notice and the following text
+// and disclaimers in all such redistributions of the Apple Software. Neither
+// the name, trademarks, service marks or logos of Apple Inc. may be used to
+// endorse or promote products derived from the Apple Software without specific
+// prior written permission from Apple. Except as expressly stated in this
+// notice, no other rights or licenses, express or implied, are granted by
+// Apple herein, including but not limited to any patent rights that may be
+// infringed by your derivative works or by other works in which the Apple
+// Software may be incorporated.
+//
+// The Apple Software is provided by Apple on an "AS IS" basis. APPLE MAKES NO
+// WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED
+// WARRANTIES OF NON - INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A
+// PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION
+// ALONE OR IN COMBINATION WITH YOUR PRODUCTS.
+//
+// IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR
+// CONSEQUENTIAL DAMAGES ( INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION ) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION
+// AND / OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER
+// UNDER THEORY OF CONTRACT, TORT ( INCLUDING NEGLIGENCE ), STRICT LIABILITY OR
+// OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Copyright ( C ) 2008 Apple Inc. All Rights Reserved.
+//
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+#ifndef __CL_FFT_BASE_KERNELS_
+#define __CL_FFT_BASE_KERNELS_
+
+#include
+
+using namespace std;
+
+static string baseKernels = string(
+ "#ifndef M_PI\n"
+ "#define M_PI 0x1.921fb54442d18p+1\n"
+ "#endif\n"
+ "#define complexMul(a,b) ((float2)(mad(-(a).y, (b).y, (a).x * (b).x), mad((a).y, (b).x, (a).x * (b).y)))\n"
+ "#define conj(a) ((float2)((a).x, -(a).y))\n"
+ "#define conjTransp(a) ((float2)(-(a).y, (a).x))\n"
+ "\n"
+ "#define fftKernel2(a,dir) \\\n"
+ "{ \\\n"
+ " float2 c = (a)[0]; \\\n"
+ " (a)[0] = c + (a)[1]; \\\n"
+ " (a)[1] = c - (a)[1]; \\\n"
+ "}\n"
+ "\n"
+ "#define fftKernel2S(d1,d2,dir) \\\n"
+ "{ \\\n"
+ " float2 c = (d1); \\\n"
+ " (d1) = c + (d2); \\\n"
+ " (d2) = c - (d2); \\\n"
+ "}\n"
+ "\n"
+ "#define fftKernel4(a,dir) \\\n"
+ "{ \\\n"
+ " fftKernel2S((a)[0], (a)[2], dir); \\\n"
+ " fftKernel2S((a)[1], (a)[3], dir); \\\n"
+ " fftKernel2S((a)[0], (a)[1], dir); \\\n"
+ " (a)[3] = (float2)(dir)*(conjTransp((a)[3])); \\\n"
+ " fftKernel2S((a)[2], (a)[3], dir); \\\n"
+ " float2 c = (a)[1]; \\\n"
+ " (a)[1] = (a)[2]; \\\n"
+ " (a)[2] = c; \\\n"
+ "}\n"
+ "\n"
+ "#define fftKernel4s(a0,a1,a2,a3,dir) \\\n"
+ "{ \\\n"
+ " fftKernel2S((a0), (a2), dir); \\\n"
+ " fftKernel2S((a1), (a3), dir); \\\n"
+ " fftKernel2S((a0), (a1), dir); \\\n"
+ " (a3) = (float2)(dir)*(conjTransp((a3))); \\\n"
+ " fftKernel2S((a2), (a3), dir); \\\n"
+ " float2 c = (a1); \\\n"
+ " (a1) = (a2); \\\n"
+ " (a2) = c; \\\n"
+ "}\n"
+ "\n"
+ "#define bitreverse8(a) \\\n"
+ "{ \\\n"
+ " float2 c; \\\n"
+ " c = (a)[1]; \\\n"
+ " (a)[1] = (a)[4]; \\\n"
+ " (a)[4] = c; \\\n"
+ " c = (a)[3]; \\\n"
+ " (a)[3] = (a)[6]; \\\n"
+ " (a)[6] = c; \\\n"
+ "}\n"
+ "\n"
+ "#define fftKernel8(a,dir) \\\n"
+ "{ \\\n"
+ " const float2 w1 = (float2)(0x1.6a09e6p-1f, dir*0x1.6a09e6p-1f); \\\n"
+ " const float2 w3 = (float2)(-0x1.6a09e6p-1f, dir*0x1.6a09e6p-1f); \\\n"
+ " float2 c; \\\n"
+ " fftKernel2S((a)[0], (a)[4], dir); \\\n"
+ " fftKernel2S((a)[1], (a)[5], dir); \\\n"
+ " fftKernel2S((a)[2], (a)[6], dir); \\\n"
+ " fftKernel2S((a)[3], (a)[7], dir); \\\n"
+ " (a)[5] = complexMul(w1, (a)[5]); \\\n"
+ " (a)[6] = (float2)(dir)*(conjTransp((a)[6])); \\\n"
+ " (a)[7] = complexMul(w3, (a)[7]); \\\n"
+ " fftKernel2S((a)[0], (a)[2], dir); \\\n"
+ " fftKernel2S((a)[1], (a)[3], dir); \\\n"
+ " fftKernel2S((a)[4], (a)[6], dir); \\\n"
+ " fftKernel2S((a)[5], (a)[7], dir); \\\n"
+ " (a)[3] = (float2)(dir)*(conjTransp((a)[3])); \\\n"
+ " (a)[7] = (float2)(dir)*(conjTransp((a)[7])); \\\n"
+ " fftKernel2S((a)[0], (a)[1], dir); \\\n"
+ " fftKernel2S((a)[2], (a)[3], dir); \\\n"
+ " fftKernel2S((a)[4], (a)[5], dir); \\\n"
+ " fftKernel2S((a)[6], (a)[7], dir); \\\n"
+ " bitreverse8((a)); \\\n"
+ "}\n"
+ "\n"
+ "#define bitreverse4x4(a) \\\n"
+ "{ \\\n"
+ " float2 c; \\\n"
+ " c = (a)[1]; (a)[1] = (a)[4]; (a)[4] = c; \\\n"
+ " c = (a)[2]; (a)[2] = (a)[8]; (a)[8] = c; \\\n"
+ " c = (a)[3]; (a)[3] = (a)[12]; (a)[12] = c; \\\n"
+ " c = (a)[6]; (a)[6] = (a)[9]; (a)[9] = c; \\\n"
+ " c = (a)[7]; (a)[7] = (a)[13]; (a)[13] = c; \\\n"
+ " c = (a)[11]; (a)[11] = (a)[14]; (a)[14] = c; \\\n"
+ "}\n"
+ "\n"
+ "#define fftKernel16(a,dir) \\\n"
+ "{ \\\n"
+ " const float w0 = 0x1.d906bcp-1f; \\\n"
+ " const float w1 = 0x1.87de2ap-2f; \\\n"
+ " const float w2 = 0x1.6a09e6p-1f; \\\n"
+ " fftKernel4s((a)[0], (a)[4], (a)[8], (a)[12], dir); \\\n"
+ " fftKernel4s((a)[1], (a)[5], (a)[9], (a)[13], dir); \\\n"
+ " fftKernel4s((a)[2], (a)[6], (a)[10], (a)[14], dir); \\\n"
+ " fftKernel4s((a)[3], (a)[7], (a)[11], (a)[15], dir); \\\n"
+ " (a)[5] = complexMul((a)[5], (float2)(w0, dir*w1)); \\\n"
+ " (a)[6] = complexMul((a)[6], (float2)(w2, dir*w2)); \\\n"
+ " (a)[7] = complexMul((a)[7], (float2)(w1, dir*w0)); \\\n"
+ " (a)[9] = complexMul((a)[9], (float2)(w2, dir*w2)); \\\n"
+ " (a)[10] = (float2)(dir)*(conjTransp((a)[10])); \\\n"
+ " (a)[11] = complexMul((a)[11], (float2)(-w2, dir*w2)); \\\n"
+ " (a)[13] = complexMul((a)[13], (float2)(w1, dir*w0)); \\\n"
+ " (a)[14] = complexMul((a)[14], (float2)(-w2, dir*w2)); \\\n"
+ " (a)[15] = complexMul((a)[15], (float2)(-w0, dir*-w1)); \\\n"
+ " fftKernel4((a), dir); \\\n"
+ " fftKernel4((a) + 4, dir); \\\n"
+ " fftKernel4((a) + 8, dir); \\\n"
+ " fftKernel4((a) + 12, dir); \\\n"
+ " bitreverse4x4((a)); \\\n"
+ "}\n"
+ "\n"
+ "#define bitreverse32(a) \\\n"
+ "{ \\\n"
+ " float2 c1, c2; \\\n"
+ " c1 = (a)[2]; (a)[2] = (a)[1]; c2 = (a)[4]; (a)[4] = c1; c1 = (a)[8]; (a)[8] = c2; c2 = (a)[16]; (a)[16] = c1; (a)[1] = c2; \\\n"
+ " c1 = (a)[6]; (a)[6] = (a)[3]; c2 = (a)[12]; (a)[12] = c1; c1 = (a)[24]; (a)[24] = c2; c2 = (a)[17]; (a)[17] = c1; (a)[3] = c2; \\\n"
+ " c1 = (a)[10]; (a)[10] = (a)[5]; c2 = (a)[20]; (a)[20] = c1; c1 = (a)[9]; (a)[9] = c2; c2 = (a)[18]; (a)[18] = c1; (a)[5] = c2; \\\n"
+ " c1 = (a)[14]; (a)[14] = (a)[7]; c2 = (a)[28]; (a)[28] = c1; c1 = (a)[25]; (a)[25] = c2; c2 = (a)[19]; (a)[19] = c1; (a)[7] = c2; \\\n"
+ " c1 = (a)[22]; (a)[22] = (a)[11]; c2 = (a)[13]; (a)[13] = c1; c1 = (a)[26]; (a)[26] = c2; c2 = (a)[21]; (a)[21] = c1; (a)[11] = c2; \\\n"
+ " c1 = (a)[30]; (a)[30] = (a)[15]; c2 = (a)[29]; (a)[29] = c1; c1 = (a)[27]; (a)[27] = c2; c2 = (a)[23]; (a)[23] = c1; (a)[15] = c2; \\\n"
+ "}\n"
+ "\n"
+ "#define fftKernel32(a,dir) \\\n"
+ "{ \\\n"
+ " fftKernel2S((a)[0], (a)[16], dir); \\\n"
+ " fftKernel2S((a)[1], (a)[17], dir); \\\n"
+ " fftKernel2S((a)[2], (a)[18], dir); \\\n"
+ " fftKernel2S((a)[3], (a)[19], dir); \\\n"
+ " fftKernel2S((a)[4], (a)[20], dir); \\\n"
+ " fftKernel2S((a)[5], (a)[21], dir); \\\n"
+ " fftKernel2S((a)[6], (a)[22], dir); \\\n"
+ " fftKernel2S((a)[7], (a)[23], dir); \\\n"
+ " fftKernel2S((a)[8], (a)[24], dir); \\\n"
+ " fftKernel2S((a)[9], (a)[25], dir); \\\n"
+ " fftKernel2S((a)[10], (a)[26], dir); \\\n"
+ " fftKernel2S((a)[11], (a)[27], dir); \\\n"
+ " fftKernel2S((a)[12], (a)[28], dir); \\\n"
+ " fftKernel2S((a)[13], (a)[29], dir); \\\n"
+ " fftKernel2S((a)[14], (a)[30], dir); \\\n"
+ " fftKernel2S((a)[15], (a)[31], dir); \\\n"
+ " (a)[17] = complexMul((a)[17], (float2)(0x1.f6297cp-1f, dir*0x1.8f8b84p-3f)); \\\n"
+ " (a)[18] = complexMul((a)[18], (float2)(0x1.d906bcp-1f, dir*0x1.87de2ap-2f)); \\\n"
+ " (a)[19] = complexMul((a)[19], (float2)(0x1.a9b662p-1f, dir*0x1.1c73b4p-1f)); \\\n"
+ " (a)[20] = complexMul((a)[20], (float2)(0x1.6a09e6p-1f, dir*0x1.6a09e6p-1f)); \\\n"
+ " (a)[21] = complexMul((a)[21], (float2)(0x1.1c73b4p-1f, dir*0x1.a9b662p-1f)); \\\n"
+ " (a)[22] = complexMul((a)[22], (float2)(0x1.87de2ap-2f, dir*0x1.d906bcp-1f)); \\\n"
+ " (a)[23] = complexMul((a)[23], (float2)(0x1.8f8b84p-3f, dir*0x1.f6297cp-1f)); \\\n"
+ " (a)[24] = complexMul((a)[24], (float2)(0x0p+0f, dir*0x1p+0f)); \\\n"
+ " (a)[25] = complexMul((a)[25], (float2)(-0x1.8f8b84p-3f, dir*0x1.f6297cp-1f)); \\\n"
+ " (a)[26] = complexMul((a)[26], (float2)(-0x1.87de2ap-2f, dir*0x1.d906bcp-1f)); \\\n"
+ " (a)[27] = complexMul((a)[27], (float2)(-0x1.1c73b4p-1f, dir*0x1.a9b662p-1f)); \\\n"
+ " (a)[28] = complexMul((a)[28], (float2)(-0x1.6a09e6p-1f, dir*0x1.6a09e6p-1f)); \\\n"
+ " (a)[29] = complexMul((a)[29], (float2)(-0x1.a9b662p-1f, dir*0x1.1c73b4p-1f)); \\\n"
+ " (a)[30] = complexMul((a)[30], (float2)(-0x1.d906bcp-1f, dir*0x1.87de2ap-2f)); \\\n"
+ " (a)[31] = complexMul((a)[31], (float2)(-0x1.f6297cp-1f, dir*0x1.8f8b84p-3f)); \\\n"
+ " fftKernel16((a), dir); \\\n"
+ " fftKernel16((a) + 16, dir); \\\n"
+ " bitreverse32((a)); \\\n"
+ "}\n\n"
+ );
+
+static string twistKernelInterleaved = string(
+ "__kernel void \\\n"
+ "clFFT_1DTwistInterleaved(__global float2 *in, unsigned int startRow, unsigned int numCols, unsigned int N, unsigned int numRowsToProcess, int dir) \\\n"
+ "{ \\\n"
+ " float2 a, w; \\\n"
+ " float ang; \\\n"
+ " unsigned int j; \\\n"
+ " unsigned int i = get_global_id(0); \\\n"
+ " unsigned int startIndex = i; \\\n"
+ " \\\n"
+ " if(i < numCols) \\\n"
+ " { \\\n"
+ " for(j = 0; j < numRowsToProcess; j++) \\\n"
+ " { \\\n"
+ " a = in[startIndex]; \\\n"
+ " ang = 2.0f * M_PI * dir * i * (startRow + j) / N; \\\n"
+ " w = (float2)(native_cos(ang), native_sin(ang)); \\\n"
+ " a = complexMul(a, w); \\\n"
+ " in[startIndex] = a; \\\n"
+ " startIndex += numCols; \\\n"
+ " } \\\n"
+ " } \\\n"
+ "} \\\n"
+ );
+
+static string twistKernelPlannar = string(
+ "__kernel void \\\n"
+ "clFFT_1DTwistSplit(__global float *in_real, __global float *in_imag , unsigned int startRow, unsigned int numCols, unsigned int N, unsigned int numRowsToProcess, int dir) \\\n"
+ "{ \\\n"
+ " float2 a, w; \\\n"
+ " float ang; \\\n"
+ " unsigned int j; \\\n"
+ " unsigned int i = get_global_id(0); \\\n"
+ " unsigned int startIndex = i; \\\n"
+ " \\\n"
+ " if(i < numCols) \\\n"
+ " { \\\n"
+ " for(j = 0; j < numRowsToProcess; j++) \\\n"
+ " { \\\n"
+ " a = (float2)(in_real[startIndex], in_imag[startIndex]); \\\n"
+ " ang = 2.0f * M_PI * dir * i * (startRow + j) / N; \\\n"
+ " w = (float2)(native_cos(ang), native_sin(ang)); \\\n"
+ " a = complexMul(a, w); \\\n"
+ " in_real[startIndex] = a.x; \\\n"
+ " in_imag[startIndex] = a.y; \\\n"
+ " startIndex += numCols; \\\n"
+ " } \\\n"
+ " } \\\n"
+ "} \\\n"
+ );
+
+
+
+#endif
diff --git a/src/algorithms/libs/fft_execute.cc b/src/algorithms/libs/fft_execute.cc
new file mode 100644
index 000000000..6416e163e
--- /dev/null
+++ b/src/algorithms/libs/fft_execute.cc
@@ -0,0 +1,405 @@
+
+//
+// File: fft_execute.cpp
+//
+// Version: <1.0>
+//
+// Disclaimer: IMPORTANT: This Apple software is supplied to you by Apple Inc. ("Apple")
+// in consideration of your agreement to the following terms, and your use,
+// installation, modification or redistribution of this Apple software
+// constitutes acceptance of these terms. If you do not agree with these
+// terms, please do not use, install, modify or redistribute this Apple
+// software.¬
+//
+// In consideration of your agreement to abide by the following terms, and
+// subject to these terms, Apple grants you a personal, non - exclusive
+// license, under Apple's copyrights in this original Apple software ( the
+// "Apple Software" ), to use, reproduce, modify and redistribute the Apple
+// Software, with or without modifications, in source and / or binary forms;
+// provided that if you redistribute the Apple Software in its entirety and
+// without modifications, you must retain this notice and the following text
+// and disclaimers in all such redistributions of the Apple Software. Neither
+// the name, trademarks, service marks or logos of Apple Inc. may be used to
+// endorse or promote products derived from the Apple Software without specific
+// prior written permission from Apple. Except as expressly stated in this
+// notice, no other rights or licenses, express or implied, are granted by
+// Apple herein, including but not limited to any patent rights that may be
+// infringed by your derivative works or by other works in which the Apple
+// Software may be incorporated.
+//
+// The Apple Software is provided by Apple on an "AS IS" basis. APPLE MAKES NO
+// WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED
+// WARRANTIES OF NON - INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A
+// PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION
+// ALONE OR IN COMBINATION WITH YOUR PRODUCTS.
+//
+// IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR
+// CONSEQUENTIAL DAMAGES ( INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION ) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION
+// AND / OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER
+// UNDER THEORY OF CONTRACT, TORT ( INCLUDING NEGLIGENCE ), STRICT LIABILITY OR
+// OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Copyright ( C ) 2008 Apple Inc. All Rights Reserved.
+//
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+#include "fft_internal.h"
+#include "clFFT.h"
+#include
+#include
+#include
+
+#define max(a,b) (((a)>(b)) ? (a) : (b))
+#define min(a,b) (((a)<(b)) ? (a) : (b))
+
+static cl_int
+allocateTemporaryBufferInterleaved(cl_fft_plan *plan, cl_uint batchSize)
+{
+ cl_int err = CL_SUCCESS;
+ if(plan->temp_buffer_needed && plan->last_batch_size != batchSize)
+ {
+ plan->last_batch_size = batchSize;
+ size_t tmpLength = plan->n.x * plan->n.y * plan->n.z * batchSize * 2 * sizeof(cl_float);
+
+ if(plan->tempmemobj)
+ clReleaseMemObject(plan->tempmemobj);
+
+ plan->tempmemobj = clCreateBuffer(plan->context, CL_MEM_READ_WRITE, tmpLength, NULL, &err);
+ }
+ return err;
+}
+
+static cl_int
+allocateTemporaryBufferPlannar(cl_fft_plan *plan, cl_uint batchSize)
+{
+ cl_int err = CL_SUCCESS;
+ cl_int terr;
+ if(plan->temp_buffer_needed && plan->last_batch_size != batchSize)
+ {
+ plan->last_batch_size = batchSize;
+ size_t tmpLength = plan->n.x * plan->n.y * plan->n.z * batchSize * sizeof(cl_float);
+
+ if(plan->tempmemobj_real)
+ clReleaseMemObject(plan->tempmemobj_real);
+
+ if(plan->tempmemobj_imag)
+ clReleaseMemObject(plan->tempmemobj_imag);
+
+ plan->tempmemobj_real = clCreateBuffer(plan->context, CL_MEM_READ_WRITE, tmpLength, NULL, &err);
+ plan->tempmemobj_imag = clCreateBuffer(plan->context, CL_MEM_READ_WRITE, tmpLength, NULL, &terr);
+ err |= terr;
+ }
+ return err;
+}
+
+void
+getKernelWorkDimensions(cl_fft_plan *plan, cl_fft_kernel_info *kernelInfo, cl_int *batchSize, size_t *gWorkItems, size_t *lWorkItems)
+{
+ *lWorkItems = kernelInfo->num_workitems_per_workgroup;
+ int numWorkGroups = kernelInfo->num_workgroups;
+ int numXFormsPerWG = kernelInfo->num_xforms_per_workgroup;
+
+ switch(kernelInfo->dir)
+ {
+ case cl_fft_kernel_x:
+ *batchSize *= (plan->n.y * plan->n.z);
+ numWorkGroups = (*batchSize % numXFormsPerWG) ? (*batchSize/numXFormsPerWG + 1) : (*batchSize/numXFormsPerWG);
+ numWorkGroups *= kernelInfo->num_workgroups;
+ break;
+ case cl_fft_kernel_y:
+ *batchSize *= plan->n.z;
+ numWorkGroups *= *batchSize;
+ break;
+ case cl_fft_kernel_z:
+ numWorkGroups *= *batchSize;
+ break;
+ }
+
+ *gWorkItems = numWorkGroups * *lWorkItems;
+}
+
+cl_int
+clFFT_ExecuteInterleaved( cl_command_queue queue, clFFT_Plan Plan, cl_int batchSize, clFFT_Direction dir,
+ cl_mem data_in, cl_mem data_out,
+ cl_int num_events, cl_event *event_list, cl_event *event )
+{
+ int s;
+ cl_fft_plan *plan = (cl_fft_plan *) Plan;
+ if(plan->format != clFFT_InterleavedComplexFormat)
+ return CL_INVALID_VALUE;
+
+ cl_int err;
+ size_t gWorkItems, lWorkItems;
+ int inPlaceDone;
+
+ cl_int isInPlace = data_in == data_out ? 1 : 0;
+
+ if((err = allocateTemporaryBufferInterleaved(plan, batchSize)) != CL_SUCCESS)
+ return err;
+
+ cl_mem memObj[3];
+ memObj[0] = data_in;
+ memObj[1] = data_out;
+ memObj[2] = plan->tempmemobj;
+ cl_fft_kernel_info *kernelInfo = plan->kernel_info;
+ int numKernels = plan->num_kernels;
+
+ int numKernelsOdd = numKernels & 1;
+ int currRead = 0;
+ int currWrite = 1;
+
+ // at least one external dram shuffle (transpose) required
+ if(plan->temp_buffer_needed)
+ {
+ // in-place transform
+ if(isInPlace)
+ {
+ inPlaceDone = 0;
+ currRead = 1;
+ currWrite = 2;
+ }
+ else
+ {
+ currWrite = (numKernels & 1) ? 1 : 2;
+ }
+
+ while(kernelInfo)
+ {
+ if( isInPlace && numKernelsOdd && !inPlaceDone && kernelInfo->in_place_possible)
+ {
+ currWrite = currRead;
+ inPlaceDone = 1;
+ }
+
+ s = batchSize;
+ getKernelWorkDimensions(plan, kernelInfo, &s, &gWorkItems, &lWorkItems);
+ err |= clSetKernelArg(kernelInfo->kernel, 0, sizeof(cl_mem), &memObj[currRead]);
+ err |= clSetKernelArg(kernelInfo->kernel, 1, sizeof(cl_mem), &memObj[currWrite]);
+ err |= clSetKernelArg(kernelInfo->kernel, 2, sizeof(cl_int), &dir);
+ err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_int), &s);
+
+ err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, 0, NULL, NULL);
+ if(err)
+ return err;
+
+ currRead = (currWrite == 1) ? 1 : 2;
+ currWrite = (currWrite == 1) ? 2 : 1;
+
+ kernelInfo = kernelInfo->next;
+ }
+ }
+ // no dram shuffle (transpose required) transform
+ // all kernels can execute in-place.
+ else {
+
+ while(kernelInfo)
+ {
+ s = batchSize;
+ getKernelWorkDimensions(plan, kernelInfo, &s, &gWorkItems, &lWorkItems);
+ err |= clSetKernelArg(kernelInfo->kernel, 0, sizeof(cl_mem), &memObj[currRead]);
+ err |= clSetKernelArg(kernelInfo->kernel, 1, sizeof(cl_mem), &memObj[currWrite]);
+ err |= clSetKernelArg(kernelInfo->kernel, 2, sizeof(cl_int), &dir);
+ err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_int), &s);
+
+ err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, 0, NULL, NULL);
+ if(err)
+ return err;
+
+ currRead = 1;
+ currWrite = 1;
+
+ kernelInfo = kernelInfo->next;
+ }
+ }
+
+ return err;
+}
+
+cl_int
+clFFT_ExecutePlannar( cl_command_queue queue, clFFT_Plan Plan, cl_int batchSize, clFFT_Direction dir,
+ cl_mem data_in_real, cl_mem data_in_imag, cl_mem data_out_real, cl_mem data_out_imag,
+ cl_int num_events, cl_event *event_list, cl_event *event)
+{
+ int s;
+ cl_fft_plan *plan = (cl_fft_plan *) Plan;
+
+ if(plan->format != clFFT_SplitComplexFormat)
+ return CL_INVALID_VALUE;
+
+ cl_int err;
+ size_t gWorkItems, lWorkItems;
+ int inPlaceDone;
+
+ cl_int isInPlace = ((data_in_real == data_out_real) && (data_in_imag == data_out_imag)) ? 1 : 0;
+
+ if((err = allocateTemporaryBufferPlannar(plan, batchSize)) != CL_SUCCESS)
+ return err;
+
+ cl_mem memObj_real[3];
+ cl_mem memObj_imag[3];
+ memObj_real[0] = data_in_real;
+ memObj_real[1] = data_out_real;
+ memObj_real[2] = plan->tempmemobj_real;
+ memObj_imag[0] = data_in_imag;
+ memObj_imag[1] = data_out_imag;
+ memObj_imag[2] = plan->tempmemobj_imag;
+
+ cl_fft_kernel_info *kernelInfo = plan->kernel_info;
+ int numKernels = plan->num_kernels;
+
+ int numKernelsOdd = numKernels & 1;
+ int currRead = 0;
+ int currWrite = 1;
+
+ // at least one external dram shuffle (transpose) required
+ if(plan->temp_buffer_needed)
+ {
+ // in-place transform
+ if(isInPlace)
+ {
+ inPlaceDone = 0;
+ currRead = 1;
+ currWrite = 2;
+ }
+ else
+ {
+ currWrite = (numKernels & 1) ? 1 : 2;
+ }
+
+ while(kernelInfo)
+ {
+ if( isInPlace && numKernelsOdd && !inPlaceDone && kernelInfo->in_place_possible)
+ {
+ currWrite = currRead;
+ inPlaceDone = 1;
+ }
+
+ s = batchSize;
+ getKernelWorkDimensions(plan, kernelInfo, &s, &gWorkItems, &lWorkItems);
+ err |= clSetKernelArg(kernelInfo->kernel, 0, sizeof(cl_mem), &memObj_real[currRead]);
+ err |= clSetKernelArg(kernelInfo->kernel, 1, sizeof(cl_mem), &memObj_imag[currRead]);
+ err |= clSetKernelArg(kernelInfo->kernel, 2, sizeof(cl_mem), &memObj_real[currWrite]);
+ err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_mem), &memObj_imag[currWrite]);
+ err |= clSetKernelArg(kernelInfo->kernel, 4, sizeof(cl_int), &dir);
+ err |= clSetKernelArg(kernelInfo->kernel, 5, sizeof(cl_int), &s);
+
+ err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, 0, NULL, NULL);
+ if(err)
+ return err;
+
+ currRead = (currWrite == 1) ? 1 : 2;
+ currWrite = (currWrite == 1) ? 2 : 1;
+
+ kernelInfo = kernelInfo->next;
+ }
+ }
+ // no dram shuffle (transpose required) transform
+ else {
+
+ while(kernelInfo)
+ {
+ s = batchSize;
+ getKernelWorkDimensions(plan, kernelInfo, &s, &gWorkItems, &lWorkItems);
+ err |= clSetKernelArg(kernelInfo->kernel, 0, sizeof(cl_mem), &memObj_real[currRead]);
+ err |= clSetKernelArg(kernelInfo->kernel, 1, sizeof(cl_mem), &memObj_imag[currRead]);
+ err |= clSetKernelArg(kernelInfo->kernel, 2, sizeof(cl_mem), &memObj_real[currWrite]);
+ err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_mem), &memObj_imag[currWrite]);
+ err |= clSetKernelArg(kernelInfo->kernel, 4, sizeof(cl_int), &dir);
+ err |= clSetKernelArg(kernelInfo->kernel, 5, sizeof(cl_int), &s);
+
+ err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, 0, NULL, NULL);
+ if(err)
+ return err;
+
+ currRead = 1;
+ currWrite = 1;
+
+ kernelInfo = kernelInfo->next;
+ }
+ }
+
+ return err;
+}
+
+cl_int
+clFFT_1DTwistInterleaved(clFFT_Plan Plan, cl_command_queue queue, cl_mem array,
+ unsigned numRows, unsigned numCols, unsigned startRow, unsigned rowsToProcess, clFFT_Direction dir)
+{
+ cl_fft_plan *plan = (cl_fft_plan *) Plan;
+
+ unsigned int N = numRows*numCols;
+ unsigned int nCols = numCols;
+ unsigned int sRow = startRow;
+ unsigned int rToProcess = rowsToProcess;
+ int d = dir;
+ int err = 0;
+
+ cl_device_id device_id;
+ err = clGetCommandQueueInfo(queue, CL_QUEUE_DEVICE, sizeof(cl_device_id), &device_id, NULL);
+ if(err)
+ return err;
+
+ size_t gSize;
+ err = clGetKernelWorkGroupInfo(plan->twist_kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &gSize, NULL);
+ if(err)
+ return err;
+
+ gSize = min(128, gSize);
+ size_t numGlobalThreads[1] = { max(numCols / gSize, 1)*gSize };
+ size_t numLocalThreads[1] = { gSize };
+
+ err |= clSetKernelArg(plan->twist_kernel, 0, sizeof(cl_mem), &array);
+ err |= clSetKernelArg(plan->twist_kernel, 1, sizeof(unsigned int), &sRow);
+ err |= clSetKernelArg(plan->twist_kernel, 2, sizeof(unsigned int), &nCols);
+ err |= clSetKernelArg(plan->twist_kernel, 3, sizeof(unsigned int), &N);
+ err |= clSetKernelArg(plan->twist_kernel, 4, sizeof(unsigned int), &rToProcess);
+ err |= clSetKernelArg(plan->twist_kernel, 5, sizeof(int), &d);
+
+ err |= clEnqueueNDRangeKernel(queue, plan->twist_kernel, 1, NULL, numGlobalThreads, numLocalThreads, 0, NULL, NULL);
+
+ return err;
+}
+
+cl_int
+clFFT_1DTwistPlannar(clFFT_Plan Plan, cl_command_queue queue, cl_mem array_real, cl_mem array_imag,
+ unsigned numRows, unsigned numCols, unsigned startRow, unsigned rowsToProcess, clFFT_Direction dir)
+{
+ cl_fft_plan *plan = (cl_fft_plan *) Plan;
+
+ unsigned int N = numRows*numCols;
+ unsigned int nCols = numCols;
+ unsigned int sRow = startRow;
+ unsigned int rToProcess = rowsToProcess;
+ int d = dir;
+ int err = 0;
+
+ cl_device_id device_id;
+ err = clGetCommandQueueInfo(queue, CL_QUEUE_DEVICE, sizeof(cl_device_id), &device_id, NULL);
+ if(err)
+ return err;
+
+ size_t gSize;
+ err = clGetKernelWorkGroupInfo(plan->twist_kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &gSize, NULL);
+ if(err)
+ return err;
+
+ gSize = min(128, gSize);
+ size_t numGlobalThreads[1] = { max(numCols / gSize, 1)*gSize };
+ size_t numLocalThreads[1] = { gSize };
+
+ err |= clSetKernelArg(plan->twist_kernel, 0, sizeof(cl_mem), &array_real);
+ err |= clSetKernelArg(plan->twist_kernel, 1, sizeof(cl_mem), &array_imag);
+ err |= clSetKernelArg(plan->twist_kernel, 2, sizeof(unsigned int), &sRow);
+ err |= clSetKernelArg(plan->twist_kernel, 3, sizeof(unsigned int), &nCols);
+ err |= clSetKernelArg(plan->twist_kernel, 4, sizeof(unsigned int), &N);
+ err |= clSetKernelArg(plan->twist_kernel, 5, sizeof(unsigned int), &rToProcess);
+ err |= clSetKernelArg(plan->twist_kernel, 6, sizeof(int), &d);
+
+ err |= clEnqueueNDRangeKernel(queue, plan->twist_kernel, 1, NULL, numGlobalThreads, numLocalThreads, 0, NULL, NULL);
+
+ return err;
+}
+
diff --git a/src/algorithms/libs/fft_internal.h b/src/algorithms/libs/fft_internal.h
new file mode 100644
index 000000000..5b797a3a0
--- /dev/null
+++ b/src/algorithms/libs/fft_internal.h
@@ -0,0 +1,163 @@
+
+//
+// File: fft_internal.h
+//
+// Version: <1.0>
+//
+// Disclaimer: IMPORTANT: This Apple software is supplied to you by Apple Inc. ("Apple")
+// in consideration of your agreement to the following terms, and your use,
+// installation, modification or redistribution of this Apple software
+// constitutes acceptance of these terms. If you do not agree with these
+// terms, please do not use, install, modify or redistribute this Apple
+// software.
+//
+// In consideration of your agreement to abide by the following terms, and
+// subject to these terms, Apple grants you a personal, non - exclusive
+// license, under Apple's copyrights in this original Apple software ( the
+// "Apple Software" ), to use, reproduce, modify and redistribute the Apple
+// Software, with or without modifications, in source and / or binary forms;
+// provided that if you redistribute the Apple Software in its entirety and
+// without modifications, you must retain this notice and the following text
+// and disclaimers in all such redistributions of the Apple Software. Neither
+// the name, trademarks, service marks or logos of Apple Inc. may be used to
+// endorse or promote products derived from the Apple Software without specific
+// prior written permission from Apple. Except as expressly stated in this
+// notice, no other rights or licenses, express or implied, are granted by
+// Apple herein, including but not limited to any patent rights that may be
+// infringed by your derivative works or by other works in which the Apple
+// Software may be incorporated.
+//
+// The Apple Software is provided by Apple on an "AS IS" basis. APPLE MAKES NO
+// WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED
+// WARRANTIES OF NON - INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A
+// PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION
+// ALONE OR IN COMBINATION WITH YOUR PRODUCTS.
+//
+// IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR
+// CONSEQUENTIAL DAMAGES ( INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION ) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION
+// AND / OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER
+// UNDER THEORY OF CONTRACT, TORT ( INCLUDING NEGLIGENCE ), STRICT LIABILITY OR
+// OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Copyright ( C ) 2008 Apple Inc. All Rights Reserved.
+//
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+#ifndef __CLFFT_INTERNAL_H
+#define __CLFFT_INTERNAL_H
+
+#include "clFFT.h"
+#include
+#include
+#include
+
+using namespace std;
+
+typedef enum kernel_dir_t
+{
+ cl_fft_kernel_x,
+ cl_fft_kernel_y,
+ cl_fft_kernel_z
+}cl_fft_kernel_dir;
+
+typedef struct kernel_info_t
+{
+ cl_kernel kernel;
+ char *kernel_name;
+ unsigned lmem_size;
+ unsigned num_workgroups;
+ unsigned num_xforms_per_workgroup;
+ unsigned num_workitems_per_workgroup;
+ cl_fft_kernel_dir dir;
+ int in_place_possible;
+ kernel_info_t *next;
+}cl_fft_kernel_info;
+
+typedef struct
+{
+ // context in which fft resources are created and kernels are executed
+ cl_context context;
+
+ // size of signal
+ clFFT_Dim3 n;
+
+ // dimension of transform ... must be either 1D, 2D or 3D
+ clFFT_Dimension dim;
+
+ // data format ... must be either interleaved or plannar
+ clFFT_DataFormat format;
+
+ // string containing kernel source. Generated at runtime based on
+ // n, dim, format and other parameters
+ string *kernel_string;
+
+ // CL program containing source and kernel this particular
+ // n, dim, data format
+ cl_program program;
+
+ // linked list of kernels which needs to be executed for this fft
+ cl_fft_kernel_info *kernel_info;
+
+ // number of kernels
+ int num_kernels;
+
+ // twist kernel for virtualizing fft of very large sizes that do not
+ // fit in GPU global memory
+ cl_kernel twist_kernel;
+
+ // flag indicating if temporary intermediate buffer is needed or not.
+ // this depends on fft kernels being executed and if transform is
+ // in-place or out-of-place. e.g. Local memory fft (say 1D 1024 ...
+ // one that does not require global transpose do not need temporary buffer)
+ // 2D 1024x1024 out-of-place fft however do require intermediate buffer.
+ // If temp buffer is needed, its allocation is lazy i.e. its not allocated
+ // until its needed
+ cl_int temp_buffer_needed;
+
+ // Batch size is runtime parameter and size of temporary buffer (if needed)
+ // depends on batch size. Allocation of temporary buffer is lazy i.e. its
+ // only created when needed. Once its created at first call of clFFT_Executexxx
+ // it is not allocated next time if next time clFFT_Executexxx is called with
+ // batch size different than the first call. last_batch_size caches the last
+ // batch size with which this plan is used so that we dont keep allocating/deallocating
+ // temp buffer if same batch size is used again and again.
+ unsigned last_batch_size;
+
+ // temporary buffer for interleaved plan
+ cl_mem tempmemobj;
+
+ // temporary buffer for planner plan. Only one of tempmemobj or
+ // (tempmemobj_real, tempmemobj_imag) pair is valid (allocated) depending
+ // data format of plan (plannar or interleaved)
+ cl_mem tempmemobj_real, tempmemobj_imag;
+
+ // Maximum size of signal for which local memory transposed based
+ // fft is sufficient i.e. no global mem transpose (communication)
+ // is needed
+ unsigned max_localmem_fft_size;
+
+ // Maximum work items per work group allowed. This, along with max_radix below controls
+ // maximum local memory being used by fft kernels of this plan. Set to 256 by default
+ unsigned max_work_item_per_workgroup;
+
+ // Maximum base radix for local memory fft ... this controls the maximum register
+ // space used by work items. Currently defaults to 16
+ unsigned max_radix;
+
+ // Device depended parameter that tells how many work-items need to be read consecutive
+ // values to make sure global memory access by work-items of a work-group result in
+ // coalesced memory access to utilize full bandwidth e.g. on NVidia tesla, this is 16
+ unsigned min_mem_coalesce_width;
+
+ // Number of local memory banks. This is used to geneate kernel with local memory
+ // transposes with appropriate padding to avoid bank conflicts to local memory
+ // e.g. on NVidia it is 16.
+ unsigned num_local_mem_banks;
+}cl_fft_plan;
+
+void FFT1D(cl_fft_plan *plan, cl_fft_kernel_dir dir);
+
+#endif
diff --git a/src/algorithms/libs/fft_kernelstring.cc b/src/algorithms/libs/fft_kernelstring.cc
new file mode 100644
index 000000000..7847ce959
--- /dev/null
+++ b/src/algorithms/libs/fft_kernelstring.cc
@@ -0,0 +1,1257 @@
+
+//
+// File: fft_kernelstring.cpp
+//
+// Version: <1.0>
+//
+// Disclaimer: IMPORTANT: This Apple software is supplied to you by Apple Inc. ("Apple")
+// in consideration of your agreement to the following terms, and your use,
+// installation, modification or redistribution of this Apple software
+// constitutes acceptance of these terms. If you do not agree with these
+// terms, please do not use, install, modify or redistribute this Apple
+// software.
+//
+// In consideration of your agreement to abide by the following terms, and
+// subject to these terms, Apple grants you a personal, non - exclusive
+// license, under Apple's copyrights in this original Apple software ( the
+// "Apple Software" ), to use, reproduce, modify and redistribute the Apple
+// Software, with or without modifications, in source and / or binary forms;
+// provided that if you redistribute the Apple Software in its entirety and
+// without modifications, you must retain this notice and the following text
+// and disclaimers in all such redistributions of the Apple Software. Neither
+// the name, trademarks, service marks or logos of Apple Inc. may be used to
+// endorse or promote products derived from the Apple Software without specific
+// prior written permission from Apple. Except as expressly stated in this
+// notice, no other rights or licenses, express or implied, are granted by
+// Apple herein, including but not limited to any patent rights that may be
+// infringed by your derivative works or by other works in which the Apple
+// Software may be incorporated.
+//
+// The Apple Software is provided by Apple on an "AS IS" basis. APPLE MAKES NO
+// WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED
+// WARRANTIES OF NON - INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A
+// PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION
+// ALONE OR IN COMBINATION WITH YOUR PRODUCTS.
+//
+// IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR
+// CONSEQUENTIAL DAMAGES ( INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION ) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION
+// AND / OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER
+// UNDER THEORY OF CONTRACT, TORT ( INCLUDING NEGLIGENCE ), STRICT LIABILITY OR
+// OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Copyright ( C ) 2008 Apple Inc. All Rights Reserved.
+//
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "fft_internal.h"
+#include "clFFT.h"
+
+using namespace std;
+
+#define max(A,B) ((A) > (B) ? (A) : (B))
+#define min(A,B) ((A) < (B) ? (A) : (B))
+
+static string
+num2str(int num)
+{
+ char temp[200];
+ sprintf(temp, "%d", num);
+ return string(temp);
+}
+
+// For any n, this function decomposes n into factors for loacal memory tranpose
+// based fft. Factors (radices) are sorted such that the first one (radixArray[0])
+// is the largest. This base radix determines the number of registers used by each
+// work item and product of remaining radices determine the size of work group needed.
+// To make things concrete with and example, suppose n = 1024. It is decomposed into
+// 1024 = 16 x 16 x 4. Hence kernel uses float2 a[16], for local in-register fft and
+// needs 16 x 4 = 64 work items per work group. So kernel first performance 64 length
+// 16 ffts (64 work items working in parallel) following by transpose using local
+// memory followed by again 64 length 16 ffts followed by transpose using local memory
+// followed by 256 length 4 ffts. For the last step since with size of work group is
+// 64 and each work item can array for 16 values, 64 work items can compute 256 length
+// 4 ffts by each work item computing 4 length 4 ffts.
+// Similarly for n = 2048 = 8 x 8 x 8 x 4, each work group has 8 x 8 x 4 = 256 work
+// iterms which each computes 256 (in-parallel) length 8 ffts in-register, followed
+// by transpose using local memory, followed by 256 length 8 in-register ffts, followed
+// by transpose using local memory, followed by 256 length 8 in-register ffts, followed
+// by transpose using local memory, followed by 512 length 4 in-register ffts. Again,
+// for the last step, each work item computes two length 4 in-register ffts and thus
+// 256 work items are needed to compute all 512 ffts.
+// For n = 32 = 8 x 4, 4 work items first compute 4 in-register
+// lenth 8 ffts, followed by transpose using local memory followed by 8 in-register
+// length 4 ffts, where each work item computes two length 4 ffts thus 4 work items
+// can compute 8 length 4 ffts. However if work group size of say 64 is choosen,
+// each work group can compute 64/ 4 = 16 size 32 ffts (batched transform).
+// Users can play with these parameters to figure what gives best performance on
+// their particular device i.e. some device have less register space thus using
+// smaller base radix can avoid spilling ... some has small local memory thus
+// using smaller work group size may be required etc
+
+static void
+getRadixArray(unsigned int n, unsigned int *radixArray, unsigned int *numRadices, unsigned int maxRadix)
+{
+ if(maxRadix > 1)
+ {
+ maxRadix = min(n, maxRadix);
+ unsigned int cnt = 0;
+ while(n > maxRadix)
+ {
+ radixArray[cnt++] = maxRadix;
+ n /= maxRadix;
+ }
+ radixArray[cnt++] = n;
+ *numRadices = cnt;
+ return;
+ }
+
+ switch(n)
+ {
+ case 2:
+ *numRadices = 1;
+ radixArray[0] = 2;
+ break;
+
+ case 4:
+ *numRadices = 1;
+ radixArray[0] = 4;
+ break;
+
+ case 8:
+ *numRadices = 1;
+ radixArray[0] = 8;
+ break;
+
+ case 16:
+ *numRadices = 2;
+ radixArray[0] = 8; radixArray[1] = 2;
+ break;
+
+ case 32:
+ *numRadices = 2;
+ radixArray[0] = 8; radixArray[1] = 4;
+ break;
+
+ case 64:
+ *numRadices = 2;
+ radixArray[0] = 8; radixArray[1] = 8;
+ break;
+
+ case 128:
+ *numRadices = 3;
+ radixArray[0] = 8; radixArray[1] = 4; radixArray[2] = 4;
+ break;
+
+ case 256:
+ *numRadices = 4;
+ radixArray[0] = 4; radixArray[1] = 4; radixArray[2] = 4; radixArray[3] = 4;
+ break;
+
+ case 512:
+ *numRadices = 3;
+ radixArray[0] = 8; radixArray[1] = 8; radixArray[2] = 8;
+ break;
+
+ case 1024:
+ *numRadices = 3;
+ radixArray[0] = 16; radixArray[1] = 16; radixArray[2] = 4;
+ break;
+ case 2048:
+ *numRadices = 4;
+ radixArray[0] = 8; radixArray[1] = 8; radixArray[2] = 8; radixArray[3] = 4;
+ break;
+ default:
+ *numRadices = 0;
+ return;
+ }
+}
+
+static void
+insertHeader(string &kernelString, string &kernelName, clFFT_DataFormat dataFormat)
+{
+ if(dataFormat == clFFT_SplitComplexFormat)
+ kernelString += string("__kernel void ") + kernelName + string("(__global float *in_real, __global float *in_imag, __global float *out_real, __global float *out_imag, int dir, int S)\n");
+ else
+ kernelString += string("__kernel void ") + kernelName + string("(__global float2 *in, __global float2 *out, int dir, int S)\n");
+}
+
+static void
+insertVariables(string &kStream, int maxRadix)
+{
+ kStream += string(" int i, j, r, indexIn, indexOut, index, tid, bNum, xNum, k, l;\n");
+ kStream += string(" int s, ii, jj, offset;\n");
+ kStream += string(" float2 w;\n");
+ kStream += string(" float ang, angf, ang1;\n");
+ kStream += string(" __local float *lMemStore, *lMemLoad;\n");
+ kStream += string(" float2 a[") + num2str(maxRadix) + string("];\n");
+ kStream += string(" int lId = get_local_id( 0 );\n");
+ kStream += string(" int groupId = get_group_id( 0 );\n");
+}
+
+static void
+formattedLoad(string &kernelString, int aIndex, int gIndex, clFFT_DataFormat dataFormat)
+{
+ if(dataFormat == clFFT_InterleavedComplexFormat)
+ kernelString += string(" a[") + num2str(aIndex) + string("] = in[") + num2str(gIndex) + string("];\n");
+ else
+ {
+ kernelString += string(" a[") + num2str(aIndex) + string("].x = in_real[") + num2str(gIndex) + string("];\n");
+ kernelString += string(" a[") + num2str(aIndex) + string("].y = in_imag[") + num2str(gIndex) + string("];\n");
+ }
+}
+
+static void
+formattedStore(string &kernelString, int aIndex, int gIndex, clFFT_DataFormat dataFormat)
+{
+ if(dataFormat == clFFT_InterleavedComplexFormat)
+ kernelString += string(" out[") + num2str(gIndex) + string("] = a[") + num2str(aIndex) + string("];\n");
+ else
+ {
+ kernelString += string(" out_real[") + num2str(gIndex) + string("] = a[") + num2str(aIndex) + string("].x;\n");
+ kernelString += string(" out_imag[") + num2str(gIndex) + string("] = a[") + num2str(aIndex) + string("].y;\n");
+ }
+}
+
+static int
+insertGlobalLoadsAndTranspose(string &kernelString, int N, int numWorkItemsPerXForm, int numXFormsPerWG, int R0, int mem_coalesce_width, clFFT_DataFormat dataFormat)
+{
+ int log2NumWorkItemsPerXForm = (int) log2(numWorkItemsPerXForm);
+ int groupSize = numWorkItemsPerXForm * numXFormsPerWG;
+ int i, j;
+ int lMemSize = 0;
+
+ if(numXFormsPerWG > 1)
+ kernelString += string(" s = S & ") + num2str(numXFormsPerWG - 1) + string(";\n");
+
+ if(numWorkItemsPerXForm >= mem_coalesce_width)
+ {
+ if(numXFormsPerWG > 1)
+ {
+ kernelString += string(" ii = lId & ") + num2str(numWorkItemsPerXForm-1) + string(";\n");
+ kernelString += string(" jj = lId >> ") + num2str(log2NumWorkItemsPerXForm) + string(";\n");
+ kernelString += string(" if( !s || (groupId < get_num_groups(0)-1) || (jj < s) ) {\n");
+ kernelString += string(" offset = mad24( mad24(groupId, ") + num2str(numXFormsPerWG) + string(", jj), ") + num2str(N) + string(", ii );\n");
+ if(dataFormat == clFFT_InterleavedComplexFormat)
+ {
+ kernelString += string(" in += offset;\n");
+ kernelString += string(" out += offset;\n");
+ }
+ else
+ {
+ kernelString += string(" in_real += offset;\n");
+ kernelString += string(" in_imag += offset;\n");
+ kernelString += string(" out_real += offset;\n");
+ kernelString += string(" out_imag += offset;\n");
+ }
+ for(i = 0; i < R0; i++)
+ formattedLoad(kernelString, i, i*numWorkItemsPerXForm, dataFormat);
+ kernelString += string(" }\n");
+ }
+ else
+ {
+ kernelString += string(" ii = lId;\n");
+ kernelString += string(" jj = 0;\n");
+ kernelString += string(" offset = mad24(groupId, ") + num2str(N) + string(", ii);\n");
+ if(dataFormat == clFFT_InterleavedComplexFormat)
+ {
+ kernelString += string(" in += offset;\n");
+ kernelString += string(" out += offset;\n");
+ }
+ else
+ {
+ kernelString += string(" in_real += offset;\n");
+ kernelString += string(" in_imag += offset;\n");
+ kernelString += string(" out_real += offset;\n");
+ kernelString += string(" out_imag += offset;\n");
+ }
+ for(i = 0; i < R0; i++)
+ formattedLoad(kernelString, i, i*numWorkItemsPerXForm, dataFormat);
+ }
+ }
+ else if( N >= mem_coalesce_width )
+ {
+ int numInnerIter = N / mem_coalesce_width;
+ int numOuterIter = numXFormsPerWG / ( groupSize / mem_coalesce_width );
+
+ kernelString += string(" ii = lId & ") + num2str(mem_coalesce_width - 1) + string(";\n");
+ kernelString += string(" jj = lId >> ") + num2str((int)log2(mem_coalesce_width)) + string(";\n");
+ kernelString += string(" lMemStore = sMem + mad24( jj, ") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n");
+ kernelString += string(" offset = mad24( groupId, ") + num2str(numXFormsPerWG) + string(", jj);\n");
+ kernelString += string(" offset = mad24( offset, ") + num2str(N) + string(", ii );\n");
+ if(dataFormat == clFFT_InterleavedComplexFormat)
+ {
+ kernelString += string(" in += offset;\n");
+ kernelString += string(" out += offset;\n");
+ }
+ else
+ {
+ kernelString += string(" in_real += offset;\n");
+ kernelString += string(" in_imag += offset;\n");
+ kernelString += string(" out_real += offset;\n");
+ kernelString += string(" out_imag += offset;\n");
+ }
+
+ kernelString += string("if((groupId == get_num_groups(0)-1) && s) {\n");
+ for(i = 0; i < numOuterIter; i++ )
+ {
+ kernelString += string(" if( jj < s ) {\n");
+ for(j = 0; j < numInnerIter; j++ )
+ formattedLoad(kernelString, i * numInnerIter + j, j * mem_coalesce_width + i * ( groupSize / mem_coalesce_width ) * N, dataFormat);
+ kernelString += string(" }\n");
+ if(i != numOuterIter - 1)
+ kernelString += string(" jj += ") + num2str(groupSize / mem_coalesce_width) + string(";\n");
+ }
+ kernelString += string("}\n ");
+ kernelString += string("else {\n");
+ for(i = 0; i < numOuterIter; i++ )
+ {
+ for(j = 0; j < numInnerIter; j++ )
+ formattedLoad(kernelString, i * numInnerIter + j, j * mem_coalesce_width + i * ( groupSize / mem_coalesce_width ) * N, dataFormat);
+ }
+ kernelString += string("}\n");
+
+ kernelString += string(" ii = lId & ") + num2str(numWorkItemsPerXForm - 1) + string(";\n");
+ kernelString += string(" jj = lId >> ") + num2str(log2NumWorkItemsPerXForm) + string(";\n");
+ kernelString += string(" lMemLoad = sMem + mad24( jj, ") + num2str(N + numWorkItemsPerXForm) + string(", ii);\n");
+
+ for( i = 0; i < numOuterIter; i++ )
+ {
+ for( j = 0; j < numInnerIter; j++ )
+ {
+ kernelString += string(" lMemStore[") + num2str(j * mem_coalesce_width + i * ( groupSize / mem_coalesce_width ) * (N + numWorkItemsPerXForm )) + string("] = a[") +
+ num2str(i * numInnerIter + j) + string("].x;\n");
+ }
+ }
+ kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
+
+ for( i = 0; i < R0; i++ )
+ kernelString += string(" a[") + num2str(i) + string("].x = lMemLoad[") + num2str(i * numWorkItemsPerXForm) + string("];\n");
+ kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
+
+ for( i = 0; i < numOuterIter; i++ )
+ {
+ for( j = 0; j < numInnerIter; j++ )
+ {
+ kernelString += string(" lMemStore[") + num2str(j * mem_coalesce_width + i * ( groupSize / mem_coalesce_width ) * (N + numWorkItemsPerXForm )) + string("] = a[") +
+ num2str(i * numInnerIter + j) + string("].y;\n");
+ }
+ }
+ kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
+
+ for( i = 0; i < R0; i++ )
+ kernelString += string(" a[") + num2str(i) + string("].y = lMemLoad[") + num2str(i * numWorkItemsPerXForm) + string("];\n");
+ kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
+
+ lMemSize = (N + numWorkItemsPerXForm) * numXFormsPerWG;
+ }
+ else
+ {
+ kernelString += string(" offset = mad24( groupId, ") + num2str(N * numXFormsPerWG) + string(", lId );\n");
+ if(dataFormat == clFFT_InterleavedComplexFormat)
+ {
+ kernelString += string(" in += offset;\n");
+ kernelString += string(" out += offset;\n");
+ }
+ else
+ {
+ kernelString += string(" in_real += offset;\n");
+ kernelString += string(" in_imag += offset;\n");
+ kernelString += string(" out_real += offset;\n");
+ kernelString += string(" out_imag += offset;\n");
+ }
+
+ kernelString += string(" ii = lId & ") + num2str(N-1) + string(";\n");
+ kernelString += string(" jj = lId >> ") + num2str((int)log2(N)) + string(";\n");
+ kernelString += string(" lMemStore = sMem + mad24( jj, ") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n");
+
+ kernelString += string("if((groupId == get_num_groups(0)-1) && s) {\n");
+ for( i = 0; i < R0; i++ )
+ {
+ kernelString += string(" if(jj < s )\n");
+ formattedLoad(kernelString, i, i*groupSize, dataFormat);
+ if(i != R0 - 1)
+ kernelString += string(" jj += ") + num2str(groupSize / N) + string(";\n");
+ }
+ kernelString += string("}\n");
+ kernelString += string("else {\n");
+ for( i = 0; i < R0; i++ )
+ {
+ formattedLoad(kernelString, i, i*groupSize, dataFormat);
+ }
+ kernelString += string("}\n");
+
+ if(numWorkItemsPerXForm > 1)
+ {
+ kernelString += string(" ii = lId & ") + num2str(numWorkItemsPerXForm - 1) + string(";\n");
+ kernelString += string(" jj = lId >> ") + num2str(log2NumWorkItemsPerXForm) + string(";\n");
+ kernelString += string(" lMemLoad = sMem + mad24( jj, ") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n");
+ }
+ else
+ {
+ kernelString += string(" ii = 0;\n");
+ kernelString += string(" jj = lId;\n");
+ kernelString += string(" lMemLoad = sMem + mul24( jj, ") + num2str(N + numWorkItemsPerXForm) + string(");\n");
+ }
+
+
+ for( i = 0; i < R0; i++ )
+ kernelString += string(" lMemStore[") + num2str(i * ( groupSize / N ) * ( N + numWorkItemsPerXForm )) + string("] = a[") + num2str(i) + string("].x;\n");
+ kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
+
+ for( i = 0; i < R0; i++ )
+ kernelString += string(" a[") + num2str(i) + string("].x = lMemLoad[") + num2str(i * numWorkItemsPerXForm) + string("];\n");
+ kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
+
+ for( i = 0; i < R0; i++ )
+ kernelString += string(" lMemStore[") + num2str(i * ( groupSize / N ) * ( N + numWorkItemsPerXForm )) + string("] = a[") + num2str(i) + string("].y;\n");
+ kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
+
+ for( i = 0; i < R0; i++ )
+ kernelString += string(" a[") + num2str(i) + string("].y = lMemLoad[") + num2str(i * numWorkItemsPerXForm) + string("];\n");
+ kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
+
+ lMemSize = (N + numWorkItemsPerXForm) * numXFormsPerWG;
+ }
+
+ return lMemSize;
+}
+
+static int
+insertGlobalStoresAndTranspose(string &kernelString, int N, int maxRadix, int Nr, int numWorkItemsPerXForm, int numXFormsPerWG, int mem_coalesce_width, clFFT_DataFormat dataFormat)
+{
+ int groupSize = numWorkItemsPerXForm * numXFormsPerWG;
+ int i, j, k, ind;
+ int lMemSize = 0;
+ int numIter = maxRadix / Nr;
+ string indent = string("");
+
+ if( numWorkItemsPerXForm >= mem_coalesce_width )
+ {
+ if(numXFormsPerWG > 1)
+ {
+ kernelString += string(" if( !s || (groupId < get_num_groups(0)-1) || (jj < s) ) {\n");
+ indent = string(" ");
+ }
+ for(i = 0; i < maxRadix; i++)
+ {
+ j = i % numIter;
+ k = i / numIter;
+ ind = j * Nr + k;
+ formattedStore(kernelString, ind, i*numWorkItemsPerXForm, dataFormat);
+ }
+ if(numXFormsPerWG > 1)
+ kernelString += string(" }\n");
+ }
+ else if( N >= mem_coalesce_width )
+ {
+ int numInnerIter = N / mem_coalesce_width;
+ int numOuterIter = numXFormsPerWG / ( groupSize / mem_coalesce_width );
+
+ kernelString += string(" lMemLoad = sMem + mad24( jj, ") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n");
+ kernelString += string(" ii = lId & ") + num2str(mem_coalesce_width - 1) + string(";\n");
+ kernelString += string(" jj = lId >> ") + num2str((int)log2(mem_coalesce_width)) + string(";\n");
+ kernelString += string(" lMemStore = sMem + mad24( jj,") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n");
+
+ for( i = 0; i < maxRadix; i++ )
+ {
+ j = i % numIter;
+ k = i / numIter;
+ ind = j * Nr + k;
+ kernelString += string(" lMemLoad[") + num2str(i*numWorkItemsPerXForm) + string("] = a[") + num2str(ind) + string("].x;\n");
+ }
+ kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
+
+ for( i = 0; i < numOuterIter; i++ )
+ for( j = 0; j < numInnerIter; j++ )
+ kernelString += string(" a[") + num2str(i*numInnerIter + j) + string("].x = lMemStore[") + num2str(j*mem_coalesce_width + i*( groupSize / mem_coalesce_width )*(N + numWorkItemsPerXForm)) + string("];\n");
+ kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
+
+ for( i = 0; i < maxRadix; i++ )
+ {
+ j = i % numIter;
+ k = i / numIter;
+ ind = j * Nr + k;
+ kernelString += string(" lMemLoad[") + num2str(i*numWorkItemsPerXForm) + string("] = a[") + num2str(ind) + string("].y;\n");
+ }
+ kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
+
+ for( i = 0; i < numOuterIter; i++ )
+ for( j = 0; j < numInnerIter; j++ )
+ kernelString += string(" a[") + num2str(i*numInnerIter + j) + string("].y = lMemStore[") + num2str(j*mem_coalesce_width + i*( groupSize / mem_coalesce_width )*(N + numWorkItemsPerXForm)) + string("];\n");
+ kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
+
+ kernelString += string("if((groupId == get_num_groups(0)-1) && s) {\n");
+ for(i = 0; i < numOuterIter; i++ )
+ {
+ kernelString += string(" if( jj < s ) {\n");
+ for(j = 0; j < numInnerIter; j++ )
+ formattedStore(kernelString, i*numInnerIter + j, j*mem_coalesce_width + i*(groupSize/mem_coalesce_width)*N, dataFormat);
+ kernelString += string(" }\n");
+ if(i != numOuterIter - 1)
+ kernelString += string(" jj += ") + num2str(groupSize / mem_coalesce_width) + string(";\n");
+ }
+ kernelString += string("}\n");
+ kernelString += string("else {\n");
+ for(i = 0; i < numOuterIter; i++ )
+ {
+ for(j = 0; j < numInnerIter; j++ )
+ formattedStore(kernelString, i*numInnerIter + j, j*mem_coalesce_width + i*(groupSize/mem_coalesce_width)*N, dataFormat);
+ }
+ kernelString += string("}\n");
+
+ lMemSize = (N + numWorkItemsPerXForm) * numXFormsPerWG;
+ }
+ else
+ {
+ kernelString += string(" lMemLoad = sMem + mad24( jj,") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n");
+
+ kernelString += string(" ii = lId & ") + num2str(N - 1) + string(";\n");
+ kernelString += string(" jj = lId >> ") + num2str((int) log2(N)) + string(";\n");
+ kernelString += string(" lMemStore = sMem + mad24( jj,") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n");
+
+ for( i = 0; i < maxRadix; i++ )
+ {
+ j = i % numIter;
+ k = i / numIter;
+ ind = j * Nr + k;
+ kernelString += string(" lMemLoad[") + num2str(i*numWorkItemsPerXForm) + string("] = a[") + num2str(ind) + string("].x;\n");
+ }
+ kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
+
+ for( i = 0; i < maxRadix; i++ )
+ kernelString += string(" a[") + num2str(i) + string("].x = lMemStore[") + num2str(i*( groupSize / N )*( N + numWorkItemsPerXForm )) + string("];\n");
+ kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
+
+ for( i = 0; i < maxRadix; i++ )
+ {
+ j = i % numIter;
+ k = i / numIter;
+ ind = j * Nr + k;
+ kernelString += string(" lMemLoad[") + num2str(i*numWorkItemsPerXForm) + string("] = a[") + num2str(ind) + string("].y;\n");
+ }
+ kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
+
+ for( i = 0; i < maxRadix; i++ )
+ kernelString += string(" a[") + num2str(i) + string("].y = lMemStore[") + num2str(i*( groupSize / N )*( N + numWorkItemsPerXForm )) + string("];\n");
+ kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n");
+
+ kernelString += string("if((groupId == get_num_groups(0)-1) && s) {\n");
+ for( i = 0; i < maxRadix; i++ )
+ {
+ kernelString += string(" if(jj < s ) {\n");
+ formattedStore(kernelString, i, i*groupSize, dataFormat);
+ kernelString += string(" }\n");
+ if( i != maxRadix - 1)
+ kernelString += string(" jj +=") + num2str(groupSize / N) + string(";\n");
+ }
+ kernelString += string("}\n");
+ kernelString += string("else {\n");
+ for( i = 0; i < maxRadix; i++ )
+ {
+ formattedStore(kernelString, i, i*groupSize, dataFormat);
+ }
+ kernelString += string("}\n");
+
+ lMemSize = (N + numWorkItemsPerXForm) * numXFormsPerWG;
+ }
+
+ return lMemSize;
+}
+
+static void
+insertfftKernel(string &kernelString, int Nr, int numIter)
+{
+ int i;
+ for(i = 0; i < numIter; i++)
+ {
+ kernelString += string(" fftKernel") + num2str(Nr) + string("(a+") + num2str(i*Nr) + string(", dir);\n");
+ }
+}
+
+static void
+insertTwiddleKernel(string &kernelString, int Nr, int numIter, int Nprev, int len, int numWorkItemsPerXForm)
+{
+ int z, k;
+ int logNPrev = (int)log2(Nprev);
+
+ for(z = 0; z < numIter; z++)
+ {
+ if(z == 0)
+ {
+ if(Nprev > 1)
+ kernelString += string(" angf = (float) (ii >> ") + num2str(logNPrev) + string(");\n");
+ else
+ kernelString += string(" angf = (float) ii;\n");
+ }
+ else
+ {
+ if(Nprev > 1)
+ kernelString += string(" angf = (float) ((") + num2str(z*numWorkItemsPerXForm) + string(" + ii) >>") + num2str(logNPrev) + string(");\n");
+ else
+ kernelString += string(" angf = (float) (") + num2str(z*numWorkItemsPerXForm) + string(" + ii);\n");
+ }
+
+ for(k = 1; k < Nr; k++) {
+ int ind = z*Nr + k;
+ //float fac = (float) (2.0 * M_PI * (double) k / (double) len);
+ kernelString += string(" ang = dir * ( 2.0f * M_PI * ") + num2str(k) + string(".0f / ") + num2str(len) + string(".0f )") + string(" * angf;\n");
+ kernelString += string(" w = (float2)(native_cos(ang), native_sin(ang));\n");
+ kernelString += string(" a[") + num2str(ind) + string("] = complexMul(a[") + num2str(ind) + string("], w);\n");
+ }
+ }
+}
+
+static int
+getPadding(int numWorkItemsPerXForm, int Nprev, int numWorkItemsReq, int numXFormsPerWG, int Nr, int numBanks, int *offset, int *midPad)
+{
+ if((numWorkItemsPerXForm <= Nprev) || (Nprev >= numBanks))
+ *offset = 0;
+ else {
+ int numRowsReq = ((numWorkItemsPerXForm < numBanks) ? numWorkItemsPerXForm : numBanks) / Nprev;
+ int numColsReq = 1;
+ if(numRowsReq > Nr)
+ numColsReq = numRowsReq / Nr;
+ numColsReq = Nprev * numColsReq;
+ *offset = numColsReq;
+ }
+
+ if(numWorkItemsPerXForm >= numBanks || numXFormsPerWG == 1)
+ *midPad = 0;
+ else {
+ int bankNum = ( (numWorkItemsReq + *offset) * Nr ) & (numBanks - 1);
+ if( bankNum >= numWorkItemsPerXForm )
+ *midPad = 0;
+ else
+ *midPad = numWorkItemsPerXForm - bankNum;
+ }
+
+ int lMemSize = ( numWorkItemsReq + *offset) * Nr * numXFormsPerWG + *midPad * (numXFormsPerWG - 1);
+ return lMemSize;
+}
+
+
+static void
+insertLocalStores(string &kernelString, int numIter, int Nr, int numWorkItemsPerXForm, int numWorkItemsReq, int offset, string &comp)
+{
+ int z, k;
+
+ for(z = 0; z < numIter; z++) {
+ for(k = 0; k < Nr; k++) {
+ int index = k*(numWorkItemsReq + offset) + z*numWorkItemsPerXForm;
+ kernelString += string(" lMemStore[") + num2str(index) + string("] = a[") + num2str(z*Nr + k) + string("].") + comp + string(";\n");
+ }
+ }
+ kernelString += string(" barrier(CLK_LOCAL_MEM_FENCE);\n");
+}
+
+static void
+insertLocalLoads(string &kernelString, int n, int Nr, int Nrn, int Nprev, int Ncurr, int numWorkItemsPerXForm, int numWorkItemsReq, int offset, string &comp)
+{
+ int numWorkItemsReqN = n / Nrn;
+ int interBlockHNum = max( Nprev / numWorkItemsPerXForm, 1 );
+ int interBlockHStride = numWorkItemsPerXForm;
+ int vertWidth = max(numWorkItemsPerXForm / Nprev, 1);
+ vertWidth = min( vertWidth, Nr);
+ int vertNum = Nr / vertWidth;
+ int vertStride = ( n / Nr + offset ) * vertWidth;
+ int iter = max( numWorkItemsReqN / numWorkItemsPerXForm, 1);
+ int intraBlockHStride = (numWorkItemsPerXForm / (Nprev*Nr)) > 1 ? (numWorkItemsPerXForm / (Nprev*Nr)) : 1;
+ intraBlockHStride *= Nprev;
+
+ int stride = numWorkItemsReq / Nrn;
+ int i;
+ for(i = 0; i < iter; i++) {
+ int ii = i / (interBlockHNum * vertNum);
+ int zz = i % (interBlockHNum * vertNum);
+ int jj = zz % interBlockHNum;
+ int kk = zz / interBlockHNum;
+ int z;
+ for(z = 0; z < Nrn; z++) {
+ int st = kk * vertStride + jj * interBlockHStride + ii * intraBlockHStride + z * stride;
+ kernelString += string(" a[") + num2str(i*Nrn + z) + string("].") + comp + string(" = lMemLoad[") + num2str(st) + string("];\n");
+ }
+ }
+ kernelString += string(" barrier(CLK_LOCAL_MEM_FENCE);\n");
+}
+
+static void
+insertLocalLoadIndexArithmatic(string &kernelString, int Nprev, int Nr, int numWorkItemsReq, int numWorkItemsPerXForm, int numXFormsPerWG, int offset, int midPad)
+{
+ int Ncurr = Nprev * Nr;
+ int logNcurr = (int)log2(Ncurr);
+ int logNprev = (int)log2(Nprev);
+ int incr = (numWorkItemsReq + offset) * Nr + midPad;
+
+ if(Ncurr < numWorkItemsPerXForm)
+ {
+ if(Nprev == 1)
+ kernelString += string(" j = ii & ") + num2str(Ncurr - 1) + string(";\n");
+ else
+ kernelString += string(" j = (ii & ") + num2str(Ncurr - 1) + string(") >> ") + num2str(logNprev) + string(";\n");
+
+ if(Nprev == 1)
+ kernelString += string(" i = ii >> ") + num2str(logNcurr) + string(";\n");
+ else
+ kernelString += string(" i = mad24(ii >> ") + num2str(logNcurr) + string(", ") + num2str(Nprev) + string(", ii & ") + num2str(Nprev - 1) + string(");\n");
+ }
+ else
+ {
+ if(Nprev == 1)
+ kernelString += string(" j = ii;\n");
+ else
+ kernelString += string(" j = ii >> ") + num2str(logNprev) + string(";\n");
+ if(Nprev == 1)
+ kernelString += string(" i = 0;\n");
+ else
+ kernelString += string(" i = ii & ") + num2str(Nprev - 1) + string(";\n");
+ }
+
+ if(numXFormsPerWG > 1)
+ kernelString += string(" i = mad24(jj, ") + num2str(incr) + string(", i);\n");
+
+ kernelString += string(" lMemLoad = sMem + mad24(j, ") + num2str(numWorkItemsReq + offset) + string(", i);\n");
+}
+
+static void
+insertLocalStoreIndexArithmatic(string &kernelString, int numWorkItemsReq, int numXFormsPerWG, int Nr, int offset, int midPad)
+{
+ if(numXFormsPerWG == 1) {
+ kernelString += string(" lMemStore = sMem + ii;\n");
+ }
+ else {
+ kernelString += string(" lMemStore = sMem + mad24(jj, ") + num2str((numWorkItemsReq + offset)*Nr + midPad) + string(", ii);\n");
+ }
+}
+
+
+static void
+createLocalMemfftKernelString(cl_fft_plan *plan)
+{
+ unsigned int radixArray[10];
+ unsigned int numRadix;
+
+ unsigned int n = plan->n.x;
+
+ assert(n <= plan->max_work_item_per_workgroup * plan->max_radix && "signal lenght too big for local mem fft\n");
+
+ getRadixArray(n, radixArray, &numRadix, 0);
+ assert(numRadix > 0 && "no radix array supplied\n");
+
+ if(n/radixArray[0] > plan->max_work_item_per_workgroup)
+ getRadixArray(n, radixArray, &numRadix, plan->max_radix);
+
+ assert(radixArray[0] <= plan->max_radix && "max radix choosen is greater than allowed\n");
+ assert(n/radixArray[0] <= plan->max_work_item_per_workgroup && "required work items per xform greater than maximum work items allowed per work group for local mem fft\n");
+
+ unsigned int tmpLen = 1;
+ unsigned int i;
+ for(i = 0; i < numRadix; i++)
+ {
+ assert( radixArray[i] && !( (radixArray[i] - 1) & radixArray[i] ) );
+ tmpLen *= radixArray[i];
+ }
+ assert(tmpLen == n && "product of radices choosen doesnt match the length of signal\n");
+
+ int offset, midPad;
+ string localString(""), kernelName("");
+
+ clFFT_DataFormat dataFormat = plan->format;
+ string *kernelString = plan->kernel_string;
+
+
+ cl_fft_kernel_info **kInfo = &plan->kernel_info;
+ int kCount = 0;
+
+ while(*kInfo)
+ {
+ kInfo = &(*kInfo)->next;
+ kCount++;
+ }
+
+ kernelName = string("fft") + num2str(kCount);
+
+ *kInfo = (cl_fft_kernel_info *) malloc(sizeof(cl_fft_kernel_info));
+ (*kInfo)->kernel = 0;
+ (*kInfo)->lmem_size = 0;
+ (*kInfo)->num_workgroups = 0;
+ (*kInfo)->num_workitems_per_workgroup = 0;
+ (*kInfo)->dir = cl_fft_kernel_x;
+ (*kInfo)->in_place_possible = 1;
+ (*kInfo)->next = NULL;
+ (*kInfo)->kernel_name = (char *) malloc(sizeof(char)*(kernelName.size()+1));
+ strcpy((*kInfo)->kernel_name, kernelName.c_str());
+
+ unsigned int numWorkItemsPerXForm = n / radixArray[0];
+ unsigned int numWorkItemsPerWG = numWorkItemsPerXForm <= 64 ? 64 : numWorkItemsPerXForm;
+ assert(numWorkItemsPerWG <= plan->max_work_item_per_workgroup);
+ int numXFormsPerWG = numWorkItemsPerWG / numWorkItemsPerXForm;
+ (*kInfo)->num_workgroups = 1;
+ (*kInfo)->num_xforms_per_workgroup = numXFormsPerWG;
+ (*kInfo)->num_workitems_per_workgroup = numWorkItemsPerWG;
+
+ unsigned int *N = radixArray;
+ unsigned int maxRadix = N[0];
+ unsigned int lMemSize = 0;
+
+ insertVariables(localString, maxRadix);
+
+ lMemSize = insertGlobalLoadsAndTranspose(localString, n, numWorkItemsPerXForm, numXFormsPerWG, maxRadix, plan->min_mem_coalesce_width, dataFormat);
+ (*kInfo)->lmem_size = (lMemSize > (*kInfo)->lmem_size) ? lMemSize : (*kInfo)->lmem_size;
+
+ string xcomp = string("x");
+ string ycomp = string("y");
+
+ unsigned int Nprev = 1;
+ unsigned int len = n;
+ unsigned int r;
+ for(r = 0; r < numRadix; r++)
+ {
+ int numIter = N[0] / N[r];
+ int numWorkItemsReq = n / N[r];
+ int Ncurr = Nprev * N[r];
+ insertfftKernel(localString, N[r], numIter);
+
+ if(r < (numRadix - 1)) {
+ insertTwiddleKernel(localString, N[r], numIter, Nprev, len, numWorkItemsPerXForm);
+ lMemSize = getPadding(numWorkItemsPerXForm, Nprev, numWorkItemsReq, numXFormsPerWG, N[r], plan->num_local_mem_banks, &offset, &midPad);
+ (*kInfo)->lmem_size = (lMemSize > (*kInfo)->lmem_size) ? lMemSize : (*kInfo)->lmem_size;
+ insertLocalStoreIndexArithmatic(localString, numWorkItemsReq, numXFormsPerWG, N[r], offset, midPad);
+ insertLocalLoadIndexArithmatic(localString, Nprev, N[r], numWorkItemsReq, numWorkItemsPerXForm, numXFormsPerWG, offset, midPad);
+ insertLocalStores(localString, numIter, N[r], numWorkItemsPerXForm, numWorkItemsReq, offset, xcomp);
+ insertLocalLoads(localString, n, N[r], N[r+1], Nprev, Ncurr, numWorkItemsPerXForm, numWorkItemsReq, offset, xcomp);
+ insertLocalStores(localString, numIter, N[r], numWorkItemsPerXForm, numWorkItemsReq, offset, ycomp);
+ insertLocalLoads(localString, n, N[r], N[r+1], Nprev, Ncurr, numWorkItemsPerXForm, numWorkItemsReq, offset, ycomp);
+ Nprev = Ncurr;
+ len = len / N[r];
+ }
+ }
+
+ lMemSize = insertGlobalStoresAndTranspose(localString, n, maxRadix, N[numRadix - 1], numWorkItemsPerXForm, numXFormsPerWG, plan->min_mem_coalesce_width, dataFormat);
+ (*kInfo)->lmem_size = (lMemSize > (*kInfo)->lmem_size) ? lMemSize : (*kInfo)->lmem_size;
+
+ insertHeader(*kernelString, kernelName, dataFormat);
+ *kernelString += string("{\n");
+ if((*kInfo)->lmem_size)
+ *kernelString += string(" __local float sMem[") + num2str((*kInfo)->lmem_size) + string("];\n");
+ *kernelString += localString;
+ *kernelString += string("}\n");
+}
+
+// For n larger than what can be computed using local memory fft, global transposes
+// multiple kernel launces is needed. For these sizes, n can be decomposed using
+// much larger base radices i.e. say n = 262144 = 128 x 64 x 32. Thus three kernel
+// launches will be needed, first computing 64 x 32, length 128 ffts, second computing
+// 128 x 32 length 64 ffts, and finally a kernel computing 128 x 64 length 32 ffts.
+// Each of these base radices can futher be divided into factors so that each of these
+// base ffts can be computed within one kernel launch using in-register ffts and local
+// memory transposes i.e for the first kernel above which computes 64 x 32 ffts on length
+// 128, 128 can be decomposed into 128 = 16 x 8 i.e. 8 work items can compute 8 length
+// 16 ffts followed by transpose using local memory followed by each of these eight
+// work items computing 2 length 8 ffts thus computing 16 length 8 ffts in total. This
+// means only 8 work items are needed for computing one length 128 fft. If we choose
+// work group size of say 64, we can compute 64/8 = 8 length 128 ffts within one
+// work group. Since we need to compute 64 x 32 length 128 ffts in first kernel, this
+// means we need to launch 64 x 32 / 8 = 256 work groups with 64 work items in each
+// work group where each work group is computing 8 length 128 ffts where each length
+// 128 fft is computed by 8 work items. Same logic can be applied to other two kernels
+// in this example. Users can play with difference base radices and difference
+// decompositions of base radices to generates different kernels and see which gives
+// best performance. Following function is just fixed to use 128 as base radix
+
+void
+getGlobalRadixInfo(int n, int *radix, int *R1, int *R2, int *numRadices)
+{
+ int baseRadix = min(n, 128);
+
+ int numR = 0;
+ int N = n;
+ while(N > baseRadix)
+ {
+ N /= baseRadix;
+ numR++;
+ }
+
+ for(int i = 0; i < numR; i++)
+ radix[i] = baseRadix;
+
+ radix[numR] = N;
+ numR++;
+ *numRadices = numR;
+
+ for(int i = 0; i < numR; i++)
+ {
+ int B = radix[i];
+ if(B <= 8)
+ {
+ R1[i] = B;
+ R2[i] = 1;
+ continue;
+ }
+
+ int r1 = 2;
+ int r2 = B / r1;
+ while(r2 > r1)
+ {
+ r1 *=2;
+ r2 = B / r1;
+ }
+ R1[i] = r1;
+ R2[i] = r2;
+ }
+}
+
+static void
+createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir dir, int vertBS)
+{
+ int i, j, k, t;
+ int radixArr[10] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+ int R1Arr[10] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+ int R2Arr[10] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+ int radix, R1, R2;
+ int numRadices;
+
+ int maxThreadsPerBlock = plan->max_work_item_per_workgroup;
+ int maxArrayLen = plan->max_radix;
+ int batchSize = plan->min_mem_coalesce_width;
+ clFFT_DataFormat dataFormat = plan->format;
+ int vertical = (dir == cl_fft_kernel_x) ? 0 : 1;
+
+ getGlobalRadixInfo(n, radixArr, R1Arr, R2Arr, &numRadices);
+
+ int numPasses = numRadices;
+
+ string localString(""), kernelName("");
+ string *kernelString = plan->kernel_string;
+ cl_fft_kernel_info **kInfo = &plan->kernel_info;
+ int kCount = 0;
+
+ while(*kInfo)
+ {
+ kInfo = &(*kInfo)->next;
+ kCount++;
+ }
+
+ int N = n;
+ int m = (int)log2(n);
+ int Rinit = vertical ? BS : 1;
+ batchSize = vertical ? min(BS, batchSize) : batchSize;
+ int passNum;
+
+ for(passNum = 0; passNum < numPasses; passNum++)
+ {
+
+ localString.clear();
+ kernelName.clear();
+
+ radix = radixArr[passNum];
+ R1 = R1Arr[passNum];
+ R2 = R2Arr[passNum];
+
+ int strideI = Rinit;
+ for(i = 0; i < numPasses; i++)
+ if(i != passNum)
+ strideI *= radixArr[i];
+
+ int strideO = Rinit;
+ for(i = 0; i < passNum; i++)
+ strideO *= radixArr[i];
+
+ int threadsPerXForm = R2;
+ batchSize = R2 == 1 ? plan->max_work_item_per_workgroup : batchSize;
+ batchSize = min(batchSize, strideI);
+ int threadsPerBlock = batchSize * threadsPerXForm;
+ threadsPerBlock = min(threadsPerBlock, maxThreadsPerBlock);
+ batchSize = threadsPerBlock / threadsPerXForm;
+ assert(R2 <= R1);
+ assert(R1*R2 == radix);
+ assert(R1 <= maxArrayLen);
+ assert(threadsPerBlock <= maxThreadsPerBlock);
+
+ int numIter = R1 / R2;
+ int gInInc = threadsPerBlock / batchSize;
+
+
+ int lgStrideO = (int)log2(strideO);
+ int numBlocksPerXForm = strideI / batchSize;
+ int numBlocks = numBlocksPerXForm;
+ if(!vertical)
+ numBlocks *= BS;
+ else
+ numBlocks *= vertBS;
+
+ kernelName = string("fft") + num2str(kCount);
+ *kInfo = (cl_fft_kernel_info *) malloc(sizeof(cl_fft_kernel_info));
+ (*kInfo)->kernel = 0;
+ if(R2 == 1)
+ (*kInfo)->lmem_size = 0;
+ else
+ {
+ if(strideO == 1)
+ (*kInfo)->lmem_size = (radix + 1)*batchSize;
+ else
+ (*kInfo)->lmem_size = threadsPerBlock*R1;
+ }
+ (*kInfo)->num_workgroups = numBlocks;
+ (*kInfo)->num_xforms_per_workgroup = 1;
+ (*kInfo)->num_workitems_per_workgroup = threadsPerBlock;
+ (*kInfo)->dir = dir;
+ if( (passNum == (numPasses - 1)) && (numPasses & 1) )
+ (*kInfo)->in_place_possible = 1;
+ else
+ (*kInfo)->in_place_possible = 0;
+ (*kInfo)->next = NULL;
+ (*kInfo)->kernel_name = (char *) malloc(sizeof(char)*(kernelName.size()+1));
+ strcpy((*kInfo)->kernel_name, kernelName.c_str());
+
+ insertVariables(localString, R1);
+
+ if(vertical)
+ {
+ localString += string("xNum = groupId >> ") + num2str((int)log2(numBlocksPerXForm)) + string(";\n");
+ localString += string("groupId = groupId & ") + num2str(numBlocksPerXForm - 1) + string(";\n");
+ localString += string("indexIn = mad24(groupId, ") + num2str(batchSize) + string(", xNum << ") + num2str((int)log2(n*BS)) + string(");\n");
+ localString += string("tid = mul24(groupId, ") + num2str(batchSize) + string(");\n");
+ localString += string("i = tid >> ") + num2str(lgStrideO) + string(";\n");
+ localString += string("j = tid & ") + num2str(strideO - 1) + string(";\n");
+ int stride = radix*Rinit;
+ for(i = 0; i < passNum; i++)
+ stride *= radixArr[i];
+ localString += string("indexOut = mad24(i, ") + num2str(stride) + string(", j + ") + string("(xNum << ") + num2str((int) log2(n*BS)) + string("));\n");
+ localString += string("bNum = groupId;\n");
+ }
+ else
+ {
+ int lgNumBlocksPerXForm = (int)log2(numBlocksPerXForm);
+ localString += string("bNum = groupId & ") + num2str(numBlocksPerXForm - 1) + string(";\n");
+ localString += string("xNum = groupId >> ") + num2str(lgNumBlocksPerXForm) + string(";\n");
+ localString += string("indexIn = mul24(bNum, ") + num2str(batchSize) + string(");\n");
+ localString += string("tid = indexIn;\n");
+ localString += string("i = tid >> ") + num2str(lgStrideO) + string(";\n");
+ localString += string("j = tid & ") + num2str(strideO - 1) + string(";\n");
+ int stride = radix*Rinit;
+ for(i = 0; i < passNum; i++)
+ stride *= radixArr[i];
+ localString += string("indexOut = mad24(i, ") + num2str(stride) + string(", j);\n");
+ localString += string("indexIn += (xNum << ") + num2str(m) + string(");\n");
+ localString += string("indexOut += (xNum << ") + num2str(m) + string(");\n");
+ }
+
+ // Load Data
+ int lgBatchSize = (int)log2(batchSize);
+ localString += string("tid = lId;\n");
+ localString += string("i = tid & ") + num2str(batchSize - 1) + string(";\n");
+ localString += string("j = tid >> ") + num2str(lgBatchSize) + string(";\n");
+ localString += string("indexIn += mad24(j, ") + num2str(strideI) + string(", i);\n");
+
+ if(dataFormat == clFFT_SplitComplexFormat)
+ {
+ localString += string("in_real += indexIn;\n");
+ localString += string("in_imag += indexIn;\n");
+ for(j = 0; j < R1; j++)
+ localString += string("a[") + num2str(j) + string("].x = in_real[") + num2str(j*gInInc*strideI) + string("];\n");
+ for(j = 0; j < R1; j++)
+ localString += string("a[") + num2str(j) + string("].y = in_imag[") + num2str(j*gInInc*strideI) + string("];\n");
+ }
+ else
+ {
+ localString += string("in += indexIn;\n");
+ for(j = 0; j < R1; j++)
+ localString += string("a[") + num2str(j) + string("] = in[") + num2str(j*gInInc*strideI) + string("];\n");
+ }
+
+ localString += string("fftKernel") + num2str(R1) + string("(a, dir);\n");
+
+ if(R2 > 1)
+ {
+ // twiddle
+ for(k = 1; k < R1; k++)
+ {
+ localString += string("ang = dir*(2.0f*M_PI*") + num2str(k) + string("/") + num2str(radix) + string(")*j;\n");
+ localString += string("w = (float2)(native_cos(ang), native_sin(ang));\n");
+ localString += string("a[") + num2str(k) + string("] = complexMul(a[") + num2str(k) + string("], w);\n");
+ }
+
+ // shuffle
+ numIter = R1 / R2;
+ localString += string("indexIn = mad24(j, ") + num2str(threadsPerBlock*numIter) + string(", i);\n");
+ localString += string("lMemStore = sMem + tid;\n");
+ localString += string("lMemLoad = sMem + indexIn;\n");
+ for(k = 0; k < R1; k++)
+ localString += string("lMemStore[") + num2str(k*threadsPerBlock) + string("] = a[") + num2str(k) + string("].x;\n");
+ localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
+ for(k = 0; k < numIter; k++)
+ for(t = 0; t < R2; t++)
+ localString += string("a[") + num2str(k*R2+t) + string("].x = lMemLoad[") + num2str(t*batchSize + k*threadsPerBlock) + string("];\n");
+ localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
+ for(k = 0; k < R1; k++)
+ localString += string("lMemStore[") + num2str(k*threadsPerBlock) + string("] = a[") + num2str(k) + string("].y;\n");
+ localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
+ for(k = 0; k < numIter; k++)
+ for(t = 0; t < R2; t++)
+ localString += string("a[") + num2str(k*R2+t) + string("].y = lMemLoad[") + num2str(t*batchSize + k*threadsPerBlock) + string("];\n");
+ localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
+
+ for(j = 0; j < numIter; j++)
+ localString += string("fftKernel") + num2str(R2) + string("(a + ") + num2str(j*R2) + string(", dir);\n");
+ }
+
+ // twiddle
+ if(passNum < (numPasses - 1))
+ {
+ localString += string("l = ((bNum << ") + num2str(lgBatchSize) + string(") + i) >> ") + num2str(lgStrideO) + string(";\n");
+ localString += string("k = j << ") + num2str((int)log2(R1/R2)) + string(";\n");
+ localString += string("ang1 = dir*(2.0f*M_PI/") + num2str(N) + string(")*l;\n");
+ for(t = 0; t < R1; t++)
+ {
+ localString += string("ang = ang1*(k + ") + num2str((t%R2)*R1 + (t/R2)) + string(");\n");
+ localString += string("w = (float2)(native_cos(ang), native_sin(ang));\n");
+ localString += string("a[") + num2str(t) + string("] = complexMul(a[") + num2str(t) + string("], w);\n");
+ }
+ }
+
+ // Store Data
+ if(strideO == 1)
+ {
+
+ localString += string("lMemStore = sMem + mad24(i, ") + num2str(radix + 1) + string(", j << ") + num2str((int)log2(R1/R2)) + string(");\n");
+ localString += string("lMemLoad = sMem + mad24(tid >> ") + num2str((int)log2(radix)) + string(", ") + num2str(radix+1) + string(", tid & ") + num2str(radix-1) + string(");\n");
+
+ for(i = 0; i < R1/R2; i++)
+ for(j = 0; j < R2; j++)
+ localString += string("lMemStore[ ") + num2str(i + j*R1) + string("] = a[") + num2str(i*R2+j) + string("].x;\n");
+ localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
+ if(threadsPerBlock >= radix)
+ {
+ for(i = 0; i < R1; i++)
+ localString += string("a[") + num2str(i) + string("].x = lMemLoad[") + num2str(i*(radix+1)*(threadsPerBlock/radix)) + string("];\n");
+ }
+ else
+ {
+ int innerIter = radix/threadsPerBlock;
+ int outerIter = R1/innerIter;
+ for(i = 0; i < outerIter; i++)
+ for(j = 0; j < innerIter; j++)
+ localString += string("a[") + num2str(i*innerIter+j) + string("].x = lMemLoad[") + num2str(j*threadsPerBlock + i*(radix+1)) + string("];\n");
+ }
+ localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
+
+ for(i = 0; i < R1/R2; i++)
+ for(j = 0; j < R2; j++)
+ localString += string("lMemStore[ ") + num2str(i + j*R1) + string("] = a[") + num2str(i*R2+j) + string("].y;\n");
+ localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
+ if(threadsPerBlock >= radix)
+ {
+ for(i = 0; i < R1; i++)
+ localString += string("a[") + num2str(i) + string("].y = lMemLoad[") + num2str(i*(radix+1)*(threadsPerBlock/radix)) + string("];\n");
+ }
+ else
+ {
+ int innerIter = radix/threadsPerBlock;
+ int outerIter = R1/innerIter;
+ for(i = 0; i < outerIter; i++)
+ for(j = 0; j < innerIter; j++)
+ localString += string("a[") + num2str(i*innerIter+j) + string("].y = lMemLoad[") + num2str(j*threadsPerBlock + i*(radix+1)) + string("];\n");
+ }
+ localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n");
+
+ localString += string("indexOut += tid;\n");
+ if(dataFormat == clFFT_SplitComplexFormat) {
+ localString += string("out_real += indexOut;\n");
+ localString += string("out_imag += indexOut;\n");
+ for(k = 0; k < R1; k++)
+ localString += string("out_real[") + num2str(k*threadsPerBlock) + string("] = a[") + num2str(k) + string("].x;\n");
+ for(k = 0; k < R1; k++)
+ localString += string("out_imag[") + num2str(k*threadsPerBlock) + string("] = a[") + num2str(k) + string("].y;\n");
+ }
+ else {
+ localString += string("out += indexOut;\n");
+ for(k = 0; k < R1; k++)
+ localString += string("out[") + num2str(k*threadsPerBlock) + string("] = a[") + num2str(k) + string("];\n");
+ }
+
+ }
+ else
+ {
+ localString += string("indexOut += mad24(j, ") + num2str(numIter*strideO) + string(", i);\n");
+ if(dataFormat == clFFT_SplitComplexFormat) {
+ localString += string("out_real += indexOut;\n");
+ localString += string("out_imag += indexOut;\n");
+ for(k = 0; k < R1; k++)
+ localString += string("out_real[") + num2str(((k%R2)*R1 + (k/R2))*strideO) + string("] = a[") + num2str(k) + string("].x;\n");
+ for(k = 0; k < R1; k++)
+ localString += string("out_imag[") + num2str(((k%R2)*R1 + (k/R2))*strideO) + string("] = a[") + num2str(k) + string("].y;\n");
+ }
+ else {
+ localString += string("out += indexOut;\n");
+ for(k = 0; k < R1; k++)
+ localString += string("out[") + num2str(((k%R2)*R1 + (k/R2))*strideO) + string("] = a[") + num2str(k) + string("];\n");
+ }
+ }
+
+ insertHeader(*kernelString, kernelName, dataFormat);
+ *kernelString += string("{\n");
+ if((*kInfo)->lmem_size)
+ *kernelString += string(" __local float sMem[") + num2str((*kInfo)->lmem_size) + string("];\n");
+ *kernelString += localString;
+ *kernelString += string("}\n");
+
+ N /= radix;
+ kInfo = &(*kInfo)->next;
+ kCount++;
+ }
+}
+
+void FFT1D(cl_fft_plan *plan, cl_fft_kernel_dir dir)
+{
+ unsigned int radixArray[10];
+ unsigned int numRadix;
+
+ switch(dir)
+ {
+ case cl_fft_kernel_x:
+ if(plan->n.x > plan->max_localmem_fft_size)
+ {
+ createGlobalFFTKernelString(plan, plan->n.x, 1, cl_fft_kernel_x, 1);
+ }
+ else if(plan->n.x > 1)
+ {
+ getRadixArray(plan->n.x, radixArray, &numRadix, 0);
+ if(plan->n.x / radixArray[0] <= plan->max_work_item_per_workgroup)
+ {
+ createLocalMemfftKernelString(plan);
+ }
+ else
+ {
+ getRadixArray(plan->n.x, radixArray, &numRadix, plan->max_radix);
+ if(plan->n.x / radixArray[0] <= plan->max_work_item_per_workgroup)
+ createLocalMemfftKernelString(plan);
+ else
+ createGlobalFFTKernelString(plan, plan->n.x, 1, cl_fft_kernel_x, 1);
+ }
+ }
+ break;
+
+ case cl_fft_kernel_y:
+ if(plan->n.y > 1)
+ createGlobalFFTKernelString(plan, plan->n.y, plan->n.x, cl_fft_kernel_y, 1);
+ break;
+
+ case cl_fft_kernel_z:
+ if(plan->n.z > 1)
+ createGlobalFFTKernelString(plan, plan->n.z, plan->n.x*plan->n.y, cl_fft_kernel_z, 1);
+ default:
+ return;
+ }
+}
+
diff --git a/src/algorithms/libs/fft_setup.cc b/src/algorithms/libs/fft_setup.cc
new file mode 100644
index 000000000..6d2025d66
--- /dev/null
+++ b/src/algorithms/libs/fft_setup.cc
@@ -0,0 +1,402 @@
+
+//
+// File: fft_setup.cpp
+//
+// Version: <1.0>
+//
+// Disclaimer: IMPORTANT: This Apple software is supplied to you by Apple Inc. ("Apple")
+// in consideration of your agreement to the following terms, and your use,
+// installation, modification or redistribution of this Apple software
+// constitutes acceptance of these terms. If you do not agree with these
+// terms, please do not use, install, modify or redistribute this Apple
+// software.
+//
+// In consideration of your agreement to abide by the following terms, and
+// subject to these terms, Apple grants you a personal, non - exclusive
+// license, under Apple's copyrights in this original Apple software ( the
+// "Apple Software" ), to use, reproduce, modify and redistribute the Apple
+// Software, with or without modifications, in source and / or binary forms;
+// provided that if you redistribute the Apple Software in its entirety and
+// without modifications, you must retain this notice and the following text
+// and disclaimers in all such redistributions of the Apple Software. Neither
+// the name, trademarks, service marks or logos of Apple Inc. may be used to
+// endorse or promote products derived from the Apple Software without specific
+// prior written permission from Apple. Except as expressly stated in this
+// notice, no other rights or licenses, express or implied, are granted by
+// Apple herein, including but not limited to any patent rights that may be
+// infringed by your derivative works or by other works in which the Apple
+// Software may be incorporated.
+//
+// The Apple Software is provided by Apple on an "AS IS" basis. APPLE MAKES NO
+// WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED
+// WARRANTIES OF NON - INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A
+// PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION
+// ALONE OR IN COMBINATION WITH YOUR PRODUCTS.
+//
+// IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR
+// CONSEQUENTIAL DAMAGES ( INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION ) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION
+// AND / OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER
+// UNDER THEORY OF CONTRACT, TORT ( INCLUDING NEGLIGENCE ), STRICT LIABILITY OR
+// OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Copyright ( C ) 2008 Apple Inc. All Rights Reserved.
+//
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+#include "fft_internal.h"
+#include "fft_base_kernels.h"
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+using namespace std;
+
+extern void getKernelWorkDimensions(cl_fft_plan *plan, cl_fft_kernel_info *kernelInfo, cl_int *batchSize, size_t *gWorkItems, size_t *lWorkItems);
+
+static void
+getBlockConfigAndKernelString(cl_fft_plan *plan)
+{
+ plan->temp_buffer_needed = 0;
+ *plan->kernel_string += baseKernels;
+
+ if(plan->format == clFFT_SplitComplexFormat)
+ *plan->kernel_string += twistKernelPlannar;
+ else
+ *plan->kernel_string += twistKernelInterleaved;
+
+ switch(plan->dim)
+ {
+ case clFFT_1D:
+ FFT1D(plan, cl_fft_kernel_x);
+ break;
+
+ case clFFT_2D:
+ FFT1D(plan, cl_fft_kernel_x);
+ FFT1D(plan, cl_fft_kernel_y);
+ break;
+
+ case clFFT_3D:
+ FFT1D(plan, cl_fft_kernel_x);
+ FFT1D(plan, cl_fft_kernel_y);
+ FFT1D(plan, cl_fft_kernel_z);
+ break;
+
+ default:
+ return;
+ }
+
+ plan->temp_buffer_needed = 0;
+ cl_fft_kernel_info *kInfo = plan->kernel_info;
+ while(kInfo)
+ {
+ plan->temp_buffer_needed |= !kInfo->in_place_possible;
+ kInfo = kInfo->next;
+ }
+}
+
+
+static void
+deleteKernelInfo(cl_fft_kernel_info *kInfo)
+{
+ if(kInfo)
+ {
+ if(kInfo->kernel_name)
+ free(kInfo->kernel_name);
+ if(kInfo->kernel)
+ clReleaseKernel(kInfo->kernel);
+ free(kInfo);
+ }
+}
+
+static void
+destroy_plan(cl_fft_plan *Plan)
+{
+ cl_fft_kernel_info *kernel_info = Plan->kernel_info;
+
+ while(kernel_info)
+ {
+ cl_fft_kernel_info *tmp = kernel_info->next;
+ deleteKernelInfo(kernel_info);
+ kernel_info = tmp;
+ }
+
+ Plan->kernel_info = NULL;
+
+ if(Plan->kernel_string)
+ {
+ delete Plan->kernel_string;
+ Plan->kernel_string = NULL;
+ }
+ if(Plan->twist_kernel)
+ {
+ clReleaseKernel(Plan->twist_kernel);
+ Plan->twist_kernel = NULL;
+ }
+ if(Plan->program)
+ {
+ clReleaseProgram(Plan->program);
+ Plan->program = NULL;
+ }
+ if(Plan->tempmemobj)
+ {
+ clReleaseMemObject(Plan->tempmemobj);
+ Plan->tempmemobj = NULL;
+ }
+ if(Plan->tempmemobj_real)
+ {
+ clReleaseMemObject(Plan->tempmemobj_real);
+ Plan->tempmemobj_real = NULL;
+ }
+ if(Plan->tempmemobj_imag)
+ {
+ clReleaseMemObject(Plan->tempmemobj_imag);
+ Plan->tempmemobj_imag = NULL;
+ }
+}
+
+static int
+createKernelList(cl_fft_plan *plan)
+{
+ cl_program program = plan->program;
+ cl_fft_kernel_info *kernel_info = plan->kernel_info;
+
+ cl_int err;
+ while(kernel_info)
+ {
+ kernel_info->kernel = clCreateKernel(program, kernel_info->kernel_name, &err);
+ if(!kernel_info->kernel || err != CL_SUCCESS)
+ return err;
+ kernel_info = kernel_info->next;
+ }
+
+ if(plan->format == clFFT_SplitComplexFormat)
+ plan->twist_kernel = clCreateKernel(program, "clFFT_1DTwistSplit", &err);
+ else
+ plan->twist_kernel = clCreateKernel(program, "clFFT_1DTwistInterleaved", &err);
+
+ if(!plan->twist_kernel || err)
+ return err;
+
+ return CL_SUCCESS;
+}
+
+int getMaxKernelWorkGroupSize(cl_fft_plan *plan, unsigned int *max_wg_size, unsigned int num_devices, cl_device_id *devices)
+{
+ int reg_needed = 0;
+ *max_wg_size = std::numeric_limits::max();
+ int err;
+ unsigned wg_size;
+
+ unsigned int i;
+ for(i = 0; i < num_devices; i++)
+ {
+ cl_fft_kernel_info *kInfo = plan->kernel_info;
+ while(kInfo)
+ {
+ err = clGetKernelWorkGroupInfo(kInfo->kernel, devices[i], CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &wg_size, NULL);
+ if(err != CL_SUCCESS)
+ return -1;
+
+ if(wg_size < kInfo->num_workitems_per_workgroup)
+ reg_needed |= 1;
+
+ if(*max_wg_size > wg_size)
+ *max_wg_size = wg_size;
+
+ kInfo = kInfo->next;
+ }
+ }
+
+ return reg_needed;
+}
+
+#define ERR_MACRO(err) { \
+ if( err != CL_SUCCESS) \
+ { \
+ if(error_code) \
+ *error_code = err; \
+ clFFT_DestroyPlan((clFFT_Plan) plan); \
+ return (clFFT_Plan) NULL; \
+ } \
+ }
+
+clFFT_Plan
+clFFT_CreatePlan(cl_context context, clFFT_Dim3 n, clFFT_Dimension dim, clFFT_DataFormat dataFormat, cl_int *error_code )
+{
+ int i;
+ cl_int err;
+ int isPow2 = 1;
+ cl_fft_plan *plan = NULL;
+ ostringstream kString;
+ int num_devices;
+ int gpu_found = 0;
+ cl_device_id devices[16];
+ size_t ret_size;
+ cl_device_type device_type;
+
+ if(!context)
+ ERR_MACRO(CL_INVALID_VALUE);
+
+ isPow2 |= n.x && !( (n.x - 1) & n.x );
+ isPow2 |= n.y && !( (n.y - 1) & n.y );
+ isPow2 |= n.z && !( (n.z - 1) & n.z );
+
+ if(!isPow2)
+ ERR_MACRO(CL_INVALID_VALUE);
+
+ if( (dim == clFFT_1D && (n.y != 1 || n.z != 1)) || (dim == clFFT_2D && n.z != 1) )
+ ERR_MACRO(CL_INVALID_VALUE);
+
+ plan = (cl_fft_plan *) malloc(sizeof(cl_fft_plan));
+ if(!plan)
+ ERR_MACRO(CL_OUT_OF_RESOURCES);
+
+ plan->context = context;
+ clRetainContext(context);
+ plan->n = n;
+ plan->dim = dim;
+ plan->format = dataFormat;
+ plan->kernel_info = 0;
+ plan->num_kernels = 0;
+ plan->twist_kernel = 0;
+ plan->program = 0;
+ plan->temp_buffer_needed = 0;
+ plan->last_batch_size = 0;
+ plan->tempmemobj = 0;
+ plan->tempmemobj_real = 0;
+ plan->tempmemobj_imag = 0;
+ plan->max_localmem_fft_size = 2048;
+ plan->max_work_item_per_workgroup = 256;
+ plan->max_radix = 16;
+ plan->min_mem_coalesce_width = 16;
+ plan->num_local_mem_banks = 16;
+
+patch_kernel_source:
+
+ plan->kernel_string = new string("");
+ if(!plan->kernel_string)
+ ERR_MACRO(CL_OUT_OF_RESOURCES);
+
+ getBlockConfigAndKernelString(plan);
+
+ const char *source_str = plan->kernel_string->c_str();
+ plan->program = clCreateProgramWithSource(context, 1, (const char**) &source_str, NULL, &err);
+ ERR_MACRO(err);
+
+ err = clGetContextInfo(context, CL_CONTEXT_DEVICES, sizeof(devices), devices, &ret_size);
+ ERR_MACRO(err);
+
+ num_devices = (int)(ret_size / sizeof(cl_device_id));
+
+ for(i = 0; i < num_devices; i++)
+ {
+ err = clGetDeviceInfo(devices[i], CL_DEVICE_TYPE, sizeof(device_type), &device_type, NULL);
+ ERR_MACRO(err);
+
+ if(device_type == CL_DEVICE_TYPE_GPU)
+ {
+ gpu_found = 1;
+ err = clBuildProgram(plan->program, 1, &devices[i], "-cl-mad-enable", NULL, NULL);
+ if (err != CL_SUCCESS)
+ {
+ char *build_log;
+ char devicename[200];
+ size_t log_size;
+
+ err = clGetProgramBuildInfo(plan->program, devices[i], CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
+ ERR_MACRO(err);
+
+ build_log = (char *) malloc(log_size + 1);
+
+ err = clGetProgramBuildInfo(plan->program, devices[i], CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL);
+ ERR_MACRO(err);
+
+ err = clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(devicename), devicename, NULL);
+ ERR_MACRO(err);
+
+ fprintf(stdout, "FFT program build log on device %s\n", devicename);
+ fprintf(stdout, "%s\n", build_log);
+ free(build_log);
+
+ ERR_MACRO(err);
+ }
+ }
+ }
+
+ if(!gpu_found)
+ ERR_MACRO(CL_INVALID_CONTEXT);
+
+ err = createKernelList(plan);
+ ERR_MACRO(err);
+
+ // we created program and kernels based on "some max work group size (default 256)" ... this work group size
+ // may be larger than what kernel may execute with ... if thats the case we need to regenerate the kernel source
+ // setting this as limit i.e max group size and rebuild.
+ unsigned int max_kernel_wg_size;
+ int patching_req = getMaxKernelWorkGroupSize(plan, &max_kernel_wg_size, num_devices, devices);
+ if(patching_req == -1)
+ {
+ ERR_MACRO(err);
+ }
+
+ if(patching_req)
+ {
+ destroy_plan(plan);
+ plan->max_work_item_per_workgroup = max_kernel_wg_size;
+ goto patch_kernel_source;
+ }
+
+ cl_fft_kernel_info *kInfo = plan->kernel_info;
+ while(kInfo)
+ {
+ plan->num_kernels++;
+ kInfo = kInfo->next;
+ }
+
+ if(error_code)
+ *error_code = CL_SUCCESS;
+
+ return (clFFT_Plan) plan;
+}
+
+void
+clFFT_DestroyPlan(clFFT_Plan plan)
+{
+ cl_fft_plan *Plan = (cl_fft_plan *) plan;
+ if(Plan)
+ {
+ destroy_plan(Plan);
+ clReleaseContext(Plan->context);
+ free(Plan);
+ }
+}
+
+void clFFT_DumpPlan( clFFT_Plan Plan, FILE *file)
+{
+ size_t gDim, lDim;
+ FILE *out;
+ if(!file)
+ out = stdout;
+ else
+ out = file;
+
+ cl_fft_plan *plan = (cl_fft_plan *) Plan;
+ cl_fft_kernel_info *kInfo = plan->kernel_info;
+
+ while(kInfo)
+ {
+ cl_int s = 1;
+ getKernelWorkDimensions(plan, kInfo, &s, &gDim, &lDim);
+ fprintf(out, "Run kernel %s with global dim = {%zd*BatchSize}, local dim={%zd}\n", kInfo->kernel_name, gDim, lDim);
+ kInfo = kInfo->next;
+ }
+ fprintf(out, "%s\n", plan->kernel_string->c_str());
+}
diff --git a/src/algorithms/libs/galileo_e1_signal_processing.cc b/src/algorithms/libs/galileo_e1_signal_processing.cc
index 9a0c2f895..2ad474e90 100644
--- a/src/algorithms/libs/galileo_e1_signal_processing.cc
+++ b/src/algorithms/libs/galileo_e1_signal_processing.cc
@@ -158,7 +158,7 @@ galileo_e1_code_gen_complex_sampled(std::complex* _dest, char _Signal[3],
std::string _galileo_signal = _Signal;
unsigned int _samplesPerCode;
- const unsigned int _codeFreqBasis = Galileo_E1_CODE_CHIP_RATE_HZ; //Hz
+ const int _codeFreqBasis = Galileo_E1_CODE_CHIP_RATE_HZ; //Hz
unsigned int _codeLength = Galileo_E1_B_CODE_LENGTH_CHIPS;
int primary_code_E1_chips[(int)Galileo_E1_B_CODE_LENGTH_CHIPS];
_samplesPerCode = round(_fs / (_codeFreqBasis / _codeLength));
diff --git a/src/core/receiver/gnss_block_factory.cc b/src/core/receiver/gnss_block_factory.cc
index 6d83544d3..1050991cb 100644
--- a/src/core/receiver/gnss_block_factory.cc
+++ b/src/core/receiver/gnss_block_factory.cc
@@ -54,6 +54,7 @@
#include "fir_filter.h"
#include "freq_xlating_fir_filter.h"
#include "gps_l1_ca_pcps_acquisition.h"
+#include "gps_l1_ca_pcps_multithread_acquisition.h"
#include "gps_l1_ca_pcps_tong_acquisition.h"
#include "gps_l1_ca_pcps_assisted_acquisition.h"
#include "gps_l1_ca_pcps_acquisition_fine_doppler.h"
@@ -74,6 +75,10 @@
#include "gps_l1_ca_pvt.h"
#include "galileo_e1_pvt.h"
+#if OPENCL
+ #include "gps_l1_ca_pcps_opencl_acquisition.h"
+#endif
+
#if GN3S_DRIVER
#include "gn3s_signal_source.h"
#endif
@@ -346,9 +351,18 @@ GNSSBlockInterface* GNSSBlockFactory::GetBlock(
}
else if (implementation.compare("GPS_L1_CA_PCPS_Multithread_Acquisition") == 0)
{
- block = new GpsL1CaPcpsAcquisition(configuration, role, in_streams,
+ block = new GpsL1CaPcpsMultithreadAcquisition(configuration, role, in_streams,
out_streams, queue);
}
+
+#if OPENCL
+ else if (implementation.compare("GPS_L1_CA_PCPS_OpenCl_Acquisition") == 0)
+ {
+ block = new GpsL1CaPcpsOpenClAcquisition(configuration, role, in_streams,
+ out_streams, queue);
+ }
+#endif
+
else if (implementation.compare("GPS_L1_CA_PCPS_Acquisition_Fine_Doppler") == 0)
{
block = new GpsL1CaPcpsAcquisitionFineDoppler(configuration, role, in_streams,
diff --git a/src/tests/CMakeLists.txt b/src/tests/CMakeLists.txt
index 8dbb1cc4c..04d079c46 100644
--- a/src/tests/CMakeLists.txt
+++ b/src/tests/CMakeLists.txt
@@ -129,6 +129,7 @@ add_executable(gnss_block_test EXCLUDE_FROM_ALL
${CMAKE_CURRENT_SOURCE_DIR}/gnss_block/gps_l1_ca_pcps_acquisition_test.cc
${CMAKE_CURRENT_SOURCE_DIR}/gnss_block/gps_l1_ca_pcps_acquisition_gsoc2013_test.cc
${CMAKE_CURRENT_SOURCE_DIR}/gnss_block/gps_l1_ca_pcps_multithread_acquisition_gsoc2013_test.cc
+ ${CMAKE_CURRENT_SOURCE_DIR}/gnss_block/gps_l1_ca_pcps_opencl_acquisition_gsoc2013_test.cc
${CMAKE_CURRENT_SOURCE_DIR}/gnss_block/gps_l1_ca_pcps_tong_acquisition_gsoc2013_test.cc
${CMAKE_CURRENT_SOURCE_DIR}/gnss_block/galileo_e1_pcps_ambiguous_acquisition_test.cc
${CMAKE_CURRENT_SOURCE_DIR}/gnss_block/galileo_e1_pcps_ambiguous_acquisition_gsoc_test.cc
diff --git a/src/tests/gnss_block/galileo_e1_pcps_8ms_ambiguous_acquisition_gsoc2013_test.cc b/src/tests/gnss_block/galileo_e1_pcps_8ms_ambiguous_acquisition_gsoc2013_test.cc
index d8272f0f1..041665ee6 100644
--- a/src/tests/gnss_block/galileo_e1_pcps_8ms_ambiguous_acquisition_gsoc2013_test.cc
+++ b/src/tests/gnss_block/galileo_e1_pcps_8ms_ambiguous_acquisition_gsoc2013_test.cc
@@ -238,14 +238,14 @@ void GalileoE1Pcps8msAmbiguousAcquisitionGSoC2013Test::config_2()
config->set_property("SignalSource.doppler_Hz_1", "1000");
config->set_property("SignalSource.delay_chips_1", "100");
- config->set_property("SignalSource.system_2", "G");
- config->set_property("SignalSource.PRN_2", "10");
+ config->set_property("SignalSource.system_2", "E");
+ config->set_property("SignalSource.PRN_2", "21");
config->set_property("SignalSource.CN0_dB_2", "44");
config->set_property("SignalSource.doppler_Hz_2", "2000");
config->set_property("SignalSource.delay_chips_2", "200");
- config->set_property("SignalSource.system_3", "G");
- config->set_property("SignalSource.PRN_3", "20");
+ config->set_property("SignalSource.system_3", "E");
+ config->set_property("SignalSource.PRN_3", "22");
config->set_property("SignalSource.CN0_dB_3", "44");
config->set_property("SignalSource.doppler_Hz_3", "3000");
config->set_property("SignalSource.delay_chips_3", "300");
@@ -279,7 +279,7 @@ void GalileoE1Pcps8msAmbiguousAcquisitionGSoC2013Test::config_2()
std::to_string(integration_time_ms));
config->set_property("Acquisition.max_dwells", "1");
config->set_property("Acquisition.implementation", "Galileo_E1_PCPS_8ms_Ambiguous_Acquisition");
- config->set_property("Acquisition.pfa", "1e-1");
+ config->set_property("Acquisition.pfa", "0.1");
config->set_property("Acquisition.doppler_max", "10000");
config->set_property("Acquisition.doppler_step", "250");
config->set_property("Acquisition.dump", "false");
diff --git a/src/tests/gnss_block/galileo_e1_pcps_ambiguous_acquisition_gsoc2013_test.cc b/src/tests/gnss_block/galileo_e1_pcps_ambiguous_acquisition_gsoc2013_test.cc
index 8229a4596..00be9fb53 100644
--- a/src/tests/gnss_block/galileo_e1_pcps_ambiguous_acquisition_gsoc2013_test.cc
+++ b/src/tests/gnss_block/galileo_e1_pcps_ambiguous_acquisition_gsoc2013_test.cc
@@ -242,14 +242,14 @@ void GalileoE1PcpsAmbiguousAcquisitionGSoC2013Test::config_2()
config->set_property("SignalSource.doppler_Hz_1", "1000");
config->set_property("SignalSource.delay_chips_1", "100");
- config->set_property("SignalSource.system_2", "G");
- config->set_property("SignalSource.PRN_2", "10");
+ config->set_property("SignalSource.system_2", "E");
+ config->set_property("SignalSource.PRN_2", "21");
config->set_property("SignalSource.CN0_dB_2", "44");
config->set_property("SignalSource.doppler_Hz_2", "2000");
config->set_property("SignalSource.delay_chips_2", "200");
- config->set_property("SignalSource.system_3", "G");
- config->set_property("SignalSource.PRN_3", "20");
+ config->set_property("SignalSource.system_3", "E");
+ config->set_property("SignalSource.PRN_3", "22");
config->set_property("SignalSource.CN0_dB_3", "44");
config->set_property("SignalSource.doppler_Hz_3", "3000");
config->set_property("SignalSource.delay_chips_3", "300");
@@ -284,7 +284,7 @@ void GalileoE1PcpsAmbiguousAcquisitionGSoC2013Test::config_2()
config->set_property("Acquisition.max_dwells", "1");
config->set_property("Acquisition.bit_transition_flag","false");
config->set_property("Acquisition.implementation", "Galileo_E1_PCPS_Ambiguous_Acquisition");
- config->set_property("Acquisition.pfa", "1e-1");
+ config->set_property("Acquisition.pfa", "0.1");
config->set_property("Acquisition.doppler_max", "10000");
config->set_property("Acquisition.doppler_step", "250");
config->set_property("Acquisition.dump", "false");
diff --git a/src/tests/gnss_block/galileo_e1_pcps_cccwsr_ambiguous_acquisition_gsoc2013_test.cc b/src/tests/gnss_block/galileo_e1_pcps_cccwsr_ambiguous_acquisition_gsoc2013_test.cc
index 99cbe8f52..cddb19a39 100644
--- a/src/tests/gnss_block/galileo_e1_pcps_cccwsr_ambiguous_acquisition_gsoc2013_test.cc
+++ b/src/tests/gnss_block/galileo_e1_pcps_cccwsr_ambiguous_acquisition_gsoc2013_test.cc
@@ -240,14 +240,14 @@ void GalileoE1PcpsCccwsrAmbiguousAcquisitionTest::config_2()
config->set_property("SignalSource.doppler_Hz_1", "1000");
config->set_property("SignalSource.delay_chips_1", "100");
- config->set_property("SignalSource.system_2", "G");
- config->set_property("SignalSource.PRN_2", "10");
+ config->set_property("SignalSource.system_2", "E");
+ config->set_property("SignalSource.PRN_2", "21");
config->set_property("SignalSource.CN0_dB_2", "44");
config->set_property("SignalSource.doppler_Hz_2", "2000");
config->set_property("SignalSource.delay_chips_2", "200");
- config->set_property("SignalSource.system_3", "G");
- config->set_property("SignalSource.PRN_3", "20");
+ config->set_property("SignalSource.system_3", "E");
+ config->set_property("SignalSource.PRN_3", "22");
config->set_property("SignalSource.CN0_dB_3", "44");
config->set_property("SignalSource.doppler_Hz_3", "3000");
config->set_property("SignalSource.delay_chips_3", "300");
@@ -281,7 +281,7 @@ void GalileoE1PcpsCccwsrAmbiguousAcquisitionTest::config_2()
std::to_string(integration_time_ms));
config->set_property("Acquisition.max_dwells", "1");
config->set_property("Acquisition.implementation", "Galileo_E1_PCPS_CCCWSR_Ambiguous_Acquisition");
- config->set_property("Acquisition.threshold", "0.0025");
+ config->set_property("Acquisition.threshold", "0.00215"); // Pfa,a = 0.1
config->set_property("Acquisition.doppler_max", "10000");
config->set_property("Acquisition.doppler_step", "250");
config->set_property("Acquisition.dump", "false");
@@ -536,7 +536,7 @@ TEST_F(GalileoE1PcpsCccwsrAmbiguousAcquisitionTest, ValidationOfResultsProbabili
top_block->connect(signal_source->get_right_block(), 0, acquisition->get_left_block(), 0);
}) << "Failure connecting the blocks of acquisition test." << std::endl;
- std::cout << "Probability of false alarm (target) = " << 0.0065 << std::endl;
+ std::cout << "Probability of false alarm (target) = " << 0.1 << std::endl;
// i = 0 --> sallite in acquisition is visible (prob of detection and prob of detection with wrong estimation)
// i = 1 --> satellite in acquisition is not visible (prob of false detection)
diff --git a/src/tests/gnss_block/galileo_e1_pcps_tong_ambiguous_acquisition_gsoc2013_test.cc b/src/tests/gnss_block/galileo_e1_pcps_tong_ambiguous_acquisition_gsoc2013_test.cc
index 7db6cf877..86aa1cc08 100644
--- a/src/tests/gnss_block/galileo_e1_pcps_tong_ambiguous_acquisition_gsoc2013_test.cc
+++ b/src/tests/gnss_block/galileo_e1_pcps_tong_ambiguous_acquisition_gsoc2013_test.cc
@@ -191,8 +191,8 @@ void GalileoE1PcpsTongAmbiguousAcquisitionGSoC2013Test::config_1()
config->set_property("Acquisition.if", "0");
config->set_property("Acquisition.coherent_integration_time_ms",
std::to_string(integration_time_ms));
- config->set_property("Acquisition.tong_init_val", "5");
- config->set_property("Acquisition.tong_max_val", "10");
+ config->set_property("Acquisition.tong_init_val", "1");
+ config->set_property("Acquisition.tong_max_val", "8");
config->set_property("Acquisition.implementation", "Galileo_E1_PCPS_Tong_Ambiguous_Acquisition");
config->set_property("Acquisition.threshold", "0.3");
config->set_property("Acquisition.doppler_max", "10000");
@@ -241,14 +241,14 @@ void GalileoE1PcpsTongAmbiguousAcquisitionGSoC2013Test::config_2()
config->set_property("SignalSource.doppler_Hz_1", "1000");
config->set_property("SignalSource.delay_chips_1", "100");
- config->set_property("SignalSource.system_2", "G");
- config->set_property("SignalSource.PRN_2", "10");
+ config->set_property("SignalSource.system_2", "E");
+ config->set_property("SignalSource.PRN_2", "21");
config->set_property("SignalSource.CN0_dB_2", "44");
config->set_property("SignalSource.doppler_Hz_2", "2000");
config->set_property("SignalSource.delay_chips_2", "200");
- config->set_property("SignalSource.system_3", "G");
- config->set_property("SignalSource.PRN_3", "20");
+ config->set_property("SignalSource.system_3", "E");
+ config->set_property("SignalSource.PRN_3", "22");
config->set_property("SignalSource.CN0_dB_3", "44");
config->set_property("SignalSource.doppler_Hz_3", "3000");
config->set_property("SignalSource.delay_chips_3", "300");
@@ -280,10 +280,10 @@ void GalileoE1PcpsTongAmbiguousAcquisitionGSoC2013Test::config_2()
config->set_property("Acquisition.if", "0");
config->set_property("Acquisition.coherent_integration_time_ms",
std::to_string(integration_time_ms));
- config->set_property("Acquisition.tong_init_val", "5");
- config->set_property("Acquisition.tong_max_val", "10");
+ config->set_property("Acquisition.tong_init_val", "1");
+ config->set_property("Acquisition.tong_max_val", "8");
config->set_property("Acquisition.implementation", "Galileo_E1_PCPS_Tong_Ambiguous_Acquisition");
- config->set_property("Acquisition.threshold", "0.0005");
+ config->set_property("Acquisition.threshold", "0.00028"); // Pfa,a = 0.1
config->set_property("Acquisition.doppler_max", "10000");
config->set_property("Acquisition.doppler_step", "250");
config->set_property("Acquisition.dump", "false");
@@ -536,7 +536,7 @@ TEST_F(GalileoE1PcpsTongAmbiguousAcquisitionGSoC2013Test, ValidationOfResultsPro
top_block->connect(signal_source->get_right_block(), 0, acquisition->get_left_block(), 0);
}) << "Failure connecting the blocks of acquisition test." << std::endl;
- std::cout << "Probability of false alarm (target) = " << 0.0 << std::endl;
+ std::cout << "Probability of false alarm (target) = " << 0.1 << std::endl;
// i = 0 --> sallite in acquisition is visible (prob of detection and prob of detection with wrong estimation)
// i = 1 --> satellite in acquisition is not visible (prob of false detection)
diff --git a/src/tests/gnss_block/gps_l1_ca_pcps_acquisition_gsoc2013_test.cc b/src/tests/gnss_block/gps_l1_ca_pcps_acquisition_gsoc2013_test.cc
index 2a8c4e009..6c2ea1833 100644
--- a/src/tests/gnss_block/gps_l1_ca_pcps_acquisition_gsoc2013_test.cc
+++ b/src/tests/gnss_block/gps_l1_ca_pcps_acquisition_gsoc2013_test.cc
@@ -238,13 +238,13 @@ void GpsL1CaPcpsAcquisitionGSoC2013Test::config_2()
config->set_property("SignalSource.doppler_Hz_1", "1000");
config->set_property("SignalSource.delay_chips_1", "100");
- config->set_property("SignalSource.system_2", "E");
+ config->set_property("SignalSource.system_2", "G");
config->set_property("SignalSource.PRN_2", "21");
config->set_property("SignalSource.CN0_dB_2", "44");
config->set_property("SignalSource.doppler_Hz_2", "2000");
config->set_property("SignalSource.delay_chips_2", "200");
- config->set_property("SignalSource.system_3", "E");
+ config->set_property("SignalSource.system_3", "G");
config->set_property("SignalSource.PRN_3", "22");
config->set_property("SignalSource.CN0_dB_3", "44");
config->set_property("SignalSource.doppler_Hz_3", "3000");
@@ -279,7 +279,7 @@ void GpsL1CaPcpsAcquisitionGSoC2013Test::config_2()
std::to_string(integration_time_ms));
config->set_property("Acquisition.max_dwells", "1");
config->set_property("Acquisition.implementation", "GPS_L1_CA_PCPS_Acquisition");
- config->set_property("Acquisition.pfa", "1e-1");
+ config->set_property("Acquisition.pfa", "0.1");
config->set_property("Acquisition.doppler_max", "10000");
config->set_property("Acquisition.doppler_step", "250");
config->set_property("Acquisition.bit_transition_flag", "false");
diff --git a/src/tests/gnss_block/gps_l1_ca_pcps_multithread_acquisition_gsoc2013_test.cc b/src/tests/gnss_block/gps_l1_ca_pcps_multithread_acquisition_gsoc2013_test.cc
index 4a1917af5..a65aa39c3 100644
--- a/src/tests/gnss_block/gps_l1_ca_pcps_multithread_acquisition_gsoc2013_test.cc
+++ b/src/tests/gnss_block/gps_l1_ca_pcps_multithread_acquisition_gsoc2013_test.cc
@@ -1,5 +1,5 @@
/*!
- * \file gps_l1_ca_pcps_acquisition_gsoc2013_test.cc
+ * \file gps_l1_ca_pcps_multithread_acquisition_gsoc2013_test.cc
* \brief This class implements an acquisition test for
* GpsL1CaPcpsMultithreadAcquisition class.
* \author Marc Molina, 2013. marc.molina.pena(at)gmail.com
@@ -205,7 +205,7 @@ void GpsL1CaPcpsMultithreadAcquisitionGSoC2013Test::config_2()
std::string signal = "1C";
signal.copy(gnss_synchro.Signal,2,0);
- integration_time_ms = 4;
+ integration_time_ms = 1;
fs_in = 4e6;
expected_delay_chips = 600;
@@ -237,13 +237,13 @@ void GpsL1CaPcpsMultithreadAcquisitionGSoC2013Test::config_2()
config->set_property("SignalSource.doppler_Hz_1", "1000");
config->set_property("SignalSource.delay_chips_1", "100");
- config->set_property("SignalSource.system_2", "E");
+ config->set_property("SignalSource.system_2", "G");
config->set_property("SignalSource.PRN_2", "21");
config->set_property("SignalSource.CN0_dB_2", "44");
config->set_property("SignalSource.doppler_Hz_2", "2000");
config->set_property("SignalSource.delay_chips_2", "200");
- config->set_property("SignalSource.system_3", "E");
+ config->set_property("SignalSource.system_3", "G");
config->set_property("SignalSource.PRN_3", "22");
config->set_property("SignalSource.CN0_dB_3", "44");
config->set_property("SignalSource.doppler_Hz_3", "3000");
@@ -278,10 +278,10 @@ void GpsL1CaPcpsMultithreadAcquisitionGSoC2013Test::config_2()
std::to_string(integration_time_ms));
config->set_property("Acquisition.max_dwells", "1");
config->set_property("Acquisition.implementation", "GPS_L1_CA_PCPS_Multithread_Acquisition");
- config->set_property("Acquisition.pfa", "1e-1");
+ config->set_property("Acquisition.pfa", "0.1");
config->set_property("Acquisition.doppler_max", "10000");
config->set_property("Acquisition.doppler_step", "250");
- config->set_property("Acquisition.bit_transition_flag", "true");
+ config->set_property("Acquisition.bit_transition_flag", "false");
config->set_property("Acquisition.dump", "false");
}
diff --git a/src/tests/gnss_block/gps_l1_ca_pcps_opencl_acquisition_gsoc2013_test.cc b/src/tests/gnss_block/gps_l1_ca_pcps_opencl_acquisition_gsoc2013_test.cc
new file mode 100644
index 000000000..a84013571
--- /dev/null
+++ b/src/tests/gnss_block/gps_l1_ca_pcps_opencl_acquisition_gsoc2013_test.cc
@@ -0,0 +1,579 @@
+/*!
+ * \file gps_l1_ca_pcps_opencl_acquisition_gsoc2013_test.cc
+ * \brief This class implements an acquisition test for
+ * GpsL1CaPcpsOpenClAcquisition class.
+ * \author Marc Molina, 2013. marc.molina.pena(at)gmail.com
+ *
+ *
+ * -------------------------------------------------------------------------
+ *
+ * Copyright (C) 2010-2012 (see AUTHORS file for a list of contributors)
+ *
+ * GNSS-SDR is a software defined Global Navigation
+ * Satellite Systems receiver
+ *
+ * This file is part of GNSS-SDR.
+ *
+ * GNSS-SDR is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * at your option) any later version.
+ *
+ * GNSS-SDR is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNSS-SDR. If not, see .
+ *
+ * -------------------------------------------------------------------------
+ */
+
+
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "gnss_block_interface.h"
+#include "in_memory_configuration.h"
+#include "configuration_interface.h"
+#include "gnss_synchro.h"
+#include "gps_l1_ca_pcps_opencl_acquisition.h"
+#include "signal_generator.h"
+//#include "signal_generator.cc"
+#include "signal_generator_c.h"
+//#include "signal_generator_c.cc"
+#include "fir_filter.h"
+#include "gen_signal_source.h"
+#include "gnss_sdr_valve.h"
+#include "boost/shared_ptr.hpp"
+
+
+class GpsL1CaPcpsOpenClAcquisitionGSoC2013Test: public ::testing::Test
+{
+protected:
+ GpsL1CaPcpsOpenClAcquisitionGSoC2013Test()
+ {
+ queue = gr::msg_queue::make(0);
+ top_block = gr::make_top_block("Acquisition test");
+ item_size = sizeof(gr_complex);
+ stop = false;
+ message = 0;
+ }
+
+ ~GpsL1CaPcpsOpenClAcquisitionGSoC2013Test()
+ {
+ }
+
+ void init();
+ void config_1();
+ void config_2();
+ void start_queue();
+ void wait_message();
+ void process_message();
+ void stop_queue();
+
+ gr::msg_queue::sptr queue;
+ gr::top_block_sptr top_block;
+ GpsL1CaPcpsOpenClAcquisition *acquisition;
+ InMemoryConfiguration* config;
+ Gnss_Synchro gnss_synchro;
+ size_t item_size;
+ concurrent_queue channel_internal_queue;
+ bool stop;
+ int message;
+ boost::thread ch_thread;
+
+ unsigned int integration_time_ms;
+ unsigned int fs_in;
+
+ double expected_delay_chips;
+ double expected_doppler_hz;
+ float max_doppler_error_hz;
+ float max_delay_error_chips;
+
+ unsigned int num_of_realizations;
+ unsigned int realization_counter;
+ unsigned int detection_counter;
+ unsigned int correct_estimation_counter;
+ unsigned int acquired_samples;
+ unsigned int mean_acq_time_us;
+
+ double mse_doppler;
+ double mse_delay;
+
+ double Pd;
+ double Pfa_p;
+ double Pfa_a;
+};
+
+void GpsL1CaPcpsOpenClAcquisitionGSoC2013Test::init()
+{
+ message = 0;
+ realization_counter = 0;
+ detection_counter = 0;
+ correct_estimation_counter = 0;
+ acquired_samples = 0;
+ mse_doppler = 0;
+ mse_delay = 0;
+ mean_acq_time_us = 0;
+ Pd = 0;
+ Pfa_p = 0;
+ Pfa_a = 0;
+}
+
+void GpsL1CaPcpsOpenClAcquisitionGSoC2013Test::config_1()
+{
+ gnss_synchro.Channel_ID = 0;
+ gnss_synchro.System = 'G';
+ std::string signal = "1C";
+ signal.copy(gnss_synchro.Signal,2,0);
+
+ integration_time_ms = 1;
+ fs_in = 4e6;
+
+ expected_delay_chips = 600;
+ expected_doppler_hz = 750;
+ max_doppler_error_hz = 2/(3*integration_time_ms*1e-3);
+ max_delay_error_chips = 0.50;
+
+ num_of_realizations = 1;
+
+ config = new InMemoryConfiguration();
+
+ config->set_property("GNSS-SDR.internal_fs_hz", std::to_string(fs_in));
+
+ config->set_property("SignalSource.fs_hz", std::to_string(fs_in));
+
+ config->set_property("SignalSource.item_type", "gr_complex");
+
+ config->set_property("SignalSource.num_satellites", "1");
+
+ config->set_property("SignalSource.system_0", "G");
+ config->set_property("SignalSource.PRN_0", "10");
+ config->set_property("SignalSource.CN0_dB_0", "44");
+ config->set_property("SignalSource.doppler_Hz_0", std::to_string(expected_doppler_hz));
+ config->set_property("SignalSource.delay_chips_0", std::to_string(expected_delay_chips));
+
+ config->set_property("SignalSource.noise_flag", "false");
+ config->set_property("SignalSource.data_flag", "false");
+ config->set_property("SignalSource.BW_BB", "0.97");
+
+ config->set_property("InputFilter.implementation", "Fir_Filter");
+ config->set_property("InputFilter.input_item_type", "gr_complex");
+ config->set_property("InputFilter.output_item_type", "gr_complex");
+ config->set_property("InputFilter.taps_item_type", "float");
+ config->set_property("InputFilter.number_of_taps", "11");
+ config->set_property("InputFilter.number_of_bands", "2");
+ config->set_property("InputFilter.band1_begin", "0.0");
+ config->set_property("InputFilter.band1_end", "0.97");
+ config->set_property("InputFilter.band2_begin", "0.98");
+ config->set_property("InputFilter.band2_end", "1.0");
+ config->set_property("InputFilter.ampl1_begin", "1.0");
+ config->set_property("InputFilter.ampl1_end", "1.0");
+ config->set_property("InputFilter.ampl2_begin", "0.0");
+ config->set_property("InputFilter.ampl2_end", "0.0");
+ config->set_property("InputFilter.band1_error", "1.0");
+ config->set_property("InputFilter.band2_error", "1.0");
+ config->set_property("InputFilter.filter_type", "bandpass");
+ config->set_property("InputFilter.grid_density", "16");
+
+ config->set_property("Acquisition.item_type", "gr_complex");
+ config->set_property("Acquisition.if", "0");
+ config->set_property("Acquisition.coherent_integration_time_ms",
+ std::to_string(integration_time_ms));
+ config->set_property("Acquisition.max_dwells", "1");
+ config->set_property("Acquisition.implementation", "GPS_L1_CA_PCPS_OpenCl_Acquisition");
+ config->set_property("Acquisition.threshold", "0.8");
+ config->set_property("Acquisition.doppler_max", "10000");
+ config->set_property("Acquisition.doppler_step", "250");
+ config->set_property("Acquisition.bit_transition_flag", "false");
+ config->set_property("Acquisition.dump", "false");
+}
+
+void GpsL1CaPcpsOpenClAcquisitionGSoC2013Test::config_2()
+{
+ gnss_synchro.Channel_ID = 0;
+ gnss_synchro.System = 'G';
+ std::string signal = "1C";
+ signal.copy(gnss_synchro.Signal,2,0);
+
+ integration_time_ms = 1;
+ fs_in = 4e6;
+
+ expected_delay_chips = 600;
+ expected_doppler_hz = 750;
+ max_doppler_error_hz = 2/(3*integration_time_ms*1e-3);
+ max_delay_error_chips = 0.50;
+
+ num_of_realizations = 100;
+
+ config = new InMemoryConfiguration();
+
+ config->set_property("GNSS-SDR.internal_fs_hz", std::to_string(fs_in));
+
+ config->set_property("SignalSource.fs_hz", std::to_string(fs_in));
+
+ config->set_property("SignalSource.item_type", "gr_complex");
+
+ config->set_property("SignalSource.num_satellites", "4");
+
+ config->set_property("SignalSource.system_0", "G");
+ config->set_property("SignalSource.PRN_0", "10");
+ config->set_property("SignalSource.CN0_dB_0", "44");
+ config->set_property("SignalSource.doppler_Hz_0", std::to_string(expected_doppler_hz));
+ config->set_property("SignalSource.delay_chips_0", std::to_string(expected_delay_chips));
+
+ config->set_property("SignalSource.system_1", "G");
+ config->set_property("SignalSource.PRN_1", "15");
+ config->set_property("SignalSource.CN0_dB_1", "44");
+ config->set_property("SignalSource.doppler_Hz_1", "1000");
+ config->set_property("SignalSource.delay_chips_1", "100");
+
+ config->set_property("SignalSource.system_2", "G");
+ config->set_property("SignalSource.PRN_2", "21");
+ config->set_property("SignalSource.CN0_dB_2", "44");
+ config->set_property("SignalSource.doppler_Hz_2", "2000");
+ config->set_property("SignalSource.delay_chips_2", "200");
+
+ config->set_property("SignalSource.system_3", "G");
+ config->set_property("SignalSource.PRN_3", "22");
+ config->set_property("SignalSource.CN0_dB_3", "44");
+ config->set_property("SignalSource.doppler_Hz_3", "3000");
+ config->set_property("SignalSource.delay_chips_3", "300");
+
+ config->set_property("SignalSource.noise_flag", "true");
+ config->set_property("SignalSource.data_flag", "true");
+ config->set_property("SignalSource.BW_BB", "0.97");
+
+ config->set_property("InputFilter.implementation", "Fir_Filter");
+ config->set_property("InputFilter.input_item_type", "gr_complex");
+ config->set_property("InputFilter.output_item_type", "gr_complex");
+ config->set_property("InputFilter.taps_item_type", "float");
+ config->set_property("InputFilter.number_of_taps", "11");
+ config->set_property("InputFilter.number_of_bands", "2");
+ config->set_property("InputFilter.band1_begin", "0.0");
+ config->set_property("InputFilter.band1_end", "0.97");
+ config->set_property("InputFilter.band2_begin", "0.98");
+ config->set_property("InputFilter.band2_end", "1.0");
+ config->set_property("InputFilter.ampl1_begin", "1.0");
+ config->set_property("InputFilter.ampl1_end", "1.0");
+ config->set_property("InputFilter.ampl2_begin", "0.0");
+ config->set_property("InputFilter.ampl2_end", "0.0");
+ config->set_property("InputFilter.band1_error", "1.0");
+ config->set_property("InputFilter.band2_error", "1.0");
+ config->set_property("InputFilter.filter_type", "bandpass");
+ config->set_property("InputFilter.grid_density", "16");
+
+ config->set_property("Acquisition.item_type", "gr_complex");
+ config->set_property("Acquisition.if", "0");
+ config->set_property("Acquisition.coherent_integration_time_ms",
+ std::to_string(integration_time_ms));
+ config->set_property("Acquisition.max_dwells", "1");
+ config->set_property("Acquisition.implementation", "GPS_L1_CA_PCPS_OpenCl_Acquisition");
+ config->set_property("Acquisition.pfa", "0.1");
+ config->set_property("Acquisition.doppler_max", "10000");
+ config->set_property("Acquisition.doppler_step", "250");
+ config->set_property("Acquisition.bit_transition_flag", "false");
+ config->set_property("Acquisition.dump", "false");
+}
+
+void GpsL1CaPcpsOpenClAcquisitionGSoC2013Test::start_queue()
+{
+ stop = false;
+ ch_thread = boost::thread(&GpsL1CaPcpsOpenClAcquisitionGSoC2013Test::wait_message, this);
+}
+
+void GpsL1CaPcpsOpenClAcquisitionGSoC2013Test::wait_message()
+{
+ struct timeval tv;
+ long long int begin = 0;
+ long long int end = 0;
+
+ while (!stop)
+ {
+ acquisition->reset();
+
+ gettimeofday(&tv, NULL);
+ begin = tv.tv_sec *1e6 + tv.tv_usec;
+
+ channel_internal_queue.wait_and_pop(message);
+
+ gettimeofday(&tv, NULL);
+ end = tv.tv_sec *1e6 + tv.tv_usec;
+
+ mean_acq_time_us += (end-begin);
+
+ process_message();
+ }
+}
+
+void GpsL1CaPcpsOpenClAcquisitionGSoC2013Test::process_message()
+{
+ if (message == 1)
+ {
+ detection_counter++;
+
+ // The term -5 is here to correct the additional delay introduced by the FIR filter
+ double delay_error_chips = abs((double)expected_delay_chips - (double)(gnss_synchro.Acq_delay_samples-5)*1023.0/((double)fs_in*1e-3));
+ double doppler_error_hz = abs(expected_doppler_hz - gnss_synchro.Acq_doppler_hz);
+
+ mse_delay += std::pow(delay_error_chips, 2);
+ mse_doppler += std::pow(doppler_error_hz, 2);
+
+ if ((delay_error_chips < max_delay_error_chips) && (doppler_error_hz < max_doppler_error_hz))
+ {
+ correct_estimation_counter++;
+ }
+
+// std::cout << "Acq delay samples = " << (double)gnss_synchro.Acq_delay_samples << std::endl;
+// std::cout << "Acq doppler Hz = " << (double)gnss_synchro.Acq_doppler_hz << std::endl;
+ }
+
+ realization_counter++;
+
+ std::cout << "Progress: " << round((float)realization_counter/num_of_realizations*100) << "% \r" << std::flush;
+
+ if (realization_counter == num_of_realizations)
+ {
+ mse_delay /= num_of_realizations;
+ mse_doppler /= num_of_realizations;
+
+ Pd = (double)correct_estimation_counter / (double)num_of_realizations;
+ Pfa_a = (double)detection_counter / (double)num_of_realizations;
+ Pfa_p = (double)(detection_counter-correct_estimation_counter) / (double)num_of_realizations;
+
+ mean_acq_time_us /= num_of_realizations;
+
+ stop_queue();
+ top_block->stop();
+
+ std::cout << std::endl;
+ }
+}
+
+void GpsL1CaPcpsOpenClAcquisitionGSoC2013Test::stop_queue()
+{
+ stop = true;
+}
+
+TEST_F(GpsL1CaPcpsOpenClAcquisitionGSoC2013Test, Instantiate)
+{
+ config_1();
+ acquisition = new GpsL1CaPcpsOpenClAcquisition(config, "Acquisition", 1, 1, queue);
+ delete acquisition;
+ delete config;
+}
+
+TEST_F(GpsL1CaPcpsOpenClAcquisitionGSoC2013Test, ConnectAndRun)
+{
+ int nsamples = floor(fs_in*integration_time_ms*1e-3);
+ struct timeval tv;
+ long long int begin = 0;
+ long long int end = 0;
+
+ config_1();
+ acquisition = new GpsL1CaPcpsOpenClAcquisition(config, "Acquisition", 1, 1, queue);
+
+ ASSERT_NO_THROW( {
+ acquisition->connect(top_block);
+ boost::shared_ptr source = gr::analog::sig_source_c::make(fs_in, gr::analog::GR_SIN_WAVE, 1000, 1, gr_complex(0));
+ boost::shared_ptr valve = gnss_sdr_make_valve(sizeof(gr_complex), nsamples, queue);
+ top_block->connect(source, 0, valve, 0);
+ top_block->connect(valve, 0, acquisition->get_left_block(), 0);
+ }) << "Failure connecting the blocks of acquisition test."<< std::endl;
+
+ EXPECT_NO_THROW( {
+ gettimeofday(&tv, NULL);
+ begin = tv.tv_sec *1e6 + tv.tv_usec;
+ top_block->run(); // Start threads and wait
+ gettimeofday(&tv, NULL);
+ end = tv.tv_sec *1e6 + tv.tv_usec;
+ }) << "Failure running the top_block."<< std::endl;
+
+ std::cout << "Processed " << nsamples << " samples in " << (end-begin) << " microseconds" << std::endl;
+
+ delete acquisition;
+ delete config;
+}
+
+TEST_F(GpsL1CaPcpsOpenClAcquisitionGSoC2013Test, ValidationOfResults)
+{
+ config_1();
+
+ acquisition = new GpsL1CaPcpsOpenClAcquisition(config, "Acquisition", 1, 1, queue);
+
+ ASSERT_NO_THROW( {
+ acquisition->set_channel(1);
+ }) << "Failure setting channel."<< std::endl;
+
+ ASSERT_NO_THROW( {
+ acquisition->set_gnss_synchro(&gnss_synchro);
+ }) << "Failure setting gnss_synchro."<< std::endl;
+
+ ASSERT_NO_THROW( {
+ acquisition->set_channel_queue(&channel_internal_queue);
+ }) << "Failure setting channel_internal_queue."<< std::endl;
+
+ ASSERT_NO_THROW( {
+ acquisition->set_doppler_max(config->property("Acquisition.doppler_max", 10000));
+ }) << "Failure setting doppler_max."<< std::endl;
+
+ ASSERT_NO_THROW( {
+ acquisition->set_doppler_step(config->property("Acquisition.doppler_step", 500));
+ }) << "Failure setting doppler_step."<< std::endl;
+
+ ASSERT_NO_THROW( {
+ acquisition->set_threshold(config->property("Acquisition.threshold", 0.0));
+ }) << "Failure setting threshold."<< std::endl;
+
+ ASSERT_NO_THROW( {
+ acquisition->connect(top_block);
+ }) << "Failure connecting acquisition to the top_block."<< std::endl;
+
+ acquisition->init();
+
+ ASSERT_NO_THROW( {
+ boost::shared_ptr signal_source;
+ SignalGenerator* signal_generator = new SignalGenerator(config, "SignalSource", 0, 1, queue);
+ FirFilter* filter = new FirFilter(config, "InputFilter", 1, 1, queue);
+ signal_source.reset(new GenSignalSource(config, signal_generator, filter, "SignalSource", queue));
+ signal_source->connect(top_block);
+ top_block->connect(signal_source->get_right_block(), 0, acquisition->get_left_block(), 0);
+ }) << "Failure connecting the blocks of acquisition test." << std::endl;
+
+ // i = 0 --> sallite in acquisition is visible
+ // i = 1 --> satellite in acquisition is not visible
+ for (unsigned int i = 0; i < 2; i++)
+ {
+ init();
+
+ if (i == 0)
+ {
+ gnss_synchro.PRN = 10; // This satellite is visible
+ }
+ else if (i == 1)
+ {
+ gnss_synchro.PRN = 20; // This satellite is not visible
+ }
+
+ acquisition->set_local_code();
+
+ start_queue();
+
+ EXPECT_NO_THROW( {
+ top_block->run(); // Start threads and wait
+ }) << "Failure running he top_block."<< std::endl;
+
+ if (i == 0)
+ {
+ EXPECT_EQ(1, message) << "Acquisition failure. Expected message: 1=ACQ SUCCESS.";
+ if (message == 1)
+ {
+ EXPECT_EQ((unsigned int)1, correct_estimation_counter) << "Acquisition failure. Incorrect parameters estimation.";
+ }
+
+ }
+ else if (i == 1)
+ {
+ EXPECT_EQ(2, message) << "Acquisition failure. Expected message: 2=ACQ FAIL.";
+ }
+ }
+
+ delete acquisition;
+ delete config;
+}
+
+TEST_F(GpsL1CaPcpsOpenClAcquisitionGSoC2013Test, ValidationOfResultsProbabilities)
+{
+ config_2();
+
+ acquisition = new GpsL1CaPcpsOpenClAcquisition(config, "Acquisition", 1, 1, queue);
+
+ ASSERT_NO_THROW( {
+ acquisition->set_channel(1);
+ }) << "Failure setting channel."<< std::endl;
+
+ ASSERT_NO_THROW( {
+ acquisition->set_gnss_synchro(&gnss_synchro);
+ }) << "Failure setting gnss_synchro."<< std::endl;
+
+ ASSERT_NO_THROW( {
+ acquisition->set_channel_queue(&channel_internal_queue);
+ }) << "Failure setting channel_internal_queue."<< std::endl;
+
+ ASSERT_NO_THROW( {
+ acquisition->set_doppler_max(config->property("Acquisition.doppler_max", 10000));
+ }) << "Failure setting doppler_max."<< std::endl;
+
+ ASSERT_NO_THROW( {
+ acquisition->set_doppler_step(config->property("Acquisition.doppler_step", 500));
+ }) << "Failure setting doppler_step."<< std::endl;
+
+ ASSERT_NO_THROW( {
+ acquisition->set_threshold(config->property("Acquisition.threshold", 0.0));
+ }) << "Failure setting threshold."<< std::endl;
+
+ ASSERT_NO_THROW( {
+ acquisition->connect(top_block);
+ }) << "Failure connecting acquisition to the top_block."<< std::endl;
+
+ acquisition->init();
+
+ ASSERT_NO_THROW( {
+ boost::shared_ptr signal_source;
+ SignalGenerator* signal_generator = new SignalGenerator(config, "SignalSource", 0, 1, queue);
+ FirFilter* filter = new FirFilter(config, "InputFilter", 1, 1, queue);
+ signal_source.reset(new GenSignalSource(config, signal_generator, filter, "SignalSource", queue));
+ signal_source->connect(top_block);
+ top_block->connect(signal_source->get_right_block(), 0, acquisition->get_left_block(), 0);
+ }) << "Failure connecting the blocks of acquisition test." << std::endl;
+
+ std::cout << "Probability of false alarm (target) = " << 0.1 << std::endl;
+
+ // i = 0 --> sallite in acquisition is visible (prob of detection and prob of detection with wrong estimation)
+ // i = 1 --> satellite in acquisition is not visible (prob of false detection)
+ for (unsigned int i = 0; i < 2; i++)
+ {
+ init();
+
+ if (i == 0)
+ {
+ gnss_synchro.PRN = 10; // This satellite is visible
+ }
+ else if (i == 1)
+ {
+ gnss_synchro.PRN = 20; // This satellite is not visible
+ }
+
+ acquisition->set_local_code();
+
+ start_queue();
+
+ EXPECT_NO_THROW( {
+ top_block->run(); // Start threads and wait
+ }) << "Failure running he top_block."<< std::endl;
+
+ if (i == 0)
+ {
+ std::cout << "Probability of detection = " << Pd << std::endl;
+ std::cout << "Probability of false alarm (satellite present) = " << Pfa_p << std::endl;
+// std::cout << "Mean acq time = " << mean_acq_time_us << " microseconds." << std::endl;
+ }
+ else if (i == 1)
+ {
+ std::cout << "Probability of false alarm (satellite absent) = " << Pfa_a << std::endl;
+// std::cout << "Mean acq time = " << mean_acq_time_us << " microseconds." << std::endl;
+ }
+ }
+
+ delete acquisition;
+ delete config;
+}
diff --git a/src/tests/gnss_block/gps_l1_ca_pcps_tong_acquisition_gsoc2013_test.cc b/src/tests/gnss_block/gps_l1_ca_pcps_tong_acquisition_gsoc2013_test.cc
index a906326bd..71ad622bd 100644
--- a/src/tests/gnss_block/gps_l1_ca_pcps_tong_acquisition_gsoc2013_test.cc
+++ b/src/tests/gnss_block/gps_l1_ca_pcps_tong_acquisition_gsoc2013_test.cc
@@ -191,8 +191,8 @@ void GpsL1CaPcpsTongAcquisitionGSoC2013Test::config_1()
std::to_string(integration_time_ms));
config->set_property("Acquisition.implementation", "GPS_L1_CA_PCPS_Tong_Acquisition");
config->set_property("Acquisition.threshold", "0.8");
- config->set_property("Acquisition.tong_init_val", "5");
- config->set_property("Acquisition.tong_max_val", "10");
+ config->set_property("Acquisition.tong_init_val", "1");
+ config->set_property("Acquisition.tong_max_val", "8");
config->set_property("Acquisition.doppler_max", "10000");
config->set_property("Acquisition.doppler_step", "250");
config->set_property("Acquisition.dump", "false");
@@ -237,14 +237,14 @@ void GpsL1CaPcpsTongAcquisitionGSoC2013Test::config_2()
config->set_property("SignalSource.doppler_Hz_1", "1000");
config->set_property("SignalSource.delay_chips_1", "100");
- config->set_property("SignalSource.system_2", "E");
- config->set_property("SignalSource.PRN_2", "10");
+ config->set_property("SignalSource.system_2", "G");
+ config->set_property("SignalSource.PRN_2", "21");
config->set_property("SignalSource.CN0_dB_2", "44");
config->set_property("SignalSource.doppler_Hz_2", "2000");
config->set_property("SignalSource.delay_chips_2", "200");
- config->set_property("SignalSource.system_3", "E");
- config->set_property("SignalSource.PRN_3", "20");
+ config->set_property("SignalSource.system_3", "G");
+ config->set_property("SignalSource.PRN_3", "22");
config->set_property("SignalSource.CN0_dB_3", "44");
config->set_property("SignalSource.doppler_Hz_3", "3000");
config->set_property("SignalSource.delay_chips_3", "300");
@@ -277,9 +277,9 @@ void GpsL1CaPcpsTongAcquisitionGSoC2013Test::config_2()
config->set_property("Acquisition.coherent_integration_time_ms",
std::to_string(integration_time_ms));
config->set_property("Acquisition.implementation", "GPS_L1_CA_PCPS_Tong_Acquisition");
- config->set_property("Acquisition.threshold", "0.002");
- config->set_property("Acquisition.tong_init_val", "5");
- config->set_property("Acquisition.tong_max_val", "10");
+ config->set_property("Acquisition.threshold", "0.00108"); // Pfa,a = 0.1
+ config->set_property("Acquisition.tong_init_val", "1");
+ config->set_property("Acquisition.tong_max_val", "8");
config->set_property("Acquisition.doppler_max", "10000");
config->set_property("Acquisition.doppler_step", "250");
config->set_property("Acquisition.dump", "false");
@@ -532,7 +532,7 @@ TEST_F(GpsL1CaPcpsTongAcquisitionGSoC2013Test, ValidationOfResultsProbabilities)
top_block->connect(signal_source->get_right_block(), 0, acquisition->get_left_block(), 0);
}) << "Failure connecting the blocks of acquisition test." << std::endl;
- std::cout << "Probability of false alarm (target) = " << 0.0 << std::endl;
+ std::cout << "Probability of false alarm (target) = " << 0.1 << std::endl;
// i = 0 --> sallite in acquisition is visible (prob of detection and prob of detection with wrong estimation)
// i = 1 --> satellite in acquisition is not visible (prob of false detection)
diff --git a/src/tests/test_main.cc b/src/tests/test_main.cc
index d702592ab..d11f325a8 100644
--- a/src/tests/test_main.cc
+++ b/src/tests/test_main.cc
@@ -73,6 +73,9 @@
//#include "gnss_block/gps_l1_ca_pcps_acquisition_test.cc"
#include "gnss_block/gps_l1_ca_pcps_acquisition_gsoc2013_test.cc"
#include "gnss_block/gps_l1_ca_pcps_multithread_acquisition_gsoc2013_test.cc"
+#if OPENCL
+ #include "gnss_block/gps_l1_ca_pcps_opencl_acquisition_gsoc2013_test.cc"
+#endif
#include "gnss_block/gps_l1_ca_pcps_tong_acquisition_gsoc2013_test.cc"
//#include "gnss_block/galileo_e1_pcps_ambiguous_acquisition_test.cc"
//#include "gnss_block/galileo_e1_pcps_ambiguous_acquisition_gsoc_test.cc"