diff --git a/CMakeLists.txt b/CMakeLists.txt index a3207b2e3..6040211a9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -71,6 +71,11 @@ if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin") endif(${CMAKE_SYSTEM_NAME} MATCHES "Darwin") +######################################################################## +# Find OpenCL installation +######################################################################## +find_package(OpenCL) + ################################################################################ # Googletest - http://code.google.com/p/googletest/ @@ -520,3 +525,4 @@ add_custom_target(uninstall ######################################################################## add_subdirectory(src) + diff --git a/cmake/Modules/FindOpenCL.cmake b/cmake/Modules/FindOpenCL.cmake new file mode 100644 index 000000000..1229090a9 --- /dev/null +++ b/cmake/Modules/FindOpenCL.cmake @@ -0,0 +1,99 @@ +# +# This file taken from FindOpenCL project @ http://gitorious.com/findopencl +# +# - Try to find OpenCL +# This module tries to find an OpenCL implementation on your system. It supports +# AMD / ATI, Apple and NVIDIA implementations, but shoudl work, too. +# +# Once done this will define +# OPENCL_FOUND - system has OpenCL +# OPENCL_INCLUDE_DIRS - the OpenCL include directory +# OPENCL_LIBRARIES - link these to use OpenCL +# +# WIN32 should work, but is untested + +FIND_PACKAGE( PackageHandleStandardArgs ) + +SET (OPENCL_VERSION_STRING "0.1.0") +SET (OPENCL_VERSION_MAJOR 0) +SET (OPENCL_VERSION_MINOR 1) +SET (OPENCL_VERSION_PATCH 0) + +IF (APPLE) + + FIND_LIBRARY(OPENCL_LIBRARIES OpenCL DOC "OpenCL lib for OSX") + FIND_PATH(OPENCL_INCLUDE_DIRS OpenCL/cl.h DOC "Include for OpenCL on OSX") + FIND_PATH(_OPENCL_CPP_INCLUDE_DIRS OpenCL/cl.hpp DOC "Include for OpenCL CPP bindings on OSX") + +ELSE (APPLE) + + IF (WIN32) + + FIND_PATH(OPENCL_INCLUDE_DIRS CL/cl.h) + FIND_PATH(_OPENCL_CPP_INCLUDE_DIRS CL/cl.hpp) + + # The AMD SDK currently installs both x86 and x86_64 libraries + # This is only a hack to find out architecture + IF( ${CMAKE_SYSTEM_PROCESSOR} STREQUAL "AMD64" ) + SET(OPENCL_LIB_DIR "$ENV{ATISTREAMSDKROOT}/lib/x86_64") + SET(OPENCL_LIB_DIR "$ENV{ATIINTERNALSTREAMSDKROOT}/lib/x86_64") + ELSE (${CMAKE_SYSTEM_PROCESSOR} STREQUAL "AMD64") + SET(OPENCL_LIB_DIR "$ENV{ATISTREAMSDKROOT}/lib/x86") + SET(OPENCL_LIB_DIR "$ENV{ATIINTERNALSTREAMSDKROOT}/lib/x86") + ENDIF( ${CMAKE_SYSTEM_PROCESSOR} STREQUAL "AMD64" ) + + # find out if the user asked for a 64-bit build, and use the corresponding + # 64 or 32 bit NVIDIA library paths to the search: + STRING(REGEX MATCH "Win64" ISWIN64 ${CMAKE_GENERATOR}) + IF("${ISWIN64}" STREQUAL "Win64") + FIND_LIBRARY(OPENCL_LIBRARIES OpenCL.lib ${OPENCL_LIB_DIR} $ENV{CUDA_LIB_PATH} $ENV{CUDA_PATH}/lib/x64) + ELSE("${ISWIN64}" STREQUAL "Win64") + FIND_LIBRARY(OPENCL_LIBRARIES OpenCL.lib ${OPENCL_LIB_DIR} $ENV{CUDA_LIB_PATH} $ENV{CUDA_PATH}/lib/Win32) + ENDIF("${ISWIN64}" STREQUAL "Win64") + + GET_FILENAME_COMPONENT(_OPENCL_INC_CAND ${OPENCL_LIB_DIR}/../../include ABSOLUTE) + + # On Win32 search relative to the library + FIND_PATH(OPENCL_INCLUDE_DIRS CL/cl.h PATHS "${_OPENCL_INC_CAND}" $ENV{CUDA_INC_PATH} $ENV{CUDA_PATH}/include) + FIND_PATH(_OPENCL_CPP_INCLUDE_DIRS CL/cl.hpp PATHS "${_OPENCL_INC_CAND}" $ENV{CUDA_INC_PATH} $ENV{CUDA_PATH}/include) + + ELSE (WIN32) + + # Unix style platforms + FIND_LIBRARY(OPENCL_LIBRARIES OpenCL + ENV LD_LIBRARY_PATH + ) + + GET_FILENAME_COMPONENT(OPENCL_LIB_DIR ${OPENCL_LIBRARIES} PATH) + GET_FILENAME_COMPONENT(_OPENCL_INC_CAND ${OPENCL_LIB_DIR}/../../include ABSOLUTE) + + # The AMD SDK currently does not place its headers + # in /usr/include, therefore also search relative + # to the library + FIND_PATH(OPENCL_INCLUDE_DIRS CL/cl.h PATHS ${_OPENCL_INC_CAND} "/usr/local/cuda/include") + FIND_PATH(_OPENCL_CPP_INCLUDE_DIRS CL/cl.hpp PATHS ${_OPENCL_INC_CAND} "/usr/local/cuda/include") + + ENDIF (WIN32) + +ENDIF (APPLE) + +FIND_PACKAGE_HANDLE_STANDARD_ARGS( OpenCL DEFAULT_MSG OPENCL_LIBRARIES OPENCL_INCLUDE_DIRS ) + +IF( _OPENCL_CPP_INCLUDE_DIRS ) + SET( OPENCL_HAS_CPP_BINDINGS TRUE ) + LIST( APPEND OPENCL_INCLUDE_DIRS ${_OPENCL_CPP_INCLUDE_DIRS} ) + # This is often the same, so clean up + LIST( REMOVE_DUPLICATES OPENCL_INCLUDE_DIRS ) +ENDIF( _OPENCL_CPP_INCLUDE_DIRS ) + +MARK_AS_ADVANCED( + OPENCL_INCLUDE_DIRS +) + +IF( OPENCL_INCLUDE_DIRS AND OPENCL_LIBRARIES ) + SET( OPENCL_FOUND TRUE ) + add_definitions( -DOPENCL=1 ) +ELSE( OPENCL_INCLUDE_DIRS AND OPENCL_LIBRARIES ) + SET( OPENCL_FOUND FALSE ) + add_definitions( -DOPENCL=0 ) +ENDIF( OPENCL_INCLUDE_DIRS AND OPENCL_LIBRARIES ) diff --git a/install/math_kernel.cl b/install/math_kernel.cl new file mode 100644 index 000000000..3b1601877 --- /dev/null +++ b/install/math_kernel.cl @@ -0,0 +1,29 @@ +#define MUL_RE(a,b) (a.x*b.x - a.y*b.y) +#define MUL_IM(a,b) (a.x*b.y + a.y*b.x) +#define SUM_RE(a,b) (a.x + b.x) +#define SUM_IM(a,b) (a.y + b.y) + +__kernel void add_vectors(__global const float2* src1, __global const float2* src2, __global float2* dest) +{ + int gid = get_global_id(0); + dest[gid] = (float2)(SUM_RE(src1[gid],src2[gid]),SUM_IM(src1[gid],src2[gid])); +} + +__kernel void mult_vectors(__global const float2* src1, __global const float2* src2, __global float2* dest) +{ + int gid = get_global_id(0); + dest[gid] = (float2)(MUL_RE(src1[gid],src2[gid]),MUL_IM(src1[gid],src2[gid])); +} + +__kernel void conj_vector(__global const float2* src, __global float2* dest) +{ + int gid = get_global_id(0); + dest[gid] = ((float2)(1,-1)) * src[gid]; +} + +__kernel void magnitude_squared(__global const float2* src, __global float* dest) +{ + int gid = get_global_id(0); + dest[gid] = src[gid].x*src[gid].x + src[gid].y*src[gid].y; +} + diff --git a/src/algorithms/acquisition/adapters/CMakeLists.txt b/src/algorithms/acquisition/adapters/CMakeLists.txt index 631ad5f78..097f608c4 100644 --- a/src/algorithms/acquisition/adapters/CMakeLists.txt +++ b/src/algorithms/acquisition/adapters/CMakeLists.txt @@ -16,17 +16,32 @@ # along with GNSS-SDR. If not, see . # -set(ACQ_ADAPTER_SOURCES - gps_l1_ca_pcps_acquisition.cc - gps_l1_ca_pcps_multithread_acquisition.cc - gps_l1_ca_pcps_assisted_acquisition.cc - gps_l1_ca_pcps_acquisition_fine_doppler.cc - gps_l1_ca_pcps_tong_acquisition.cc - galileo_e1_pcps_ambiguous_acquisition.cc - galileo_e1_pcps_cccwsr_ambiguous_acquisition.cc - galileo_e1_pcps_tong_ambiguous_acquisition.cc - galileo_e1_pcps_8ms_ambiguous_acquisition.cc -) +if(OPENCL_FOUND) + set(ACQ_ADAPTER_SOURCES + gps_l1_ca_pcps_acquisition.cc + gps_l1_ca_pcps_multithread_acquisition.cc + gps_l1_ca_pcps_assisted_acquisition.cc + gps_l1_ca_pcps_acquisition_fine_doppler.cc + gps_l1_ca_pcps_tong_acquisition.cc + gps_l1_ca_pcps_opencl_acquisition.cc + galileo_e1_pcps_ambiguous_acquisition.cc + galileo_e1_pcps_cccwsr_ambiguous_acquisition.cc + galileo_e1_pcps_tong_ambiguous_acquisition.cc + galileo_e1_pcps_8ms_ambiguous_acquisition.cc + ) +else(OPENCL_FOUND) + set(ACQ_ADAPTER_SOURCES + gps_l1_ca_pcps_acquisition.cc + gps_l1_ca_pcps_multithread_acquisition.cc + gps_l1_ca_pcps_assisted_acquisition.cc + gps_l1_ca_pcps_acquisition_fine_doppler.cc + gps_l1_ca_pcps_tong_acquisition.cc + galileo_e1_pcps_ambiguous_acquisition.cc + galileo_e1_pcps_cccwsr_ambiguous_acquisition.cc + galileo_e1_pcps_tong_ambiguous_acquisition.cc + galileo_e1_pcps_8ms_ambiguous_acquisition.cc + ) +endif(OPENCL_FOUND) include_directories( $(CMAKE_CURRENT_SOURCE_DIR) diff --git a/src/algorithms/acquisition/adapters/gps_l1_ca_pcps_multithread_acquisition.h b/src/algorithms/acquisition/adapters/gps_l1_ca_pcps_multithread_acquisition.h index cafc3c693..1738d28dc 100644 --- a/src/algorithms/acquisition/adapters/gps_l1_ca_pcps_multithread_acquisition.h +++ b/src/algorithms/acquisition/adapters/gps_l1_ca_pcps_multithread_acquisition.h @@ -29,8 +29,8 @@ * ------------------------------------------------------------------------- */ -#ifndef GNSS_SDR_GPS_L1_CA_PCPS_MULTITHREAD_CQUISITION_H_ -#define GNSS_SDR_GPS_L1_CA_PCPS_MULTITHREAD_CQUISITION_H_ +#ifndef GNSS_SDR_GPS_L1_CA_PCPS_MULTITHREAD_ACQUISITION_H_ +#define GNSS_SDR_GPS_L1_CA_PCPS_MULTITHREAD_ACQUISITION_H_ #include "gnss_synchro.h" #include "acquisition_interface.h" @@ -159,4 +159,4 @@ private: float calculate_threshold(float pfa); }; -#endif /* GNSS_SDR_GPS_L1_CA_PCPS_MULTITHREAD_CQUISITION_H_ */ +#endif /* GNSS_SDR_GPS_L1_CA_PCPS_MULTITHREAD_ACQUISITION_H_ */ diff --git a/src/algorithms/acquisition/adapters/gps_l1_ca_pcps_opencl_acquisition.cc b/src/algorithms/acquisition/adapters/gps_l1_ca_pcps_opencl_acquisition.cc new file mode 100644 index 000000000..a39b68f35 --- /dev/null +++ b/src/algorithms/acquisition/adapters/gps_l1_ca_pcps_opencl_acquisition.cc @@ -0,0 +1,296 @@ +/*! + * \file gps_l1_ca_pcps_opencl_acquisition.cc + * \brief Adapts an OpenCL PCPS acquisition block to an + * AcquisitionInterface for GPS L1 C/A signals + * \author Marc Molina, 2013. marc.molina.pena(at)gmail.com + * + * ------------------------------------------------------------------------- + * + * Copyright (C) 2010-2012 (see AUTHORS file for a list of contributors) + * + * GNSS-SDR is a software defined Global Navigation + * Satellite Systems receiver + * + * This file is part of GNSS-SDR. + * + * GNSS-SDR is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * at your option) any later version. + * + * GNSS-SDR is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNSS-SDR. If not, see . + * + * ------------------------------------------------------------------------- + */ + +#include "gps_l1_ca_pcps_opencl_acquisition.h" +#include "gps_sdr_signal_processing.h" +#include "GPS_L1_CA.h" +#include "configuration_interface.h" +#include +#include +#include +#include +#include +#include + +using google::LogMessage; + +GpsL1CaPcpsOpenClAcquisition::GpsL1CaPcpsOpenClAcquisition( + ConfigurationInterface* configuration, std::string role, + unsigned int in_streams, unsigned int out_streams, + gr::msg_queue::sptr queue) : + role_(role), in_streams_(in_streams), out_streams_(out_streams), queue_(queue) +{ + configuration_ = configuration; + std::string default_item_type = "gr_complex"; + std::string default_dump_filename = "./data/acquisition.dat"; + + DLOG(INFO) << "role " << role; + + item_type_ = configuration_->property(role + ".item_type", + default_item_type); + + fs_in_ = configuration_->property("GNSS-SDR.internal_fs_hz", 2048000); + if_ = configuration_->property(role + ".ifreq", 0); + dump_ = configuration_->property(role + ".dump", false); + shift_resolution_ = configuration_->property(role + ".doppler_max", 15); + sampled_ms_ = configuration_->property(role + ".coherent_integration_time_ms", 1); + + bit_transition_flag_ = configuration_->property("Acquisition.bit_transition_flag", false); + + if (!bit_transition_flag_) + { + max_dwells_ = configuration_->property(role + ".max_dwells", 1); + } + else + { + max_dwells_ = 2; + } + + dump_filename_ = configuration_->property(role + ".dump_filename", + default_dump_filename); + + //--- Find number of samples per spreading code ------------------------- + code_length_ = round(fs_in_ + / (GPS_L1_CA_CODE_RATE_HZ / GPS_L1_CA_CODE_LENGTH_CHIPS)); + + vector_length_ = code_length_ * sampled_ms_; + + code_= new gr_complex[vector_length_]; + + if (item_type_.compare("gr_complex") == 0) + { + item_size_ = sizeof(gr_complex); + acquisition_cc_ = pcps_make_opencl_acquisition_cc(sampled_ms_, max_dwells_, + shift_resolution_, if_, fs_in_, code_length_, code_length_, + bit_transition_flag_, queue_, dump_, dump_filename_); + + stream_to_vector_ = gr::blocks::stream_to_vector::make(item_size_, vector_length_); + + DLOG(INFO) << "stream_to_vector(" << stream_to_vector_->unique_id() + << ")"; + DLOG(INFO) << "acquisition(" << acquisition_cc_->unique_id() + << ")"; + } + else + { + LOG_AT_LEVEL(WARNING) << item_type_ + << " unknown acquisition item type"; + } +} + + +GpsL1CaPcpsOpenClAcquisition::~GpsL1CaPcpsOpenClAcquisition() +{ + delete[] code_; +} + + +void GpsL1CaPcpsOpenClAcquisition::set_channel(unsigned int channel) +{ + channel_ = channel; + if (item_type_.compare("gr_complex") == 0) + { + acquisition_cc_->set_channel(channel_); + } +} + + +void GpsL1CaPcpsOpenClAcquisition::set_threshold(float threshold) +{ + float pfa = configuration_->property(role_ + boost::lexical_cast(channel_) + ".pfa", 0.0); + + if(pfa==0.0) + { + pfa = configuration_->property(role_+".pfa", 0.0); + } + if(pfa==0.0) + { + threshold_ = threshold; + } + else + { + threshold_ = calculate_threshold(pfa); + } + + DLOG(INFO) <<"Channel "<set_threshold(threshold_); + } +} + + +void GpsL1CaPcpsOpenClAcquisition::set_doppler_max(unsigned int doppler_max) +{ + doppler_max_ = doppler_max; + if (item_type_.compare("gr_complex") == 0) + { + acquisition_cc_->set_doppler_max(doppler_max_); + } +} + + +void GpsL1CaPcpsOpenClAcquisition::set_doppler_step(unsigned int doppler_step) +{ + doppler_step_ = doppler_step; + if (item_type_.compare("gr_complex") == 0) + { + acquisition_cc_->set_doppler_step(doppler_step_); + } + +} + + +void GpsL1CaPcpsOpenClAcquisition::set_channel_queue( + concurrent_queue *channel_internal_queue) +{ + channel_internal_queue_ = channel_internal_queue; + if (item_type_.compare("gr_complex") == 0) + { + acquisition_cc_->set_channel_queue(channel_internal_queue_); + } +} + + +void GpsL1CaPcpsOpenClAcquisition::set_gnss_synchro(Gnss_Synchro* gnss_synchro) +{ + gnss_synchro_ = gnss_synchro; + if (item_type_.compare("gr_complex") == 0) + { + acquisition_cc_->set_gnss_synchro(gnss_synchro_); + } +} + + +signed int GpsL1CaPcpsOpenClAcquisition::mag() +{ + if (item_type_.compare("gr_complex") == 0) + { + return acquisition_cc_->mag(); + } + else + { + return 0; + } +} + + +void GpsL1CaPcpsOpenClAcquisition::init() +{ + acquisition_cc_->init(); + set_local_code(); +} + + +void GpsL1CaPcpsOpenClAcquisition::set_local_code() +{ + if (item_type_.compare("gr_complex") == 0) + { + std::complex* code = new std::complex[code_length_]; + + gps_l1_ca_code_gen_complex_sampled(code, gnss_synchro_->PRN, fs_in_, 0); + + for (unsigned int i = 0; i < sampled_ms_; i++) + { + memcpy(&(code_[i*code_length_]), code, + sizeof(gr_complex)*code_length_); + } + + acquisition_cc_->set_local_code(code_); + + delete[] code; + } +} + + +void GpsL1CaPcpsOpenClAcquisition::reset() +{ + if (item_type_.compare("gr_complex") == 0) + { + acquisition_cc_->set_active(true); + } +} + + +float GpsL1CaPcpsOpenClAcquisition::calculate_threshold(float pfa) +{ + //Calculate the threshold + + unsigned int frequency_bins = 0; + for (int doppler = (int)(-doppler_max_); doppler <= (int)doppler_max_; doppler += doppler_step_) + { + frequency_bins++; + } + + DLOG(INFO) <<"Channel "<connect(stream_to_vector_, 0, acquisition_cc_, 0); + } + +} + + +void GpsL1CaPcpsOpenClAcquisition::disconnect(gr::top_block_sptr top_block) +{ + if (item_type_.compare("gr_complex") == 0) + { + top_block->disconnect(stream_to_vector_, 0, acquisition_cc_, 0); + } +} + + +gr::basic_block_sptr GpsL1CaPcpsOpenClAcquisition::get_left_block() +{ + return stream_to_vector_; +} + + +gr::basic_block_sptr GpsL1CaPcpsOpenClAcquisition::get_right_block() +{ + return acquisition_cc_; +} + diff --git a/src/algorithms/acquisition/adapters/gps_l1_ca_pcps_opencl_acquisition.h b/src/algorithms/acquisition/adapters/gps_l1_ca_pcps_opencl_acquisition.h new file mode 100644 index 000000000..c76813edd --- /dev/null +++ b/src/algorithms/acquisition/adapters/gps_l1_ca_pcps_opencl_acquisition.h @@ -0,0 +1,162 @@ +/*! + * \file gps_l1_ca_pcps_opencl_acquisition.h + * \brief Adapts an OpenCL PCPS acquisition block to an + * AcquisitionInterface for GPS L1 C/A signals + * \author Marc Molina, 2013. marc.molina.pena(at)gmail.com + * + * ------------------------------------------------------------------------- + * + * Copyright (C) 2010-2012 (see AUTHORS file for a list of contributors) + * + * GNSS-SDR is a software defined Global Navigation + * Satellite Systems receiver + * + * This file is part of GNSS-SDR. + * + * GNSS-SDR is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * at your option) any later version. + * + * GNSS-SDR is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNSS-SDR. If not, see . + * + * ------------------------------------------------------------------------- + */ + +#ifndef GNSS_SDR_GPS_L1_CA_PCPS_OPENCL_ACQUISITION_H_ +#define GNSS_SDR_GPS_L1_CA_PCPS_OPENCL_ACQUISITION_H_ + +#include "gnss_synchro.h" +#include "acquisition_interface.h" +#include "pcps_opencl_acquisition_cc.h" +#include +#include + + +class ConfigurationInterface; + +/*! + * \brief This class adapts an OpenCL PCPS acquisition block to an + * AcquisitionInterface for GPS L1 C/A signals + */ +class GpsL1CaPcpsOpenClAcquisition: public AcquisitionInterface +{ +public: + GpsL1CaPcpsOpenClAcquisition(ConfigurationInterface* configuration, + std::string role, unsigned int in_streams, + unsigned int out_streams, boost::shared_ptr queue); + + virtual ~GpsL1CaPcpsOpenClAcquisition(); + + std::string role() + { + return role_; + } + + /*! + * \brief Returns "GPS_L1_CA_PCPS_OpenCl_Acquisition" + */ + std::string implementation() + { + return "GPS_L1_CA_PCPS_OpenCl_Acquisition"; + } + size_t item_size() + { + return item_size_; + } + + void connect(gr::top_block_sptr top_block); + void disconnect(gr::top_block_sptr top_block); + gr::basic_block_sptr get_left_block(); + gr::basic_block_sptr get_right_block(); + + /*! + * \brief Set acquisition/tracking common Gnss_Synchro object pointer + * to efficiently exchange synchronization data between acquisition and + * tracking blocks + */ + void set_gnss_synchro(Gnss_Synchro* p_gnss_synchro); + + /*! + * \brief Set acquisition channel unique ID + */ + void set_channel(unsigned int channel); + + /*! + * \brief Set statistics threshold of PCPS algorithm + */ + void set_threshold(float threshold); + + /*! + * \brief Set maximum Doppler off grid search + */ + void set_doppler_max(unsigned int doppler_max); + + /*! + * \brief Set Doppler steps for the grid search + */ + void set_doppler_step(unsigned int doppler_step); + + /*! + * \brief Set tracking channel internal queue + */ + void set_channel_queue(concurrent_queue *channel_internal_queue); + + /*! + * \brief Initializes acquisition algorithm. + */ + void init(); + + /*! + * \brief Sets local code for GPS L1/CA PCPS acquisition algorithm. + */ + void set_local_code(); + + /*! + * \brief Returns the maximum peak of grid search + */ + signed int mag(); + + /*! + * \brief Restart acquisition algorithm + */ + void reset(); + +private: + ConfigurationInterface* configuration_; + pcps_opencl_acquisition_cc_sptr acquisition_cc_; + gr::blocks::stream_to_vector::sptr stream_to_vector_; + size_t item_size_; + std::string item_type_; + unsigned int vector_length_; + unsigned int code_length_; + bool bit_transition_flag_; + unsigned int channel_; + float threshold_; + unsigned int doppler_max_; + unsigned int doppler_step_; + unsigned int shift_resolution_; + unsigned int sampled_ms_; + unsigned int max_dwells_; + long fs_in_; + long if_; + bool dump_; + std::string dump_filename_; + std::complex * code_; + Gnss_Synchro * gnss_synchro_; + std::string role_; + unsigned int in_streams_; + unsigned int out_streams_; + boost::shared_ptr queue_; + concurrent_queue *channel_internal_queue_; + + float calculate_threshold(float pfa); +}; + +#endif /* GNSS_SDR_GPS_L1_CA_PCPS_OPENCL_ACQUISITION_H_ */ diff --git a/src/algorithms/acquisition/gnuradio_blocks/CMakeLists.txt b/src/algorithms/acquisition/gnuradio_blocks/CMakeLists.txt index 6ea1114ee..113f72e7b 100644 --- a/src/algorithms/acquisition/gnuradio_blocks/CMakeLists.txt +++ b/src/algorithms/acquisition/gnuradio_blocks/CMakeLists.txt @@ -16,15 +16,28 @@ # along with GNSS-SDR. If not, see . # -set(ACQ_GR_BLOCKS_SOURCES - pcps_acquisition_cc.cc - pcps_multithread_acquisition_cc.cc - pcps_assisted_acquisition_cc.cc - pcps_acquisition_fine_doppler_cc.cc - pcps_tong_acquisition_cc.cc - pcps_cccwsr_acquisition_cc.cc - galileo_pcps_8ms_acquisition_cc.cc -) +if(OPENCL_FOUND) + set(ACQ_GR_BLOCKS_SOURCES + pcps_acquisition_cc.cc + pcps_multithread_acquisition_cc.cc + pcps_assisted_acquisition_cc.cc + pcps_acquisition_fine_doppler_cc.cc + pcps_tong_acquisition_cc.cc + pcps_cccwsr_acquisition_cc.cc + galileo_pcps_8ms_acquisition_cc.cc + pcps_opencl_acquisition_cc.cc # Needs OpenCL + ) +else(OPENCL_FOUND) + set(ACQ_GR_BLOCKS_SOURCES + pcps_acquisition_cc.cc + pcps_multithread_acquisition_cc.cc + pcps_assisted_acquisition_cc.cc + pcps_acquisition_fine_doppler_cc.cc + pcps_tong_acquisition_cc.cc + pcps_cccwsr_acquisition_cc.cc + galileo_pcps_8ms_acquisition_cc.cc + ) +endif(OPENCL_FOUND) include_directories( $(CMAKE_CURRENT_SOURCE_DIR) @@ -37,6 +50,11 @@ include_directories( ${GNURADIO_RUNTIME_INCLUDE_DIRS} ) -add_library(acq_gr_blocks ${ACQ_GR_BLOCKS_SOURCES}) -target_link_libraries(acq_gr_blocks gnss_system_parameters ${GNURADIO_RUNTIME_LIBRARIES} ${GNURADIO_FFT_LIBRARIES} ${VOLK_LIBRARIES}) +if(OPENCL_FOUND) + include_directories( ${OPENCL_INCLUDE_DIRS} ) + set(OPT_LIBRARIES ${OPT_LIBRARIES} ${OPENCL_LIBRARIES}) +endif(OPENCL_FOUND) + +add_library(acq_gr_blocks ${ACQ_GR_BLOCKS_SOURCES}) +target_link_libraries(acq_gr_blocks gnss_sp_libs gnss_system_parameters ${GNURADIO_RUNTIME_LIBRARIES} ${GNURADIO_FFT_LIBRARIES} ${VOLK_LIBRARIES} ${OPT_LIBRARIES}) diff --git a/src/algorithms/acquisition/gnuradio_blocks/galileo_pcps_8ms_acquisition_cc.cc b/src/algorithms/acquisition/gnuradio_blocks/galileo_pcps_8ms_acquisition_cc.cc index d43abe9ff..2ac066a96 100644 --- a/src/algorithms/acquisition/gnuradio_blocks/galileo_pcps_8ms_acquisition_cc.cc +++ b/src/algorithms/acquisition/gnuradio_blocks/galileo_pcps_8ms_acquisition_cc.cc @@ -53,7 +53,6 @@ galileo_pcps_8ms_acquisition_cc_sptr galileo_pcps_8ms_make_acquisition_cc( samples_per_code, queue, dump, dump_filename)); } - galileo_pcps_8ms_acquisition_cc::galileo_pcps_8ms_acquisition_cc( unsigned int sampled_ms, unsigned int max_dwells, unsigned int doppler_max, long freq, long fs_in, @@ -84,7 +83,7 @@ galileo_pcps_8ms_acquisition_cc::galileo_pcps_8ms_acquisition_cc( //todo: do something if posix_memalign fails if (posix_memalign((void**)&d_fft_code_A, 16, d_fft_size * sizeof(gr_complex)) == 0){}; if (posix_memalign((void**)&d_fft_code_B, 16, d_fft_size * sizeof(gr_complex)) == 0){}; - if (posix_memalign((void**)&d_magnitude, 16, d_fft_size * sizeof(gr_complex)) == 0){}; + if (posix_memalign((void**)&d_magnitude, 16, d_fft_size * sizeof(float)) == 0){}; // Direct FFT d_fft_if = new gr::fft::fft_complex(d_fft_size, true); @@ -97,18 +96,14 @@ galileo_pcps_8ms_acquisition_cc::galileo_pcps_8ms_acquisition_cc( d_dump_filename = dump_filename; } - galileo_pcps_8ms_acquisition_cc::~galileo_pcps_8ms_acquisition_cc() { - - for (unsigned int doppler_index = 0; doppler_index < d_num_doppler_bins; doppler_index++) - { - free(d_grid_doppler_wipeoffs[doppler_index]); - } - - if (d_num_doppler_bins > 0) { + for (unsigned int i = 0; i < d_num_doppler_bins; i++) + { + free(d_grid_doppler_wipeoffs[i]); + } delete[] d_grid_doppler_wipeoffs; } @@ -125,10 +120,10 @@ galileo_pcps_8ms_acquisition_cc::~galileo_pcps_8ms_acquisition_cc() } } - void galileo_pcps_8ms_acquisition_cc::set_local_code(std::complex * code) { - memcpy(d_fft_if->get_inbuf(), code, sizeof(gr_complex)*d_fft_size); + // code A: two replicas of a primary code + memcpy(d_fft_if->get_inbuf(), code, sizeof(gr_complex)*d_fft_size); d_fft_if->execute(); // We need the FFT of local code @@ -142,7 +137,7 @@ void galileo_pcps_8ms_acquisition_cc::set_local_code(std::complex * code) volk_32fc_conjugate_32fc_a(d_fft_code_A,d_fft_if->get_outbuf(),d_fft_size); } - + // code B: two replicas of a primary code; the second replica is inverted. volk_32fc_s32fc_multiply_32fc_a(&(d_fft_if->get_inbuf())[d_samples_per_code], &code[d_samples_per_code], gr_complex(-1,0), d_samples_per_code); @@ -160,7 +155,6 @@ void galileo_pcps_8ms_acquisition_cc::set_local_code(std::complex * code) } } - void galileo_pcps_8ms_acquisition_cc::init() { d_gnss_synchro->Acq_delay_samples = 0.0; @@ -169,12 +163,16 @@ void galileo_pcps_8ms_acquisition_cc::init() d_mag = 0.0; d_input_power = 0.0; - // Create the carrier Doppler wipeoff signals - d_num_doppler_bins = 0;//floor(2*std::abs((int)d_doppler_max)/d_doppler_step); - for (int doppler = (int)(-d_doppler_max); doppler <= (int)d_doppler_max; doppler += d_doppler_step) + // Count the number of bins + d_num_doppler_bins = 0; + for (int doppler = (int)(-d_doppler_max); + doppler <= (int)d_doppler_max; + doppler += d_doppler_step) { d_num_doppler_bins++; } + + // Create the carrier Doppler wipeoff signals d_grid_doppler_wipeoffs = new gr_complex*[d_num_doppler_bins]; for (unsigned int doppler_index=0;doppler_indexexecute(); // Multiply carrier wiped--off, Fourier transformed incoming signal - // with the local FFT'd code reference using SIMD operations with VOLK library + // with the local FFT'd code A reference using SIMD operations with + // VOLK library volk_32fc_x2_multiply_32fc_a(d_ifft->get_inbuf(), d_fft_if->get_outbuf(), d_fft_code_A, d_fft_size); @@ -279,7 +277,8 @@ int galileo_pcps_8ms_acquisition_cc::general_work(int noutput_items, magt_A = d_magnitude[indext_A] / (fft_normalization_factor * fft_normalization_factor); // Multiply carrier wiped--off, Fourier transformed incoming signal - // with the local FFT'd code reference using SIMD operations with VOLK library + // with the local FFT'd code B reference using SIMD operations with + // VOLK library volk_32fc_x2_multiply_32fc_a(d_ifft->get_inbuf(), d_fft_if->get_outbuf(), d_fft_code_B, d_fft_size); @@ -293,6 +292,7 @@ int galileo_pcps_8ms_acquisition_cc::general_work(int noutput_items, // Normalize the maximum value to correct the scale factor introduced by FFTW magt_B = d_magnitude[indext_B] / (fft_normalization_factor * fft_normalization_factor); + // Take the greater magnitude if (magt_A >= magt_B) { magt = magt_A; @@ -336,12 +336,9 @@ int galileo_pcps_8ms_acquisition_cc::general_work(int noutput_items, { d_state = 2; // Positive acquisition } - else + else if (d_well_count == d_max_dwells) { - if (d_well_count == d_max_dwells) - { - d_state = 3; // Negative acquisition - } + d_state = 3; // Negative acquisition } consume_each(1); diff --git a/src/algorithms/acquisition/gnuradio_blocks/pcps_acquisition_cc.cc b/src/algorithms/acquisition/gnuradio_blocks/pcps_acquisition_cc.cc index 596e89884..745673807 100644 --- a/src/algorithms/acquisition/gnuradio_blocks/pcps_acquisition_cc.cc +++ b/src/algorithms/acquisition/gnuradio_blocks/pcps_acquisition_cc.cc @@ -40,6 +40,7 @@ #include #include #include +#include using google::LogMessage; @@ -57,7 +58,6 @@ pcps_acquisition_cc_sptr pcps_make_acquisition_cc( samples_per_code, bit_transition_flag, queue, dump, dump_filename)); } - pcps_acquisition_cc::pcps_acquisition_cc( unsigned int sampled_ms, unsigned int max_dwells, unsigned int doppler_max, long freq, long fs_in, @@ -102,18 +102,14 @@ pcps_acquisition_cc::pcps_acquisition_cc( d_dump_filename = dump_filename; } - pcps_acquisition_cc::~pcps_acquisition_cc() { - - for (unsigned int doppler_index = 0; doppler_index < d_num_doppler_bins; doppler_index++) - { - free(d_grid_doppler_wipeoffs[doppler_index]); - } - - if (d_num_doppler_bins > 0) { + for (unsigned int i = 0; i < d_num_doppler_bins; i++) + { + free(d_grid_doppler_wipeoffs[i]); + } delete[] d_grid_doppler_wipeoffs; } @@ -129,7 +125,6 @@ pcps_acquisition_cc::~pcps_acquisition_cc() } } - void pcps_acquisition_cc::set_local_code(std::complex * code) { memcpy(d_fft_if->get_inbuf(), code, sizeof(gr_complex)*d_fft_size); @@ -147,7 +142,6 @@ void pcps_acquisition_cc::set_local_code(std::complex * code) } } - void pcps_acquisition_cc::init() { d_gnss_synchro->Acq_delay_samples = 0.0; @@ -156,12 +150,16 @@ void pcps_acquisition_cc::init() d_mag = 0.0; d_input_power = 0.0; - // Create the carrier Doppler wipeoff signals - d_num_doppler_bins = 0;//floor(2*std::abs((int)d_doppler_max)/d_doppler_step); - for (int doppler = (int)(-d_doppler_max); doppler <= (int)d_doppler_max; doppler += d_doppler_step) + // Count the number of bins + d_num_doppler_bins = 0; + for (int doppler = (int)(-d_doppler_max); + doppler <= (int)d_doppler_max; + doppler += d_doppler_step) { d_num_doppler_bins++; } + + // Create the carrier Doppler wipeoff signals d_grid_doppler_wipeoffs = new gr_complex*[d_num_doppler_bins]; for (unsigned int doppler_index=0;doppler_indexAcq_delay_samples = (double)(indext % d_samples_per_code); d_gnss_synchro->Acq_doppler_hz = (double)doppler; @@ -321,17 +313,14 @@ int pcps_acquisition_cc::general_work(int noutput_items, { d_state = 2; // Positive acquisition } - else + else if (d_well_count == d_max_dwells) { - if (d_well_count == d_max_dwells) - { - d_state = 3; // Negative acquisition - } + d_state = 3; // Negative acquisition } } else { - if (d_well_count == d_max_dwells) + if (d_well_count == d_max_dwells) // d_max_dwells = 2 { if (d_test_statistics > d_threshold) { diff --git a/src/algorithms/acquisition/gnuradio_blocks/pcps_cccwsr_acquisition_cc.cc b/src/algorithms/acquisition/gnuradio_blocks/pcps_cccwsr_acquisition_cc.cc index dd4062a79..8e9747308 100644 --- a/src/algorithms/acquisition/gnuradio_blocks/pcps_cccwsr_acquisition_cc.cc +++ b/src/algorithms/acquisition/gnuradio_blocks/pcps_cccwsr_acquisition_cc.cc @@ -60,7 +60,6 @@ pcps_cccwsr_acquisition_cc_sptr pcps_cccwsr_make_acquisition_cc( samples_per_ms, samples_per_code, queue, dump, dump_filename)); } - pcps_cccwsr_acquisition_cc::pcps_cccwsr_acquisition_cc( unsigned int sampled_ms, unsigned int max_dwells, unsigned int doppler_max, long freq, long fs_in, @@ -108,18 +107,14 @@ pcps_cccwsr_acquisition_cc::pcps_cccwsr_acquisition_cc( d_dump_filename = dump_filename; } - pcps_cccwsr_acquisition_cc::~pcps_cccwsr_acquisition_cc() { - - for (unsigned int doppler_index = 0; doppler_index < d_num_doppler_bins; doppler_index++) - { - free(d_grid_doppler_wipeoffs[doppler_index]); - } - - if (d_num_doppler_bins > 0) { + for (unsigned int i = 0; i < d_num_doppler_bins; i++) + { + free(d_grid_doppler_wipeoffs[i]); + } delete[] d_grid_doppler_wipeoffs; } @@ -140,10 +135,10 @@ pcps_cccwsr_acquisition_cc::~pcps_cccwsr_acquisition_cc() } } - void pcps_cccwsr_acquisition_cc::set_local_code(std::complex * code_data, std::complex * code_pilot) { + // Data code (E1B) memcpy(d_fft_if->get_inbuf(), code_data, sizeof(gr_complex)*d_fft_size); d_fft_if->execute(); // We need the FFT of local code @@ -158,6 +153,7 @@ void pcps_cccwsr_acquisition_cc::set_local_code(std::complex * code_data, volk_32fc_conjugate_32fc_a(d_fft_code_data,d_fft_if->get_outbuf(),d_fft_size); } + // Pilot code (E1C) memcpy(d_fft_if->get_inbuf(), code_pilot, sizeof(gr_complex)*d_fft_size); d_fft_if->execute(); // We need the FFT of local code @@ -173,7 +169,6 @@ void pcps_cccwsr_acquisition_cc::set_local_code(std::complex * code_data, } } - void pcps_cccwsr_acquisition_cc::init() { d_gnss_synchro->Acq_delay_samples = 0.0; @@ -182,12 +177,16 @@ void pcps_cccwsr_acquisition_cc::init() d_mag = 0.0; d_input_power = 0.0; - // Create the carrier Doppler wipeoff signals - d_num_doppler_bins = 0;//floor(2*std::abs((int)d_doppler_max)/d_doppler_step); - for (int doppler = (int)(-d_doppler_max); doppler <= (int)d_doppler_max; doppler += d_doppler_step) + // Count the number of bins + d_num_doppler_bins = 0; + for (int doppler = (int)(-d_doppler_max); + doppler <= (int)d_doppler_max; + doppler += d_doppler_step) { d_num_doppler_bins++; } + + // Create the carrier Doppler wipeoff signals d_grid_doppler_wipeoffs = new gr_complex*[d_num_doppler_bins]; for (unsigned int doppler_index=0;doppler_indexexecute(); // Multiply carrier wiped--off, Fourier transformed incoming signal - // with the local FFT'd code reference {data+j*pilot} using SIMD operations with VOLK library + // with the local FFT'd data code reference (E1B) using SIMD operations + // with VOLK library volk_32fc_x2_multiply_32fc_a(d_ifft->get_inbuf(), d_fft_if->get_outbuf(), d_fft_code_data, d_fft_size); // compute the inverse FFT d_ifft->execute(); + // Copy the result of the correlation between wiped--off signal and data code in + // d_data_correlation. memcpy(d_data_correlation, d_ifft->get_outbuf(), sizeof(gr_complex)*d_fft_size); - + // Multiply carrier wiped--off, Fourier transformed incoming signal + // with the local FFT'd pilot code reference (E1C) using SIMD operations + // with VOLK library volk_32fc_x2_multiply_32fc_a(d_ifft->get_inbuf(), d_fft_if->get_outbuf(), d_fft_code_pilot, d_fft_size); + // Compute the inverse FFT d_ifft->execute(); + // Copy the result of the correlation between wiped--off signal and pilot code in + // d_data_correlation. memcpy(d_pilot_correlation, d_ifft->get_outbuf(), sizeof(gr_complex)*d_fft_size); for (unsigned int i = 0; i < d_fft_size; i++) @@ -354,14 +360,13 @@ int pcps_cccwsr_acquisition_cc::general_work(int noutput_items, { d_state = 2; // Positive acquisition } - else + else if (d_well_count == d_max_dwells) { - if (d_well_count == d_max_dwells) - { - d_state = 3; // Negative acquisition - } + d_state = 3; // Negative acquisition } + consume_each(1); + break; } diff --git a/src/algorithms/acquisition/gnuradio_blocks/pcps_multithread_acquisition_cc.cc b/src/algorithms/acquisition/gnuradio_blocks/pcps_multithread_acquisition_cc.cc index 79f746b55..430bca07b 100644 --- a/src/algorithms/acquisition/gnuradio_blocks/pcps_multithread_acquisition_cc.cc +++ b/src/algorithms/acquisition/gnuradio_blocks/pcps_multithread_acquisition_cc.cc @@ -57,7 +57,6 @@ pcps_multithread_acquisition_cc_sptr pcps_make_multithread_acquisition_cc( samples_per_code, bit_transition_flag, queue, dump, dump_filename)); } - pcps_multithread_acquisition_cc::pcps_multithread_acquisition_cc( unsigned int sampled_ms, unsigned int max_dwells, unsigned int doppler_max, long freq, long fs_in, @@ -72,6 +71,7 @@ pcps_multithread_acquisition_cc::pcps_multithread_acquisition_cc( d_sample_counter = 0; // SAMPLE COUNTER d_active = false; d_state = 0; + d_core_working = false; d_queue = queue; d_freq = freq; d_fs_in = fs_in; @@ -86,10 +86,19 @@ pcps_multithread_acquisition_cc::pcps_multithread_acquisition_cc( d_input_power = 0.0; d_num_doppler_bins = 0; d_bit_transition_flag = bit_transition_flag; + d_in_dwell_count = 0; + + d_in_buffer = new gr_complex*[d_max_dwells]; //todo: do something if posix_memalign fails + for (unsigned int i = 0; i < d_max_dwells; i++) + { + if (posix_memalign((void**)&d_in_buffer[i], 16, + d_fft_size * sizeof(gr_complex)) == 0){}; + + } if (posix_memalign((void**)&d_fft_codes, 16, d_fft_size * sizeof(gr_complex)) == 0){}; - if (posix_memalign((void**)&d_magnitude, 16, d_fft_size * sizeof(gr_complex)) == 0){}; + if (posix_memalign((void**)&d_magnitude, 16, d_fft_size * sizeof(float)) == 0){}; // Direct FFT d_fft_if = new gr::fft::fft_complex(d_fft_size, true); @@ -102,21 +111,23 @@ pcps_multithread_acquisition_cc::pcps_multithread_acquisition_cc( d_dump_filename = dump_filename; } - pcps_multithread_acquisition_cc::~pcps_multithread_acquisition_cc() { - - for (unsigned int doppler_index = 0; doppler_index < d_num_doppler_bins; doppler_index++) - { - free(d_grid_doppler_wipeoffs[doppler_index]); - } - - if (d_num_doppler_bins > 0) { + for (unsigned int i = 0; i < d_num_doppler_bins; i++) + { + free(d_grid_doppler_wipeoffs[i]); + } delete[] d_grid_doppler_wipeoffs; } + for (unsigned int i = 0; i < d_max_dwells; i++) + { + free(d_in_buffer[i]); + } + delete[] d_in_buffer; + free(d_fft_codes); free(d_magnitude); @@ -129,6 +140,35 @@ pcps_multithread_acquisition_cc::~pcps_multithread_acquisition_cc() } } +void pcps_multithread_acquisition_cc::init() +{ + d_gnss_synchro->Acq_delay_samples = 0.0; + d_gnss_synchro->Acq_doppler_hz = 0.0; + d_gnss_synchro->Acq_samplestamp_samples = 0; + d_mag = 0.0; + d_input_power = 0.0; + + // Count the number of bins + d_num_doppler_bins = 0; + for (int doppler = (int)(-d_doppler_max); + doppler <= (int)d_doppler_max; + doppler += d_doppler_step) + { + d_num_doppler_bins++; + } + + // Create the carrier Doppler wipeoff signals + d_grid_doppler_wipeoffs = new gr_complex*[d_num_doppler_bins]; + for (unsigned int doppler_index=0;doppler_index * code) { @@ -147,13 +187,16 @@ void pcps_multithread_acquisition_cc::set_local_code(std::complex * code) } } -void pcps_multithread_acquisition_cc::perform_acquisition(const gr_complex* in, unsigned int samplestamp) +void pcps_multithread_acquisition_cc::acquisition_core() { // initialize acquisition algorithm int doppler; unsigned int indext = 0; float magt = 0.0; float fft_normalization_factor = (float)d_fft_size * (float)d_fft_size; + gr_complex* in = d_in_buffer[d_well_count]; + unsigned long int samplestamp = d_sample_counter_buffer[d_well_count]; + d_input_power = 0.0; d_mag = 0.0; @@ -204,7 +247,14 @@ void pcps_multithread_acquisition_cc::perform_acquisition(const gr_complex* in, { d_mag = magt; - if (d_test_statistics < (magt / d_input_power) || !d_bit_transition_flag) + // In case that d_bit_transition_flag = true, we compare the potentially + // new maximum test statistics (d_mag/d_input_power) with the value in + // d_test_statistics. When the second dwell is being processed, the value + // of d_mag/d_input_power could be lower than d_test_statistics (i.e, + // the maximum test statistics in the previous dwell is greater than + // current d_mag/d_input_power). Note that d_test_statistics is not + // restarted between consecutive dwells in multidwell operation. + if (d_test_statistics < (d_mag / d_input_power) || !d_bit_transition_flag) { d_gnss_synchro->Acq_delay_samples = (double)(indext % d_samples_per_code); d_gnss_synchro->Acq_doppler_hz = (double)doppler; @@ -235,68 +285,31 @@ void pcps_multithread_acquisition_cc::perform_acquisition(const gr_complex* in, { if (d_test_statistics > d_threshold) { - d_state = 3; // Positive acquisition + d_state = 2; // Positive acquisition } - else + else if (d_well_count == d_max_dwells) { - if (d_well_count == d_max_dwells) - { - d_state = 4; // Negative acquisition - } - else - { - d_state = 1; // Process next block - } + d_state = 3; // Negative acquisition } } else { - if (d_well_count == d_max_dwells) + if (d_well_count == d_max_dwells) // d_max_dwells = 2 { if (d_test_statistics > d_threshold) { - d_state = 3; // Positive acquisition + d_state = 2; // Positive acquisition } else { - d_state = 4; // Negative acquisition + d_state = 3; // Negative acquisition } } - else - { - d_state = 1; // Process next block - } } + + d_core_working = false; } - -void pcps_multithread_acquisition_cc::init() -{ - d_gnss_synchro->Acq_delay_samples = 0.0; - d_gnss_synchro->Acq_doppler_hz = 0.0; - d_gnss_synchro->Acq_samplestamp_samples = 0; - d_mag = 0.0; - d_input_power = 0.0; - - // Create the carrier Doppler wipeoff signals - d_num_doppler_bins = 0;//floor(2*std::abs((int)d_doppler_max)/d_doppler_step); - for (int doppler = (int)(-d_doppler_max); doppler <= (int)d_doppler_max; doppler += d_doppler_step) - { - d_num_doppler_bins++; - } - d_grid_doppler_wipeoffs = new gr_complex*[d_num_doppler_bins]; - for (unsigned int doppler_index=0;doppler_index (int)num_dwells) + { + d_sample_counter += d_fft_size * (ninput_items[0]-num_dwells); + } + } + else + { + // We already have d_max_dwells consecutive blocks in the internal buffer, + // just skip input blocks. + d_sample_counter += d_fft_size * ninput_items[0]; + } + + // We create a new thread to process next block if the following + // conditions are fulfilled: + // 1. There are new blocks in d_in_buffer that have not been processed yet + // (d_well_count < d_in_dwell_count). + // 2. No other acquisition_core thead is working (!d_core_working). + // 3. d_state==1. We need to check again d_state because it can be modified at any + // moment by the external thread (may have changed since checked in the switch()). + // If the external thread has already declared positive (d_state=2) or negative + // (d_state=3) acquisition, we don't have to process next block!! + if ((d_well_count < d_in_dwell_count) && !d_core_working && d_state==1) + { + d_core_working = true; + boost::thread(&pcps_multithread_acquisition_cc::acquisition_core, this); + } break; } case 2: { - d_sample_counter += d_fft_size * ninput_items[0]; // sample counter - consume_each(ninput_items[0]); - break; - } - case 3: - { - // Declare positive acquisition using a message queue DLOG(INFO) << "positive acquisition"; DLOG(INFO) << "satellite " << d_gnss_synchro->System << " " << d_gnss_synchro->PRN; @@ -363,7 +405,6 @@ int pcps_multithread_acquisition_cc::general_work(int noutput_items, d_state = 0; d_sample_counter += d_fft_size * ninput_items[0]; // sample counter - consume_each(ninput_items[0]); acquisition_message = 1; d_channel_internal_queue->push(acquisition_message); @@ -371,7 +412,7 @@ int pcps_multithread_acquisition_cc::general_work(int noutput_items, break; } - case 4: + case 3: { // Declare negative acquisition using a message queue DLOG(INFO) << "negative acquisition"; @@ -388,7 +429,6 @@ int pcps_multithread_acquisition_cc::general_work(int noutput_items, d_state = 0; d_sample_counter += d_fft_size * ninput_items[0]; // sample counter - consume_each(ninput_items[0]); acquisition_message = 2; d_channel_internal_queue->push(acquisition_message); @@ -397,5 +437,7 @@ int pcps_multithread_acquisition_cc::general_work(int noutput_items, } } + consume_each(ninput_items[0]); + return 0; } diff --git a/src/algorithms/acquisition/gnuradio_blocks/pcps_multithread_acquisition_cc.h b/src/algorithms/acquisition/gnuradio_blocks/pcps_multithread_acquisition_cc.h index d9a5ba8be..65df7ab0e 100644 --- a/src/algorithms/acquisition/gnuradio_blocks/pcps_multithread_acquisition_cc.h +++ b/src/algorithms/acquisition/gnuradio_blocks/pcps_multithread_acquisition_cc.h @@ -134,9 +134,13 @@ private: std::ofstream d_dump_file; bool d_active; int d_state; + bool d_core_working; bool d_dump; unsigned int d_channel; std::string d_dump_filename; + gr_complex** d_in_buffer; + std::vector d_sample_counter_buffer; + unsigned int d_in_dwell_count; public: /*! @@ -237,7 +241,7 @@ public: gr_vector_const_void_star &input_items, gr_vector_void_star &output_items); - void perform_acquisition(const gr_complex* in, const unsigned int samplestamp); + void acquisition_core(); }; #endif /* GNSS_SDR_PCPS_MULTITHREAD_ACQUISITION_CC_H_*/ diff --git a/src/algorithms/acquisition/gnuradio_blocks/pcps_opencl_acquisition_cc.cc b/src/algorithms/acquisition/gnuradio_blocks/pcps_opencl_acquisition_cc.cc new file mode 100644 index 000000000..aa74ce5c0 --- /dev/null +++ b/src/algorithms/acquisition/gnuradio_blocks/pcps_opencl_acquisition_cc.cc @@ -0,0 +1,809 @@ +/*! + * \file pcps_opencl_acquisition_cc.cc + * \brief This class implements a Parallel Code Phase Search Acquisition + * using OpenCL to offload some functions to the GPU. + * + * Acquisition strategy (Kay Borre book + CFAR threshold). + *
    + *
  1. Compute the input signal power estimation + *
  2. Doppler serial search loop + *
  3. Perform the FFT-based circular convolution (parallel time search) + *
  4. Record the maximum peak and the associated synchronization parameters + *
  5. Compute the test statistics and compare to the threshold + *
  6. Declare positive or negative acquisition using a message queue + *
+ * + * Kay Borre book: K.Borre, D.M.Akos, N.Bertelsen, P.Rinder, and S.H.Jensen, + * "A Software-Defined GPS and Galileo Receiver. A Single-Frequency + * Approach", Birkha user, 2007. pp 81-84 + * + * \authors
    + *
  • Javier Arribas, 2011. jarribas(at)cttc.es + *
  • Luis Esteve, 2012. luis(at)epsilon-formacion.com + *
  • Marc Molina, 2013. marc.molina.pena@gmail.com + *
+ * + * ------------------------------------------------------------------------- + * + * Copyright (C) 2010-2012 (see AUTHORS file for a list of contributors) + * + * GNSS-SDR is a software defined Global Navigation + * Satellite Systems receiver + * + * This file is part of GNSS-SDR. + * + * GNSS-SDR is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * at your option) any later version. + * + * GNSS-SDR is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNSS-SDR. If not, see . + * + * ------------------------------------------------------------------------- + */ + +#include "pcps_opencl_acquisition_cc.h" +#include "gnss_signal_processing.h" +#include "control_message_factory.h" +#include "fft_base_kernels.h" +#include "fft_internal.h" +#include +#include +#include +#include +#include +#include +#include +#include + +using google::LogMessage; + +pcps_opencl_acquisition_cc_sptr pcps_make_opencl_acquisition_cc( + unsigned int sampled_ms, unsigned int max_dwells, + unsigned int doppler_max, long freq, long fs_in, + int samples_per_ms, int samples_per_code, + bool bit_transition_flag, + gr::msg_queue::sptr queue, bool dump, + std::string dump_filename) +{ + + return pcps_opencl_acquisition_cc_sptr( + new pcps_opencl_acquisition_cc(sampled_ms, max_dwells, doppler_max, freq, fs_in, samples_per_ms, + samples_per_code, bit_transition_flag, queue, dump, dump_filename)); +} + +pcps_opencl_acquisition_cc::pcps_opencl_acquisition_cc( + unsigned int sampled_ms, unsigned int max_dwells, + unsigned int doppler_max, long freq, long fs_in, + int samples_per_ms, int samples_per_code, + bool bit_transition_flag, + gr::msg_queue::sptr queue, bool dump, + std::string dump_filename) : + gr::block("pcps_opencl_acquisition_cc", + gr::io_signature::make(1, 1, sizeof(gr_complex) * sampled_ms * samples_per_ms), + gr::io_signature::make(0, 0, sizeof(gr_complex) * sampled_ms * samples_per_ms)) +{ + d_sample_counter = 0; // SAMPLE COUNTER + d_active = false; + d_state = 0; + d_core_working = false; + d_queue = queue; + d_freq = freq; + d_fs_in = fs_in; + d_samples_per_ms = samples_per_ms; + d_samples_per_code = samples_per_code; + d_sampled_ms = sampled_ms; + d_max_dwells = max_dwells; + d_well_count = 0; + d_doppler_max = doppler_max; + d_fft_size = d_sampled_ms * d_samples_per_ms; + d_fft_size_pow2 = pow(2,ceil(log2(2*d_fft_size))); + d_mag = 0; + d_input_power = 0.0; + d_num_doppler_bins = 0; + d_bit_transition_flag = bit_transition_flag; + d_in_dwell_count = 0; + d_cl_fft_batch_size = 1; + + d_in_buffer = new gr_complex*[d_max_dwells]; + + //todo: do something if posix_memalign fails + for (unsigned int i = 0; i < d_max_dwells; i++) + { + if (posix_memalign((void**)&d_in_buffer[i], 16, + d_fft_size * sizeof(gr_complex)) == 0){}; + + } + if (posix_memalign((void**)&d_magnitude, 16, d_fft_size * sizeof(float)) == 0){}; + if (posix_memalign((void**)&d_fft_codes, 16, d_fft_size_pow2 * sizeof(gr_complex)) == 0){}; + if (posix_memalign((void**)&d_zero_vector, 16, (d_fft_size_pow2-d_fft_size) * sizeof(gr_complex)) == 0){}; + + for (unsigned int i = 0; i < (d_fft_size_pow2-d_fft_size); i++) + { + d_zero_vector[i] = gr_complex(0.0,0.0); + } + + d_opencl = init_opencl_environment("math_kernel.cl"); + + if (d_opencl != 0) + { + // Direct FFT + d_fft_if = new gr::fft::fft_complex(d_fft_size, true); + + // Inverse FFT + d_ifft = new gr::fft::fft_complex(d_fft_size, false); + } + + // For dumping samples into a file + d_dump = dump; + d_dump_filename = dump_filename; + +} + +pcps_opencl_acquisition_cc::~pcps_opencl_acquisition_cc() +{ + if (d_num_doppler_bins > 0) + { + for (unsigned int i = 0; i < d_num_doppler_bins; i++) + { + free(d_grid_doppler_wipeoffs[i]); + } + delete[] d_grid_doppler_wipeoffs; + } + + for (unsigned int i = 0; i < d_max_dwells; i++) + { + free(d_in_buffer[i]); + } + delete[] d_in_buffer; + + free(d_fft_codes); + free(d_magnitude); + free(d_zero_vector); + + if (d_opencl == 0) + { + delete d_cl_queue; + delete d_cl_buffer_in; + delete d_cl_buffer_1; + delete d_cl_buffer_2; + delete d_cl_buffer_magnitude; + delete d_cl_buffer_fft_codes; + if(d_num_doppler_bins > 0) + { + delete[] d_cl_buffer_grid_doppler_wipeoffs; + } + + clFFT_DestroyPlan(d_cl_fft_plan); + } + else + { + delete d_ifft; + delete d_fft_if; + } + + if (d_dump) + { + d_dump_file.close(); + } +} + +int pcps_opencl_acquisition_cc::init_opencl_environment(std::string kernel_filename) +{ + //get all platforms (drivers) + std::vector all_platforms; + cl::Platform::get(&all_platforms); + + if(all_platforms.size()==0) + { + std::cout << "No OpenCL platforms found. Check OpenCL installation!" << std::endl; + return 1; + } + + d_cl_platform = all_platforms[0]; //get default platform + std::cout << "Using platform: " << d_cl_platform.getInfo() + << std::endl; + + //get default GPU device of the default platform + std::vector gpu_devices; + d_cl_platform.getDevices(CL_DEVICE_TYPE_GPU, &gpu_devices); + + if(gpu_devices.size()==0) + { + std::cout << "No GPU devices found. Check OpenCL installation!" << std::endl; + return 2; + } + + d_cl_device = gpu_devices[0]; + + std::vector device; + device.push_back(d_cl_device); + std::cout << "Using device: " << d_cl_device.getInfo() << std::endl; + + cl::Context context(device); + d_cl_context = context; + + // build the program from the source in the file + std::ifstream kernel_file(kernel_filename, std::ifstream::in); + std::string kernel_code(std::istreambuf_iterator(kernel_file), + (std::istreambuf_iterator())); + kernel_file.close(); + +// std::cout << "Kernel code: \n" << kernel_code << std::endl; + + cl::Program::Sources sources; + + sources.push_back({kernel_code.c_str(),kernel_code.length()}); + + cl::Program program(context,sources); + if(program.build(device)!=CL_SUCCESS) + { + std::cout << " Error building: " + << program.getBuildInfo(device[0]) + << std::endl; + return 3; + } + d_cl_program = program; + + // create buffers on the device + d_cl_buffer_in = new cl::Buffer(d_cl_context,CL_MEM_READ_WRITE,sizeof(gr_complex)*d_fft_size); + d_cl_buffer_fft_codes = new cl::Buffer(d_cl_context,CL_MEM_READ_WRITE,sizeof(gr_complex)*d_fft_size_pow2); + d_cl_buffer_1 = new cl::Buffer(d_cl_context,CL_MEM_READ_WRITE,sizeof(gr_complex)*d_fft_size_pow2); + d_cl_buffer_2 = new cl::Buffer(d_cl_context,CL_MEM_READ_WRITE,sizeof(gr_complex)*d_fft_size_pow2); + d_cl_buffer_magnitude = new cl::Buffer(d_cl_context,CL_MEM_READ_WRITE,sizeof(float)*d_fft_size); + + //create queue to which we will push commands for the device. + d_cl_queue = new cl::CommandQueue(d_cl_context,d_cl_device); + + //create FFT plan + cl_int err; + clFFT_Dim3 dim = {d_fft_size_pow2, 1, 1}; + + d_cl_fft_plan = clFFT_CreatePlan(d_cl_context(), dim, clFFT_1D, + clFFT_InterleavedComplexFormat, &err); + + if (err != 0) + { + delete d_cl_queue; + delete d_cl_buffer_in; + delete d_cl_buffer_1; + delete d_cl_buffer_2; + delete d_cl_buffer_magnitude; + delete d_cl_buffer_fft_codes; + + std::cout << "Error creating OpenCL FFT plan." << std::endl; + return 4; + } + + return 0; +} + +void pcps_opencl_acquisition_cc::init() +{ + d_gnss_synchro->Acq_delay_samples = 0.0; + d_gnss_synchro->Acq_doppler_hz = 0.0; + d_gnss_synchro->Acq_samplestamp_samples = 0; + d_mag = 0.0; + d_input_power = 0.0; + + // Count the number of bins + d_num_doppler_bins = 0; + for (int doppler = (int)(-d_doppler_max); + doppler <= (int)d_doppler_max; + doppler += d_doppler_step) + { + d_num_doppler_bins++; + } + + // Create the carrier Doppler wipeoff signals + d_grid_doppler_wipeoffs = new gr_complex*[d_num_doppler_bins]; + if (d_opencl == 0) + { + d_cl_buffer_grid_doppler_wipeoffs = new cl::Buffer*[d_num_doppler_bins]; + } + + for (unsigned int doppler_index=0;doppler_indexenqueueWriteBuffer(*(d_cl_buffer_grid_doppler_wipeoffs[doppler_index]), + CL_TRUE,0,sizeof(gr_complex)*d_fft_size, + d_grid_doppler_wipeoffs[doppler_index]); + } + } + + // zero padding in buffer_1 (FFT input) + if (d_opencl == 0) + { + d_cl_queue->enqueueWriteBuffer(*d_cl_buffer_1,CL_TRUE,sizeof(gr_complex)*d_fft_size, + sizeof(gr_complex)*(d_fft_size_pow2-d_fft_size),d_zero_vector); + } +} + +void pcps_opencl_acquisition_cc::set_local_code(std::complex * code) +{ + if(d_opencl == 0) + { + d_cl_queue->enqueueWriteBuffer(*d_cl_buffer_2,CL_TRUE,0, + sizeof(gr_complex)*d_fft_size, code); + + d_cl_queue->enqueueWriteBuffer(*d_cl_buffer_2,CL_TRUE,sizeof(gr_complex)*d_fft_size, + sizeof(gr_complex)*(d_fft_size_pow2 - 2*d_fft_size), + d_zero_vector); + + d_cl_queue->enqueueWriteBuffer(*d_cl_buffer_2,CL_TRUE,sizeof(gr_complex) + *(d_fft_size_pow2 - d_fft_size), + sizeof(gr_complex)*d_fft_size, code); + + clFFT_ExecuteInterleaved((*d_cl_queue)(), d_cl_fft_plan, d_cl_fft_batch_size, + clFFT_Forward, (*d_cl_buffer_2)(), (*d_cl_buffer_2)(), + 0, NULL, NULL); + + //Conjucate the local code + cl::Kernel kernel=cl::Kernel(d_cl_program,"conj_vector"); + kernel.setArg(0,*d_cl_buffer_2); //input + kernel.setArg(1,*d_cl_buffer_fft_codes); //output + d_cl_queue->enqueueNDRangeKernel(kernel,cl::NullRange,cl::NDRange(d_fft_size_pow2),cl::NullRange); + } + else + { + memcpy(d_fft_if->get_inbuf(), code, sizeof(gr_complex)*d_fft_size); + + d_fft_if->execute(); // We need the FFT of local code + + //Conjugate the local code + if (is_unaligned()) + { + volk_32fc_conjugate_32fc_u(d_fft_codes,d_fft_if->get_outbuf(),d_fft_size); + } + else + { + volk_32fc_conjugate_32fc_a(d_fft_codes,d_fft_if->get_outbuf(),d_fft_size); + } + } +} + +void pcps_opencl_acquisition_cc::acquisition_core_volk() +{ + // initialize acquisition algorithm + int doppler; + unsigned int indext = 0; + float magt = 0.0; + float fft_normalization_factor = (float)d_fft_size * (float)d_fft_size; + gr_complex* in = d_in_buffer[d_well_count]; + unsigned long int samplestamp = d_sample_counter_buffer[d_well_count]; + + d_input_power = 0.0; + d_mag = 0.0; + + d_well_count++; + + DLOG(INFO) << "Channel: " << d_channel + << " , doing acquisition of satellite: " << d_gnss_synchro->System << " "<< d_gnss_synchro->PRN + << " ,sample stamp: " << d_sample_counter << ", threshold: " + << d_threshold << ", doppler_max: " << d_doppler_max + << ", doppler_step: " << d_doppler_step; + + // 1- Compute the input signal power estimation + volk_32fc_magnitude_squared_32f_a(d_magnitude, in, d_fft_size); + volk_32f_accumulator_s32f_a(&d_input_power, d_magnitude, d_fft_size); + d_input_power /= (float)d_fft_size; + + // 2- Doppler frequency search loop + for (unsigned int doppler_index=0;doppler_indexget_inbuf(), in, + d_grid_doppler_wipeoffs[doppler_index], d_fft_size); + + // 3- Perform the FFT-based convolution (parallel time search) + // Compute the FFT of the carrier wiped--off incoming signal + d_fft_if->execute(); + + // Multiply carrier wiped--off, Fourier transformed incoming signal + // with the local FFT'd code reference using SIMD operations with VOLK library + volk_32fc_x2_multiply_32fc_a(d_ifft->get_inbuf(), + d_fft_if->get_outbuf(), d_fft_codes, d_fft_size); + + // compute the inverse FFT + d_ifft->execute(); + + // Search maximum + volk_32fc_magnitude_squared_32f_a(d_magnitude, d_ifft->get_outbuf(), d_fft_size); + volk_32f_index_max_16u_a(&indext, d_magnitude, d_fft_size); + + // Normalize the maximum value to correct the scale factor introduced by FFTW + magt = d_magnitude[indext] / (fft_normalization_factor * fft_normalization_factor); + + // 4- record the maximum peak and the associated synchronization parameters + if (d_mag < magt) + { + d_mag = magt; + + // In case that d_bit_transition_flag = true, we compare the potentially + // new maximum test statistics (d_mag/d_input_power) with the value in + // d_test_statistics. When the second dwell is being processed, the value + // of d_mag/d_input_power could be lower than d_test_statistics (i.e, + // the maximum test statistics in the previous dwell is greater than + // current d_mag/d_input_power). Note that d_test_statistics is not + // restarted between consecutive dwells in multidwell operation. + if (d_test_statistics < (d_mag / d_input_power) || !d_bit_transition_flag) + { + d_gnss_synchro->Acq_delay_samples = (double)(indext % d_samples_per_code); + d_gnss_synchro->Acq_doppler_hz = (double)doppler; + d_gnss_synchro->Acq_samplestamp_samples = samplestamp; + + // 5- Compute the test statistics and compare to the threshold + //d_test_statistics = 2 * d_fft_size * d_mag / d_input_power; + d_test_statistics = d_mag / d_input_power; + } + } + + // Record results to file if required + if (d_dump) + { + std::stringstream filename; + std::streamsize n = 2 * sizeof(float) * (d_fft_size); // complex file write + filename.str(""); + filename << "../data/test_statistics_" << d_gnss_synchro->System + <<"_" << d_gnss_synchro->Signal << "_sat_" + << d_gnss_synchro->PRN << "_doppler_" << doppler << ".dat"; + d_dump_file.open(filename.str().c_str(), std::ios::out | std::ios::binary); + d_dump_file.write((char*)d_ifft->get_outbuf(), n); //write directly |abs(x)|^2 in this Doppler bin? + d_dump_file.close(); + } + } + + if (!d_bit_transition_flag) + { + if (d_test_statistics > d_threshold) + { + d_state = 2; // Positive acquisition + } + else if (d_well_count == d_max_dwells) + { + d_state = 3; // Negative acquisition + } + } + else + { + if (d_well_count == d_max_dwells) // d_max_dwells = 2 + { + if (d_test_statistics > d_threshold) + { + d_state = 2; // Positive acquisition + } + else + { + d_state = 3; // Negative acquisition + } + } + } + + d_core_working = false; +} + +void pcps_opencl_acquisition_cc::acquisition_core_opencl() +{ + // initialize acquisition algorithm + int doppler; + unsigned int indext = 0; + float magt = 0.0; + float fft_normalization_factor = ((float)d_fft_size_pow2 * (float)d_fft_size); //This works, but I am not sure why. + gr_complex* in = d_in_buffer[d_well_count]; + unsigned long int samplestamp = d_sample_counter_buffer[d_well_count]; + + d_input_power = 0.0; + d_mag = 0.0; + + // write input vector in buffer of OpenCL device + d_cl_queue->enqueueWriteBuffer(*d_cl_buffer_in,CL_TRUE,0,sizeof(gr_complex)*d_fft_size,in); + + d_well_count++; + +// struct timeval tv; +// long long int begin = 0; +// long long int end = 0; + +// gettimeofday(&tv, NULL); +// begin = tv.tv_sec *1e6 + tv.tv_usec; + + DLOG(INFO) << "Channel: " << d_channel + << " , doing acquisition of satellite: " << d_gnss_synchro->System << " "<< d_gnss_synchro->PRN + << " ,sample stamp: " << d_sample_counter << ", threshold: " + << d_threshold << ", doppler_max: " << d_doppler_max + << ", doppler_step: " << d_doppler_step; + + // 1- Compute the input signal power estimation + volk_32fc_magnitude_squared_32f_a(d_magnitude, in, d_fft_size); + volk_32f_accumulator_s32f_a(&d_input_power, d_magnitude, d_fft_size); + d_input_power /= (float)d_fft_size; + + cl::Kernel kernel; + + // 2- Doppler frequency search loop + for (unsigned int doppler_index=0;doppler_indexenqueueNDRangeKernel(kernel,cl::NullRange, cl::NDRange(d_fft_size), + cl::NullRange); + + // In the previous operation, we store the result in the first d_fft_size positions + // of d_cl_buffer_1. The rest d_fft_size_pow2-d_fft_size already have zeros + // (zero-padding is made in init() for optimization purposes). + + clFFT_ExecuteInterleaved((*d_cl_queue)(), d_cl_fft_plan, d_cl_fft_batch_size, + clFFT_Forward,(*d_cl_buffer_1)(), (*d_cl_buffer_2)(), + 0, NULL, NULL); + + // Multiply carrier wiped--off, Fourier transformed incoming signal + // with the local FFT'd code reference + kernel = cl::Kernel(d_cl_program,"mult_vectors"); + kernel.setArg(0,*d_cl_buffer_2); //input 1 + kernel.setArg(1,*d_cl_buffer_fft_codes); //input 2 + kernel.setArg(2,*d_cl_buffer_2); //output + d_cl_queue->enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(d_fft_size_pow2), + cl::NullRange); + + // compute the inverse FFT + clFFT_ExecuteInterleaved((*d_cl_queue)(), d_cl_fft_plan, d_cl_fft_batch_size, + clFFT_Inverse, (*d_cl_buffer_2)(), (*d_cl_buffer_2)(), + 0, NULL, NULL); + + // Compute magnitude + kernel = cl::Kernel(d_cl_program,"magnitude_squared"); + kernel.setArg(0,*d_cl_buffer_2); //input 1 + kernel.setArg(1,*d_cl_buffer_magnitude); //output + d_cl_queue->enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(d_fft_size), + cl::NullRange); + + // This is the only function that blocks this thread until all previously enqueued + // OpenCL commands are completed. + d_cl_queue->enqueueReadBuffer(*d_cl_buffer_magnitude, CL_TRUE, 0, + sizeof(float)*d_fft_size,d_magnitude); + + // Search maximum + // @TODO: find an efficient way to search the maximum with OpenCL in the GPU. + volk_32f_index_max_16u_a(&indext, d_magnitude, d_fft_size); + + // Normalize the maximum value to correct the scale factor introduced by FFTW + magt = d_magnitude[indext] / (fft_normalization_factor * fft_normalization_factor); + + // 4- record the maximum peak and the associated synchronization parameters + if (d_mag < magt) + { + d_mag = magt; + + // In case that d_bit_transition_flag = true, we compare the potentially + // new maximum test statistics (d_mag/d_input_power) with the value in + // d_test_statistics. When the second dwell is being processed, the value + // of d_mag/d_input_power could be lower than d_test_statistics (i.e, + // the maximum test statistics in the previous dwell is greater than + // current d_mag/d_input_power). Note that d_test_statistics is not + // restarted between consecutive dwells in multidwell operation. + if (d_test_statistics < (d_mag / d_input_power) || !d_bit_transition_flag) + { + d_gnss_synchro->Acq_delay_samples = (double)(indext % d_samples_per_code); + d_gnss_synchro->Acq_doppler_hz = (double)doppler; + d_gnss_synchro->Acq_samplestamp_samples = samplestamp; + + // 5- Compute the test statistics and compare to the threshold + //d_test_statistics = 2 * d_fft_size * d_mag / d_input_power; + d_test_statistics = d_mag / d_input_power; + } + } + + // Record results to file if required + if (d_dump) + { + std::stringstream filename; + std::streamsize n = 2 * sizeof(float) * (d_fft_size); // complex file write + filename.str(""); + filename << "../data/test_statistics_" << d_gnss_synchro->System + <<"_" << d_gnss_synchro->Signal << "_sat_" + << d_gnss_synchro->PRN << "_doppler_" << doppler << ".dat"; + d_dump_file.open(filename.str().c_str(), std::ios::out | std::ios::binary); + d_dump_file.write((char*)d_ifft->get_outbuf(), n); //write directly |abs(x)|^2 in this Doppler bin? + d_dump_file.close(); + } + } + +// gettimeofday(&tv, NULL); +// end = tv.tv_sec *1e6 + tv.tv_usec; +// std::cout << "Acq time = " << (end-begin) << " us" << std::endl; + + if (!d_bit_transition_flag) + { + if (d_test_statistics > d_threshold) + { + d_state = 2; // Positive acquisition + } + else if (d_well_count == d_max_dwells) + { + d_state = 3; // Negative acquisition + } + } + else + { + if (d_well_count == d_max_dwells) // d_max_dwells = 2 + { + if (d_test_statistics > d_threshold) + { + d_state = 2; // Positive acquisition + } + else + { + d_state = 3; // Negative acquisition + } + } + } + + d_core_working = false; +} + +int pcps_opencl_acquisition_cc::general_work(int noutput_items, + gr_vector_int &ninput_items, gr_vector_const_void_star &input_items, + gr_vector_void_star &output_items) +{ + + int acquisition_message = -1; //0=STOP_CHANNEL 1=ACQ_SUCCEES 2=ACQ_FAIL + + switch (d_state) + { + case 0: + { + if (d_active) + { + //restart acquisition variables + d_gnss_synchro->Acq_delay_samples = 0.0; + d_gnss_synchro->Acq_doppler_hz = 0.0; + d_gnss_synchro->Acq_samplestamp_samples = 0; + d_well_count = 0; + d_mag = 0.0; + d_input_power = 0.0; + d_test_statistics = 0.0; + d_in_dwell_count = 0; + d_sample_counter_buffer.clear(); + + d_state = 1; + } + + d_sample_counter += d_fft_size * ninput_items[0]; // sample counter + + break; + } + + case 1: + { + if (d_in_dwell_count < d_max_dwells) + { + // Fill internal buffer with d_max_dwells signal blocks. This step ensures that + // consecutive signal blocks will be processed in multi-dwell operation. This is + // essential when d_bit_transition_flag = true. + unsigned int num_dwells = std::min((int)(d_max_dwells-d_in_dwell_count),ninput_items[0]); + for (unsigned int i = 0; i < num_dwells; i++) + { + memcpy(d_in_buffer[d_in_dwell_count++], (gr_complex*)input_items[i], + sizeof(gr_complex)*d_fft_size); + d_sample_counter += d_fft_size; + d_sample_counter_buffer.push_back(d_sample_counter); + } + + if (ninput_items[0] > (int)num_dwells) + { + d_sample_counter += d_fft_size * (ninput_items[0]-num_dwells); + } + } + else + { + // We already have d_max_dwells consecutive blocks in the internal buffer, + // just skip input blocks. + d_sample_counter += d_fft_size * ninput_items[0]; + } + + // We create a new thread to process next block if the following + // conditions are fulfilled: + // 1. There are new blocks in d_in_buffer that have not been processed yet + // (d_well_count < d_in_dwell_count). + // 2. No other acquisition_core thead is working (!d_core_working). + // 3. d_state==1. We need to check again d_state because it can be modified at any + // moment by the external thread (may have changed since checked in the switch()). + // If the external thread has already declared positive (d_state=2) or negative + // (d_state=3) acquisition, we don't have to process next block!! + if ((d_well_count < d_in_dwell_count) && !d_core_working && d_state==1) + { + d_core_working = true; + if (d_opencl == 0) + { // Use OpenCL implementation + boost::thread(&pcps_opencl_acquisition_cc::acquisition_core_opencl, this); + } + else + { // Use Volk implementation + boost::thread(&pcps_opencl_acquisition_cc::acquisition_core_volk, this); + } + } + + break; + } + + case 2: + { + // Declare positive acquisition using a message queue + DLOG(INFO) << "positive acquisition"; + DLOG(INFO) << "satellite " << d_gnss_synchro->System << " " << d_gnss_synchro->PRN; + DLOG(INFO) << "sample_stamp " << d_sample_counter; + DLOG(INFO) << "test statistics value " << d_test_statistics; + DLOG(INFO) << "test statistics threshold " << d_threshold; + DLOG(INFO) << "code phase " << d_gnss_synchro->Acq_delay_samples; + DLOG(INFO) << "doppler " << d_gnss_synchro->Acq_doppler_hz; + DLOG(INFO) << "magnitude " << d_mag; + DLOG(INFO) << "input signal power " << d_input_power; + + d_active = false; + d_state = 0; + + d_sample_counter += d_fft_size * ninput_items[0]; // sample counter + + acquisition_message = 1; + d_channel_internal_queue->push(acquisition_message); + + break; + } + + case 3: + { + // Declare negative acquisition using a message queue + DLOG(INFO) << "negative acquisition"; + DLOG(INFO) << "satellite " << d_gnss_synchro->System << " " << d_gnss_synchro->PRN; + DLOG(INFO) << "sample_stamp " << d_sample_counter; + DLOG(INFO) << "test statistics value " << d_test_statistics; + DLOG(INFO) << "test statistics threshold " << d_threshold; + DLOG(INFO) << "code phase " << d_gnss_synchro->Acq_delay_samples; + DLOG(INFO) << "doppler " << d_gnss_synchro->Acq_doppler_hz; + DLOG(INFO) << "magnitude " << d_mag; + DLOG(INFO) << "input signal power " << d_input_power; + + d_active = false; + d_state = 0; + + d_sample_counter += d_fft_size * ninput_items[0]; // sample counter + + acquisition_message = 2; + d_channel_internal_queue->push(acquisition_message); + + break; + } + } + + consume_each(ninput_items[0]); + + return 0; +} diff --git a/src/algorithms/acquisition/gnuradio_blocks/pcps_opencl_acquisition_cc.h b/src/algorithms/acquisition/gnuradio_blocks/pcps_opencl_acquisition_cc.h new file mode 100644 index 000000000..b2648c1db --- /dev/null +++ b/src/algorithms/acquisition/gnuradio_blocks/pcps_opencl_acquisition_cc.h @@ -0,0 +1,277 @@ +/*! + * \file pcps_opencl_acquisition_cc.h + * \brief This class implements a Parallel Code Phase Search Acquisition + * using OpenCL to offload some functions to the GPU. + * + * Acquisition strategy (Kay Borre book + CFAR threshold). + *
    + *
  1. Compute the input signal power estimation + *
  2. Doppler serial search loop + *
  3. Perform the FFT-based circular convolution (parallel time search) + *
  4. Record the maximum peak and the associated synchronization parameters + *
  5. Compute the test statistics and compare to the threshold + *
  6. Declare positive or negative acquisition using a message queue + *
+ * + * Kay Borre book: K.Borre, D.M.Akos, N.Bertelsen, P.Rinder, and S.H.Jensen, + * "A Software-Defined GPS and Galileo Receiver. A Single-Frequency + * Approach", Birkha user, 2007. pp 81-84 + * + * \authors
    + *
  • Javier Arribas, 2011. jarribas(at)cttc.es + *
  • Luis Esteve, 2012. luis(at)epsilon-formacion.com + *
  • Marc Molina, 2013. marc.molina.pena@gmail.com + *
+ * + * ------------------------------------------------------------------------- + * + * Copyright (C) 2010-2012 (see AUTHORS file for a list of contributors) + * + * GNSS-SDR is a software defined Global Navigation + * Satellite Systems receiver + * + * This file is part of GNSS-SDR. + * + * GNSS-SDR is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * at your option) any later version. + * + * GNSS-SDR is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNSS-SDR. If not, see . + * + * ------------------------------------------------------------------------- + */ + +#ifndef GNSS_SDR_PCPS_OPENCL_ACQUISITION_CC_H_ +#define GNSS_SDR_PCPS_OPENCL_ACQUISITION_CC_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "concurrent_queue.h" +#include "fft_internal.h" +#include "gnss_synchro.h" + +#ifdef APPLE + #include +#else + #include +#endif + +class pcps_opencl_acquisition_cc; + +typedef boost::shared_ptr pcps_opencl_acquisition_cc_sptr; + +pcps_opencl_acquisition_cc_sptr +pcps_make_opencl_acquisition_cc(unsigned int sampled_ms, unsigned int max_dwells, + unsigned int doppler_max, long freq, long fs_in, + int samples_per_ms, int samples_per_code, + bool bit_transition_flag, + gr::msg_queue::sptr queue, bool dump, + std::string dump_filename); + +/*! + * \brief This class implements a Parallel Code Phase Search Acquisition. + * + * Check \ref Navitec2012 "An Open Source Galileo E1 Software Receiver", + * Algorithm 1, for a pseudocode description of this implementation. + */ +class pcps_opencl_acquisition_cc: public gr::block +{ +private: + friend pcps_opencl_acquisition_cc_sptr + pcps_make_opencl_acquisition_cc(unsigned int sampled_ms, unsigned int max_dwells, + unsigned int doppler_max, long freq, long fs_in, + int samples_per_ms, int samples_per_code, + bool bit_transition_flag, + gr::msg_queue::sptr queue, bool dump, + std::string dump_filename); + + + pcps_opencl_acquisition_cc(unsigned int sampled_ms, unsigned int max_dwells, + unsigned int doppler_max, long freq, long fs_in, + int samples_per_ms, int samples_per_code, + bool bit_transition_flag, + gr::msg_queue::sptr queue, bool dump, + std::string dump_filename); + + void calculate_magnitudes(gr_complex* fft_begin, int doppler_shift, + int doppler_offset); + + int init_opencl_environment(std::string kernel_filename); + + long d_fs_in; + long d_freq; + int d_samples_per_ms; + int d_samples_per_code; + unsigned int d_doppler_resolution; + float d_threshold; + std::string d_satellite_str; + unsigned int d_doppler_max; + unsigned int d_doppler_step; + unsigned int d_sampled_ms; + unsigned int d_max_dwells; + unsigned int d_well_count; + unsigned int d_fft_size; + unsigned int d_fft_size_pow2; + int* d_max_doppler_indexs; + unsigned long int d_sample_counter; + gr_complex** d_grid_doppler_wipeoffs; + unsigned int d_num_doppler_bins; + gr_complex* d_fft_codes; + gr::fft::fft_complex* d_fft_if; + gr::fft::fft_complex* d_ifft; + Gnss_Synchro *d_gnss_synchro; + unsigned int d_code_phase; + float d_doppler_freq; + float d_mag; + float* d_magnitude; + float d_input_power; + float d_test_statistics; + bool d_bit_transition_flag; + gr::msg_queue::sptr d_queue; + concurrent_queue *d_channel_internal_queue; + std::ofstream d_dump_file; + bool d_active; + int d_state; + bool d_core_working; + bool d_dump; + unsigned int d_channel; + std::string d_dump_filename; + gr_complex* d_zero_vector; + gr_complex** d_in_buffer; + std::vector d_sample_counter_buffer; + unsigned int d_in_dwell_count; + + cl::Platform d_cl_platform; + cl::Device d_cl_device; + cl::Context d_cl_context; + cl::Program d_cl_program; + cl::Buffer* d_cl_buffer_in; + cl::Buffer* d_cl_buffer_fft_codes; + cl::Buffer* d_cl_buffer_1; + cl::Buffer* d_cl_buffer_2; + cl::Buffer* d_cl_buffer_magnitude; + cl::Buffer** d_cl_buffer_grid_doppler_wipeoffs; + cl::CommandQueue* d_cl_queue; + clFFT_Plan d_cl_fft_plan; + cl_int d_cl_fft_batch_size; + + int d_opencl; + +public: + /*! + * \brief Default destructor. + */ + ~pcps_opencl_acquisition_cc(); + + /*! + * \brief Set acquisition/tracking common Gnss_Synchro object pointer + * to exchange synchronization data between acquisition and tracking blocks. + * \param p_gnss_synchro Satellite information shared by the processing blocks. + */ + void set_gnss_synchro(Gnss_Synchro* p_gnss_synchro) + { + d_gnss_synchro = p_gnss_synchro; + } + + /*! + * \brief Returns the maximum peak of grid search. + */ + unsigned int mag() + { + return d_mag; + } + + /*! + * \brief Initializes acquisition algorithm. + */ + void init(); + + /*! + * \brief Sets local code for PCPS acquisition algorithm. + * \param code - Pointer to the PRN code. + */ + void set_local_code(std::complex * code); + + /*! + * \brief Starts acquisition algorithm, turning from standby mode to + * active mode + * \param active - bool that activates/deactivates the block. + */ + void set_active(bool active) + { + d_active = active; + } + + /*! + * \brief Set acquisition channel unique ID + * \param channel - receiver channel. + */ + void set_channel(unsigned int channel) + { + d_channel = channel; + } + + /*! + * \brief Set statistics threshold of PCPS algorithm. + * \param threshold - Threshold for signal detection (check \ref Navitec2012, + * Algorithm 1, for a definition of this threshold). + */ + void set_threshold(float threshold) + { + d_threshold = threshold; + } + + /*! + * \brief Set maximum Doppler grid search + * \param doppler_max - Maximum Doppler shift considered in the grid search [Hz]. + */ + void set_doppler_max(unsigned int doppler_max) + { + d_doppler_max = doppler_max; + } + + /*! + * \brief Set Doppler steps for the grid search + * \param doppler_step - Frequency bin of the search grid [Hz]. + */ + void set_doppler_step(unsigned int doppler_step) + { + d_doppler_step = doppler_step; + } + + + /*! + * \brief Set tracking channel internal queue. + * \param channel_internal_queue - Channel's internal blocks information queue. + */ + void set_channel_queue(concurrent_queue *channel_internal_queue) + { + d_channel_internal_queue = channel_internal_queue; + } + + /*! + * \brief Parallel Code Phase Search Acquisition signal processing. + */ + int general_work(int noutput_items, gr_vector_int &ninput_items, + gr_vector_const_void_star &input_items, + gr_vector_void_star &output_items); + + void acquisition_core_volk(); + + void acquisition_core_opencl(); +}; + +#endif /* GNSS_SDR_pcps_opencl_acquisition_cc_H_*/ diff --git a/src/algorithms/acquisition/gnuradio_blocks/pcps_tong_acquisition_cc.cc b/src/algorithms/acquisition/gnuradio_blocks/pcps_tong_acquisition_cc.cc index 5adc1aaec..1eb3b3f09 100644 --- a/src/algorithms/acquisition/gnuradio_blocks/pcps_tong_acquisition_cc.cc +++ b/src/algorithms/acquisition/gnuradio_blocks/pcps_tong_acquisition_cc.cc @@ -71,7 +71,6 @@ pcps_tong_acquisition_cc_sptr pcps_tong_make_acquisition_cc( tong_init_val, tong_max_val, queue, dump, dump_filename)); } - pcps_tong_acquisition_cc::pcps_tong_acquisition_cc( unsigned int sampled_ms, unsigned int doppler_max, long freq, long fs_in, int samples_per_ms, @@ -103,7 +102,7 @@ pcps_tong_acquisition_cc::pcps_tong_acquisition_cc( //todo: do something if posix_memalign fails if (posix_memalign((void**)&d_fft_codes, 16, d_fft_size * sizeof(gr_complex)) == 0){}; - if (posix_memalign((void**)&d_magnitude, 16, d_fft_size * sizeof(gr_complex)) == 0){}; + if (posix_memalign((void**)&d_magnitude, 16, d_fft_size * sizeof(float)) == 0){}; // Direct FFT d_fft_if = new gr::fft::fft_complex(d_fft_size, true); @@ -116,19 +115,15 @@ pcps_tong_acquisition_cc::pcps_tong_acquisition_cc( d_dump_filename = dump_filename; } - pcps_tong_acquisition_cc::~pcps_tong_acquisition_cc() { - - for (unsigned int doppler_index = 0; doppler_index < d_num_doppler_bins; doppler_index++) - { - free(d_grid_doppler_wipeoffs[doppler_index]); - free(d_grid_data[doppler_index]); - } - - if (d_num_doppler_bins > 0) { + for (unsigned int i = 0; i < d_num_doppler_bins; i++) + { + free(d_grid_doppler_wipeoffs[i]); + free(d_grid_data[i]); + } delete[] d_grid_doppler_wipeoffs; delete[] d_grid_data; } @@ -145,7 +140,6 @@ pcps_tong_acquisition_cc::~pcps_tong_acquisition_cc() } } - void pcps_tong_acquisition_cc::set_local_code(std::complex * code) { memcpy(d_fft_if->get_inbuf(), code, sizeof(gr_complex)*d_fft_size); @@ -163,7 +157,6 @@ void pcps_tong_acquisition_cc::set_local_code(std::complex * code) } } - void pcps_tong_acquisition_cc::init() { d_gnss_synchro->Acq_delay_samples = 0.0; @@ -172,12 +165,16 @@ void pcps_tong_acquisition_cc::init() d_mag = 0.0; d_input_power = 0.0; - // Create the carrier Doppler wipeoff signals - d_num_doppler_bins = 0;//floor(2*std::abs((int)d_doppler_max)/d_doppler_step); - for (int doppler = (int)(-d_doppler_max); doppler <= (int)d_doppler_max; doppler += d_doppler_step) + // Count the number of bins + d_num_doppler_bins = 0; + for (int doppler = (int)(-d_doppler_max); + doppler <= (int)d_doppler_max; + doppler += d_doppler_step) { d_num_doppler_bins++; } + + // Create the carrier Doppler wipeoff signals and allocate data grid. d_grid_doppler_wipeoffs = new gr_complex*[d_num_doppler_bins]; d_grid_data = new float*[d_num_doppler_bins]; for (unsigned int doppler_index=0;doppler_indexexecute(); - // Search maximum + // Compute magnitude volk_32fc_magnitude_squared_32f_a(d_magnitude, d_ifft->get_outbuf(), d_fft_size); + // Compute vector of test statistics corresponding to current doppler index. volk_32f_s32f_multiply_32f_a(d_magnitude, d_magnitude, 1/(fft_normalization_factor*fft_normalization_factor*d_input_power), d_fft_size); + // Accumulate test statistics in d_grid_data. volk_32f_x2_add_32f_a(d_grid_data[doppler_index], d_magnitude, d_grid_data[doppler_index], d_fft_size); + // Search maximum volk_32f_index_max_16u_a(&indext, d_grid_data[doppler_index], d_fft_size); - // Normalize the maximum value to correct the scale factor introduced by FFTW magt = d_grid_data[doppler_index][indext]; // 4- record the maximum peak and the associated synchronization parameters @@ -328,7 +326,6 @@ int pcps_tong_acquisition_cc::general_work(int noutput_items, } // 5- Compute the test statistics and compare to the threshold - //d_test_statistics = 2 * d_fft_size * d_mag / d_input_power; d_test_statistics = d_mag; if (d_test_statistics > d_threshold*d_well_count) diff --git a/src/algorithms/libs/CMakeLists.txt b/src/algorithms/libs/CMakeLists.txt index 58617d726..25d3b314f 100644 --- a/src/algorithms/libs/CMakeLists.txt +++ b/src/algorithms/libs/CMakeLists.txt @@ -25,16 +25,31 @@ # pass_through.cc #) #else(CMAKE_CXX_COMPILER_ID MATCHES "Clang") -set(GNSS_SPLIBS_SOURCES - galileo_e1_signal_processing.cc - gnss_sdr_valve.cc - gnss_signal_processing.cc - gps_sdr_signal_processing.cc - nco_lib.cc - pass_through.cc -) #endif(CMAKE_CXX_COMPILER_ID MATCHES "Clang") +if(OPENCL_FOUND) + set(GNSS_SPLIBS_SOURCES + galileo_e1_signal_processing.cc + gnss_sdr_valve.cc + gnss_signal_processing.cc + gps_sdr_signal_processing.cc + nco_lib.cc + pass_through.cc + fft_execute.cc # Needs OpenCL + fft_setup.cc # Needs OpenCL + fft_kernelstring.cc # Needs OpenCL + ) +else(OPENCL_FOUND) + set(GNSS_SPLIBS_SOURCES + galileo_e1_signal_processing.cc + gnss_sdr_valve.cc + gnss_signal_processing.cc + gps_sdr_signal_processing.cc + nco_lib.cc + pass_through.cc + ) +endif(OPENCL_FOUND) + include_directories( $(CMAKE_CURRENT_SOURCE_DIR) ${CMAKE_SOURCE_DIR}/src/core/system_parameters @@ -45,5 +60,10 @@ include_directories( ${GFlags_INCLUDE_DIRS} ) +if(OPENCL_FOUND) + include_directories( ${OPENCL_INCLUDE_DIRS} ) + set(OPT_LIBRARIES ${OPT_LIBRARIES} ${OPENCL_LIBRARIES}) +endif(OPENCL_FOUND) + add_library(gnss_sp_libs ${GNSS_SPLIBS_SOURCES}) -target_link_libraries(gnss_sp_libs ${GNURADIO_RUNTIME_LIBRARIES} ${GNURADIO_BLOCKS_LIBRARIES} ${GNURADIO_FFT_LIBRARIES} ${GNURADIO_FILTER_LIBRARIES} gnss_rx) +target_link_libraries(gnss_sp_libs ${GNURADIO_RUNTIME_LIBRARIES} ${GNURADIO_BLOCKS_LIBRARIES} ${GNURADIO_FFT_LIBRARIES} ${GNURADIO_FILTER_LIBRARIES} ${OPT_LIBRARIES} gnss_rx) diff --git a/src/algorithms/libs/clFFT.h b/src/algorithms/libs/clFFT.h new file mode 100644 index 000000000..994d67613 --- /dev/null +++ b/src/algorithms/libs/clFFT.h @@ -0,0 +1,134 @@ + +// +// File: clFFT.h +// +// Version: <1.0> +// +// Disclaimer: IMPORTANT: This Apple software is supplied to you by Apple Inc. ("Apple") +// in consideration of your agreement to the following terms, and your use, +// installation, modification or redistribution of this Apple software +// constitutes acceptance of these terms. If you do not agree with these +// terms, please do not use, install, modify or redistribute this Apple +// software. +// +// In consideration of your agreement to abide by the following terms, and +// subject to these terms, Apple grants you a personal, non - exclusive +// license, under Apple's copyrights in this original Apple software ( the +// "Apple Software" ), to use, reproduce, modify and redistribute the Apple +// Software, with or without modifications, in source and / or binary forms; +// provided that if you redistribute the Apple Software in its entirety and +// without modifications, you must retain this notice and the following text +// and disclaimers in all such redistributions of the Apple Software. Neither +// the name, trademarks, service marks or logos of Apple Inc. may be used to +// endorse or promote products derived from the Apple Software without specific +// prior written permission from Apple. Except as expressly stated in this +// notice, no other rights or licenses, express or implied, are granted by +// Apple herein, including but not limited to any patent rights that may be +// infringed by your derivative works or by other works in which the Apple +// Software may be incorporated. +// +// The Apple Software is provided by Apple on an "AS IS" basis. APPLE MAKES NO +// WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED +// WARRANTIES OF NON - INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A +// PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION +// ALONE OR IN COMBINATION WITH YOUR PRODUCTS. +// +// IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR +// CONSEQUENTIAL DAMAGES ( INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION ) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION +// AND / OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER +// UNDER THEORY OF CONTRACT, TORT ( INCLUDING NEGLIGENCE ), STRICT LIABILITY OR +// OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Copyright ( C ) 2008 Apple Inc. All Rights Reserved. +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +#ifndef __CLFFT_H +#define __CLFFT_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include + +#ifdef APPLE + #include +#else + #include +#endif + +// XForm type +typedef enum +{ + clFFT_Forward = -1, + clFFT_Inverse = 1 + +}clFFT_Direction; + +// XForm dimension +typedef enum +{ + clFFT_1D = 0, + clFFT_2D = 1, + clFFT_3D = 3 + +}clFFT_Dimension; + +// XForm Data type +typedef enum +{ + clFFT_SplitComplexFormat = 0, + clFFT_InterleavedComplexFormat = 1 +}clFFT_DataFormat; + +typedef struct +{ + unsigned int x; + unsigned int y; + unsigned int z; +}clFFT_Dim3; + +typedef struct +{ + float *real; + float *imag; +} clFFT_SplitComplex; + +typedef struct +{ + float real; + float imag; +}clFFT_Complex; + +typedef void* clFFT_Plan; + +clFFT_Plan clFFT_CreatePlan( cl_context context, clFFT_Dim3 n, clFFT_Dimension dim, clFFT_DataFormat dataFormat, cl_int *error_code ); + +void clFFT_DestroyPlan( clFFT_Plan plan ); + +cl_int clFFT_ExecuteInterleaved( cl_command_queue queue, clFFT_Plan plan, cl_int batchSize, clFFT_Direction dir, + cl_mem data_in, cl_mem data_out, + cl_int num_events, cl_event *event_list, cl_event *event ); + +cl_int clFFT_ExecutePlannar( cl_command_queue queue, clFFT_Plan plan, cl_int batchSize, clFFT_Direction dir, + cl_mem data_in_real, cl_mem data_in_imag, cl_mem data_out_real, cl_mem data_out_imag, + cl_int num_events, cl_event *event_list, cl_event *event ); + +cl_int clFFT_1DTwistInterleaved(clFFT_Plan Plan, cl_command_queue queue, cl_mem array, + size_t numRows, size_t numCols, size_t startRow, size_t rowsToProcess, clFFT_Direction dir); + + +cl_int clFFT_1DTwistPlannar(clFFT_Plan Plan, cl_command_queue queue, cl_mem array_real, cl_mem array_imag, + size_t numRows, size_t numCols, size_t startRow, size_t rowsToProcess, clFFT_Direction dir); + +void clFFT_DumpPlan( clFFT_Plan plan, FILE *file); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/algorithms/libs/fft_base_kernels.h b/src/algorithms/libs/fft_base_kernels.h new file mode 100644 index 000000000..101795697 --- /dev/null +++ b/src/algorithms/libs/fft_base_kernels.h @@ -0,0 +1,277 @@ + +// +// File: fft_base_kernels.h +// +// Version: <1.0> +// +// Disclaimer: IMPORTANT: This Apple software is supplied to you by Apple Inc. ("Apple") +// in consideration of your agreement to the following terms, and your use, +// installation, modification or redistribution of this Apple software +// constitutes acceptance of these terms. If you do not agree with these +// terms, please do not use, install, modify or redistribute this Apple +// software. +// +// In consideration of your agreement to abide by the following terms, and +// subject to these terms, Apple grants you a personal, non - exclusive +// license, under Apple's copyrights in this original Apple software ( the +// "Apple Software" ), to use, reproduce, modify and redistribute the Apple +// Software, with or without modifications, in source and / or binary forms; +// provided that if you redistribute the Apple Software in its entirety and +// without modifications, you must retain this notice and the following text +// and disclaimers in all such redistributions of the Apple Software. Neither +// the name, trademarks, service marks or logos of Apple Inc. may be used to +// endorse or promote products derived from the Apple Software without specific +// prior written permission from Apple. Except as expressly stated in this +// notice, no other rights or licenses, express or implied, are granted by +// Apple herein, including but not limited to any patent rights that may be +// infringed by your derivative works or by other works in which the Apple +// Software may be incorporated. +// +// The Apple Software is provided by Apple on an "AS IS" basis. APPLE MAKES NO +// WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED +// WARRANTIES OF NON - INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A +// PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION +// ALONE OR IN COMBINATION WITH YOUR PRODUCTS. +// +// IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR +// CONSEQUENTIAL DAMAGES ( INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION ) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION +// AND / OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER +// UNDER THEORY OF CONTRACT, TORT ( INCLUDING NEGLIGENCE ), STRICT LIABILITY OR +// OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Copyright ( C ) 2008 Apple Inc. All Rights Reserved. +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +#ifndef __CL_FFT_BASE_KERNELS_ +#define __CL_FFT_BASE_KERNELS_ + +#include + +using namespace std; + +static string baseKernels = string( + "#ifndef M_PI\n" + "#define M_PI 0x1.921fb54442d18p+1\n" + "#endif\n" + "#define complexMul(a,b) ((float2)(mad(-(a).y, (b).y, (a).x * (b).x), mad((a).y, (b).x, (a).x * (b).y)))\n" + "#define conj(a) ((float2)((a).x, -(a).y))\n" + "#define conjTransp(a) ((float2)(-(a).y, (a).x))\n" + "\n" + "#define fftKernel2(a,dir) \\\n" + "{ \\\n" + " float2 c = (a)[0]; \\\n" + " (a)[0] = c + (a)[1]; \\\n" + " (a)[1] = c - (a)[1]; \\\n" + "}\n" + "\n" + "#define fftKernel2S(d1,d2,dir) \\\n" + "{ \\\n" + " float2 c = (d1); \\\n" + " (d1) = c + (d2); \\\n" + " (d2) = c - (d2); \\\n" + "}\n" + "\n" + "#define fftKernel4(a,dir) \\\n" + "{ \\\n" + " fftKernel2S((a)[0], (a)[2], dir); \\\n" + " fftKernel2S((a)[1], (a)[3], dir); \\\n" + " fftKernel2S((a)[0], (a)[1], dir); \\\n" + " (a)[3] = (float2)(dir)*(conjTransp((a)[3])); \\\n" + " fftKernel2S((a)[2], (a)[3], dir); \\\n" + " float2 c = (a)[1]; \\\n" + " (a)[1] = (a)[2]; \\\n" + " (a)[2] = c; \\\n" + "}\n" + "\n" + "#define fftKernel4s(a0,a1,a2,a3,dir) \\\n" + "{ \\\n" + " fftKernel2S((a0), (a2), dir); \\\n" + " fftKernel2S((a1), (a3), dir); \\\n" + " fftKernel2S((a0), (a1), dir); \\\n" + " (a3) = (float2)(dir)*(conjTransp((a3))); \\\n" + " fftKernel2S((a2), (a3), dir); \\\n" + " float2 c = (a1); \\\n" + " (a1) = (a2); \\\n" + " (a2) = c; \\\n" + "}\n" + "\n" + "#define bitreverse8(a) \\\n" + "{ \\\n" + " float2 c; \\\n" + " c = (a)[1]; \\\n" + " (a)[1] = (a)[4]; \\\n" + " (a)[4] = c; \\\n" + " c = (a)[3]; \\\n" + " (a)[3] = (a)[6]; \\\n" + " (a)[6] = c; \\\n" + "}\n" + "\n" + "#define fftKernel8(a,dir) \\\n" + "{ \\\n" + " const float2 w1 = (float2)(0x1.6a09e6p-1f, dir*0x1.6a09e6p-1f); \\\n" + " const float2 w3 = (float2)(-0x1.6a09e6p-1f, dir*0x1.6a09e6p-1f); \\\n" + " float2 c; \\\n" + " fftKernel2S((a)[0], (a)[4], dir); \\\n" + " fftKernel2S((a)[1], (a)[5], dir); \\\n" + " fftKernel2S((a)[2], (a)[6], dir); \\\n" + " fftKernel2S((a)[3], (a)[7], dir); \\\n" + " (a)[5] = complexMul(w1, (a)[5]); \\\n" + " (a)[6] = (float2)(dir)*(conjTransp((a)[6])); \\\n" + " (a)[7] = complexMul(w3, (a)[7]); \\\n" + " fftKernel2S((a)[0], (a)[2], dir); \\\n" + " fftKernel2S((a)[1], (a)[3], dir); \\\n" + " fftKernel2S((a)[4], (a)[6], dir); \\\n" + " fftKernel2S((a)[5], (a)[7], dir); \\\n" + " (a)[3] = (float2)(dir)*(conjTransp((a)[3])); \\\n" + " (a)[7] = (float2)(dir)*(conjTransp((a)[7])); \\\n" + " fftKernel2S((a)[0], (a)[1], dir); \\\n" + " fftKernel2S((a)[2], (a)[3], dir); \\\n" + " fftKernel2S((a)[4], (a)[5], dir); \\\n" + " fftKernel2S((a)[6], (a)[7], dir); \\\n" + " bitreverse8((a)); \\\n" + "}\n" + "\n" + "#define bitreverse4x4(a) \\\n" + "{ \\\n" + " float2 c; \\\n" + " c = (a)[1]; (a)[1] = (a)[4]; (a)[4] = c; \\\n" + " c = (a)[2]; (a)[2] = (a)[8]; (a)[8] = c; \\\n" + " c = (a)[3]; (a)[3] = (a)[12]; (a)[12] = c; \\\n" + " c = (a)[6]; (a)[6] = (a)[9]; (a)[9] = c; \\\n" + " c = (a)[7]; (a)[7] = (a)[13]; (a)[13] = c; \\\n" + " c = (a)[11]; (a)[11] = (a)[14]; (a)[14] = c; \\\n" + "}\n" + "\n" + "#define fftKernel16(a,dir) \\\n" + "{ \\\n" + " const float w0 = 0x1.d906bcp-1f; \\\n" + " const float w1 = 0x1.87de2ap-2f; \\\n" + " const float w2 = 0x1.6a09e6p-1f; \\\n" + " fftKernel4s((a)[0], (a)[4], (a)[8], (a)[12], dir); \\\n" + " fftKernel4s((a)[1], (a)[5], (a)[9], (a)[13], dir); \\\n" + " fftKernel4s((a)[2], (a)[6], (a)[10], (a)[14], dir); \\\n" + " fftKernel4s((a)[3], (a)[7], (a)[11], (a)[15], dir); \\\n" + " (a)[5] = complexMul((a)[5], (float2)(w0, dir*w1)); \\\n" + " (a)[6] = complexMul((a)[6], (float2)(w2, dir*w2)); \\\n" + " (a)[7] = complexMul((a)[7], (float2)(w1, dir*w0)); \\\n" + " (a)[9] = complexMul((a)[9], (float2)(w2, dir*w2)); \\\n" + " (a)[10] = (float2)(dir)*(conjTransp((a)[10])); \\\n" + " (a)[11] = complexMul((a)[11], (float2)(-w2, dir*w2)); \\\n" + " (a)[13] = complexMul((a)[13], (float2)(w1, dir*w0)); \\\n" + " (a)[14] = complexMul((a)[14], (float2)(-w2, dir*w2)); \\\n" + " (a)[15] = complexMul((a)[15], (float2)(-w0, dir*-w1)); \\\n" + " fftKernel4((a), dir); \\\n" + " fftKernel4((a) + 4, dir); \\\n" + " fftKernel4((a) + 8, dir); \\\n" + " fftKernel4((a) + 12, dir); \\\n" + " bitreverse4x4((a)); \\\n" + "}\n" + "\n" + "#define bitreverse32(a) \\\n" + "{ \\\n" + " float2 c1, c2; \\\n" + " c1 = (a)[2]; (a)[2] = (a)[1]; c2 = (a)[4]; (a)[4] = c1; c1 = (a)[8]; (a)[8] = c2; c2 = (a)[16]; (a)[16] = c1; (a)[1] = c2; \\\n" + " c1 = (a)[6]; (a)[6] = (a)[3]; c2 = (a)[12]; (a)[12] = c1; c1 = (a)[24]; (a)[24] = c2; c2 = (a)[17]; (a)[17] = c1; (a)[3] = c2; \\\n" + " c1 = (a)[10]; (a)[10] = (a)[5]; c2 = (a)[20]; (a)[20] = c1; c1 = (a)[9]; (a)[9] = c2; c2 = (a)[18]; (a)[18] = c1; (a)[5] = c2; \\\n" + " c1 = (a)[14]; (a)[14] = (a)[7]; c2 = (a)[28]; (a)[28] = c1; c1 = (a)[25]; (a)[25] = c2; c2 = (a)[19]; (a)[19] = c1; (a)[7] = c2; \\\n" + " c1 = (a)[22]; (a)[22] = (a)[11]; c2 = (a)[13]; (a)[13] = c1; c1 = (a)[26]; (a)[26] = c2; c2 = (a)[21]; (a)[21] = c1; (a)[11] = c2; \\\n" + " c1 = (a)[30]; (a)[30] = (a)[15]; c2 = (a)[29]; (a)[29] = c1; c1 = (a)[27]; (a)[27] = c2; c2 = (a)[23]; (a)[23] = c1; (a)[15] = c2; \\\n" + "}\n" + "\n" + "#define fftKernel32(a,dir) \\\n" + "{ \\\n" + " fftKernel2S((a)[0], (a)[16], dir); \\\n" + " fftKernel2S((a)[1], (a)[17], dir); \\\n" + " fftKernel2S((a)[2], (a)[18], dir); \\\n" + " fftKernel2S((a)[3], (a)[19], dir); \\\n" + " fftKernel2S((a)[4], (a)[20], dir); \\\n" + " fftKernel2S((a)[5], (a)[21], dir); \\\n" + " fftKernel2S((a)[6], (a)[22], dir); \\\n" + " fftKernel2S((a)[7], (a)[23], dir); \\\n" + " fftKernel2S((a)[8], (a)[24], dir); \\\n" + " fftKernel2S((a)[9], (a)[25], dir); \\\n" + " fftKernel2S((a)[10], (a)[26], dir); \\\n" + " fftKernel2S((a)[11], (a)[27], dir); \\\n" + " fftKernel2S((a)[12], (a)[28], dir); \\\n" + " fftKernel2S((a)[13], (a)[29], dir); \\\n" + " fftKernel2S((a)[14], (a)[30], dir); \\\n" + " fftKernel2S((a)[15], (a)[31], dir); \\\n" + " (a)[17] = complexMul((a)[17], (float2)(0x1.f6297cp-1f, dir*0x1.8f8b84p-3f)); \\\n" + " (a)[18] = complexMul((a)[18], (float2)(0x1.d906bcp-1f, dir*0x1.87de2ap-2f)); \\\n" + " (a)[19] = complexMul((a)[19], (float2)(0x1.a9b662p-1f, dir*0x1.1c73b4p-1f)); \\\n" + " (a)[20] = complexMul((a)[20], (float2)(0x1.6a09e6p-1f, dir*0x1.6a09e6p-1f)); \\\n" + " (a)[21] = complexMul((a)[21], (float2)(0x1.1c73b4p-1f, dir*0x1.a9b662p-1f)); \\\n" + " (a)[22] = complexMul((a)[22], (float2)(0x1.87de2ap-2f, dir*0x1.d906bcp-1f)); \\\n" + " (a)[23] = complexMul((a)[23], (float2)(0x1.8f8b84p-3f, dir*0x1.f6297cp-1f)); \\\n" + " (a)[24] = complexMul((a)[24], (float2)(0x0p+0f, dir*0x1p+0f)); \\\n" + " (a)[25] = complexMul((a)[25], (float2)(-0x1.8f8b84p-3f, dir*0x1.f6297cp-1f)); \\\n" + " (a)[26] = complexMul((a)[26], (float2)(-0x1.87de2ap-2f, dir*0x1.d906bcp-1f)); \\\n" + " (a)[27] = complexMul((a)[27], (float2)(-0x1.1c73b4p-1f, dir*0x1.a9b662p-1f)); \\\n" + " (a)[28] = complexMul((a)[28], (float2)(-0x1.6a09e6p-1f, dir*0x1.6a09e6p-1f)); \\\n" + " (a)[29] = complexMul((a)[29], (float2)(-0x1.a9b662p-1f, dir*0x1.1c73b4p-1f)); \\\n" + " (a)[30] = complexMul((a)[30], (float2)(-0x1.d906bcp-1f, dir*0x1.87de2ap-2f)); \\\n" + " (a)[31] = complexMul((a)[31], (float2)(-0x1.f6297cp-1f, dir*0x1.8f8b84p-3f)); \\\n" + " fftKernel16((a), dir); \\\n" + " fftKernel16((a) + 16, dir); \\\n" + " bitreverse32((a)); \\\n" + "}\n\n" + ); + +static string twistKernelInterleaved = string( + "__kernel void \\\n" + "clFFT_1DTwistInterleaved(__global float2 *in, unsigned int startRow, unsigned int numCols, unsigned int N, unsigned int numRowsToProcess, int dir) \\\n" + "{ \\\n" + " float2 a, w; \\\n" + " float ang; \\\n" + " unsigned int j; \\\n" + " unsigned int i = get_global_id(0); \\\n" + " unsigned int startIndex = i; \\\n" + " \\\n" + " if(i < numCols) \\\n" + " { \\\n" + " for(j = 0; j < numRowsToProcess; j++) \\\n" + " { \\\n" + " a = in[startIndex]; \\\n" + " ang = 2.0f * M_PI * dir * i * (startRow + j) / N; \\\n" + " w = (float2)(native_cos(ang), native_sin(ang)); \\\n" + " a = complexMul(a, w); \\\n" + " in[startIndex] = a; \\\n" + " startIndex += numCols; \\\n" + " } \\\n" + " } \\\n" + "} \\\n" + ); + +static string twistKernelPlannar = string( + "__kernel void \\\n" + "clFFT_1DTwistSplit(__global float *in_real, __global float *in_imag , unsigned int startRow, unsigned int numCols, unsigned int N, unsigned int numRowsToProcess, int dir) \\\n" + "{ \\\n" + " float2 a, w; \\\n" + " float ang; \\\n" + " unsigned int j; \\\n" + " unsigned int i = get_global_id(0); \\\n" + " unsigned int startIndex = i; \\\n" + " \\\n" + " if(i < numCols) \\\n" + " { \\\n" + " for(j = 0; j < numRowsToProcess; j++) \\\n" + " { \\\n" + " a = (float2)(in_real[startIndex], in_imag[startIndex]); \\\n" + " ang = 2.0f * M_PI * dir * i * (startRow + j) / N; \\\n" + " w = (float2)(native_cos(ang), native_sin(ang)); \\\n" + " a = complexMul(a, w); \\\n" + " in_real[startIndex] = a.x; \\\n" + " in_imag[startIndex] = a.y; \\\n" + " startIndex += numCols; \\\n" + " } \\\n" + " } \\\n" + "} \\\n" + ); + + + +#endif diff --git a/src/algorithms/libs/fft_execute.cc b/src/algorithms/libs/fft_execute.cc new file mode 100644 index 000000000..6416e163e --- /dev/null +++ b/src/algorithms/libs/fft_execute.cc @@ -0,0 +1,405 @@ + +// +// File: fft_execute.cpp +// +// Version: <1.0> +// +// Disclaimer: IMPORTANT: This Apple software is supplied to you by Apple Inc. ("Apple") +// in consideration of your agreement to the following terms, and your use, +// installation, modification or redistribution of this Apple software +// constitutes acceptance of these terms. If you do not agree with these +// terms, please do not use, install, modify or redistribute this Apple +// software.¬ +// +// In consideration of your agreement to abide by the following terms, and +// subject to these terms, Apple grants you a personal, non - exclusive +// license, under Apple's copyrights in this original Apple software ( the +// "Apple Software" ), to use, reproduce, modify and redistribute the Apple +// Software, with or without modifications, in source and / or binary forms; +// provided that if you redistribute the Apple Software in its entirety and +// without modifications, you must retain this notice and the following text +// and disclaimers in all such redistributions of the Apple Software. Neither +// the name, trademarks, service marks or logos of Apple Inc. may be used to +// endorse or promote products derived from the Apple Software without specific +// prior written permission from Apple. Except as expressly stated in this +// notice, no other rights or licenses, express or implied, are granted by +// Apple herein, including but not limited to any patent rights that may be +// infringed by your derivative works or by other works in which the Apple +// Software may be incorporated. +// +// The Apple Software is provided by Apple on an "AS IS" basis. APPLE MAKES NO +// WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED +// WARRANTIES OF NON - INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A +// PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION +// ALONE OR IN COMBINATION WITH YOUR PRODUCTS. +// +// IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR +// CONSEQUENTIAL DAMAGES ( INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION ) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION +// AND / OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER +// UNDER THEORY OF CONTRACT, TORT ( INCLUDING NEGLIGENCE ), STRICT LIABILITY OR +// OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Copyright ( C ) 2008 Apple Inc. All Rights Reserved. +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +#include "fft_internal.h" +#include "clFFT.h" +#include +#include +#include + +#define max(a,b) (((a)>(b)) ? (a) : (b)) +#define min(a,b) (((a)<(b)) ? (a) : (b)) + +static cl_int +allocateTemporaryBufferInterleaved(cl_fft_plan *plan, cl_uint batchSize) +{ + cl_int err = CL_SUCCESS; + if(plan->temp_buffer_needed && plan->last_batch_size != batchSize) + { + plan->last_batch_size = batchSize; + size_t tmpLength = plan->n.x * plan->n.y * plan->n.z * batchSize * 2 * sizeof(cl_float); + + if(plan->tempmemobj) + clReleaseMemObject(plan->tempmemobj); + + plan->tempmemobj = clCreateBuffer(plan->context, CL_MEM_READ_WRITE, tmpLength, NULL, &err); + } + return err; +} + +static cl_int +allocateTemporaryBufferPlannar(cl_fft_plan *plan, cl_uint batchSize) +{ + cl_int err = CL_SUCCESS; + cl_int terr; + if(plan->temp_buffer_needed && plan->last_batch_size != batchSize) + { + plan->last_batch_size = batchSize; + size_t tmpLength = plan->n.x * plan->n.y * plan->n.z * batchSize * sizeof(cl_float); + + if(plan->tempmemobj_real) + clReleaseMemObject(plan->tempmemobj_real); + + if(plan->tempmemobj_imag) + clReleaseMemObject(plan->tempmemobj_imag); + + plan->tempmemobj_real = clCreateBuffer(plan->context, CL_MEM_READ_WRITE, tmpLength, NULL, &err); + plan->tempmemobj_imag = clCreateBuffer(plan->context, CL_MEM_READ_WRITE, tmpLength, NULL, &terr); + err |= terr; + } + return err; +} + +void +getKernelWorkDimensions(cl_fft_plan *plan, cl_fft_kernel_info *kernelInfo, cl_int *batchSize, size_t *gWorkItems, size_t *lWorkItems) +{ + *lWorkItems = kernelInfo->num_workitems_per_workgroup; + int numWorkGroups = kernelInfo->num_workgroups; + int numXFormsPerWG = kernelInfo->num_xforms_per_workgroup; + + switch(kernelInfo->dir) + { + case cl_fft_kernel_x: + *batchSize *= (plan->n.y * plan->n.z); + numWorkGroups = (*batchSize % numXFormsPerWG) ? (*batchSize/numXFormsPerWG + 1) : (*batchSize/numXFormsPerWG); + numWorkGroups *= kernelInfo->num_workgroups; + break; + case cl_fft_kernel_y: + *batchSize *= plan->n.z; + numWorkGroups *= *batchSize; + break; + case cl_fft_kernel_z: + numWorkGroups *= *batchSize; + break; + } + + *gWorkItems = numWorkGroups * *lWorkItems; +} + +cl_int +clFFT_ExecuteInterleaved( cl_command_queue queue, clFFT_Plan Plan, cl_int batchSize, clFFT_Direction dir, + cl_mem data_in, cl_mem data_out, + cl_int num_events, cl_event *event_list, cl_event *event ) +{ + int s; + cl_fft_plan *plan = (cl_fft_plan *) Plan; + if(plan->format != clFFT_InterleavedComplexFormat) + return CL_INVALID_VALUE; + + cl_int err; + size_t gWorkItems, lWorkItems; + int inPlaceDone; + + cl_int isInPlace = data_in == data_out ? 1 : 0; + + if((err = allocateTemporaryBufferInterleaved(plan, batchSize)) != CL_SUCCESS) + return err; + + cl_mem memObj[3]; + memObj[0] = data_in; + memObj[1] = data_out; + memObj[2] = plan->tempmemobj; + cl_fft_kernel_info *kernelInfo = plan->kernel_info; + int numKernels = plan->num_kernels; + + int numKernelsOdd = numKernels & 1; + int currRead = 0; + int currWrite = 1; + + // at least one external dram shuffle (transpose) required + if(plan->temp_buffer_needed) + { + // in-place transform + if(isInPlace) + { + inPlaceDone = 0; + currRead = 1; + currWrite = 2; + } + else + { + currWrite = (numKernels & 1) ? 1 : 2; + } + + while(kernelInfo) + { + if( isInPlace && numKernelsOdd && !inPlaceDone && kernelInfo->in_place_possible) + { + currWrite = currRead; + inPlaceDone = 1; + } + + s = batchSize; + getKernelWorkDimensions(plan, kernelInfo, &s, &gWorkItems, &lWorkItems); + err |= clSetKernelArg(kernelInfo->kernel, 0, sizeof(cl_mem), &memObj[currRead]); + err |= clSetKernelArg(kernelInfo->kernel, 1, sizeof(cl_mem), &memObj[currWrite]); + err |= clSetKernelArg(kernelInfo->kernel, 2, sizeof(cl_int), &dir); + err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_int), &s); + + err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, 0, NULL, NULL); + if(err) + return err; + + currRead = (currWrite == 1) ? 1 : 2; + currWrite = (currWrite == 1) ? 2 : 1; + + kernelInfo = kernelInfo->next; + } + } + // no dram shuffle (transpose required) transform + // all kernels can execute in-place. + else { + + while(kernelInfo) + { + s = batchSize; + getKernelWorkDimensions(plan, kernelInfo, &s, &gWorkItems, &lWorkItems); + err |= clSetKernelArg(kernelInfo->kernel, 0, sizeof(cl_mem), &memObj[currRead]); + err |= clSetKernelArg(kernelInfo->kernel, 1, sizeof(cl_mem), &memObj[currWrite]); + err |= clSetKernelArg(kernelInfo->kernel, 2, sizeof(cl_int), &dir); + err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_int), &s); + + err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, 0, NULL, NULL); + if(err) + return err; + + currRead = 1; + currWrite = 1; + + kernelInfo = kernelInfo->next; + } + } + + return err; +} + +cl_int +clFFT_ExecutePlannar( cl_command_queue queue, clFFT_Plan Plan, cl_int batchSize, clFFT_Direction dir, + cl_mem data_in_real, cl_mem data_in_imag, cl_mem data_out_real, cl_mem data_out_imag, + cl_int num_events, cl_event *event_list, cl_event *event) +{ + int s; + cl_fft_plan *plan = (cl_fft_plan *) Plan; + + if(plan->format != clFFT_SplitComplexFormat) + return CL_INVALID_VALUE; + + cl_int err; + size_t gWorkItems, lWorkItems; + int inPlaceDone; + + cl_int isInPlace = ((data_in_real == data_out_real) && (data_in_imag == data_out_imag)) ? 1 : 0; + + if((err = allocateTemporaryBufferPlannar(plan, batchSize)) != CL_SUCCESS) + return err; + + cl_mem memObj_real[3]; + cl_mem memObj_imag[3]; + memObj_real[0] = data_in_real; + memObj_real[1] = data_out_real; + memObj_real[2] = plan->tempmemobj_real; + memObj_imag[0] = data_in_imag; + memObj_imag[1] = data_out_imag; + memObj_imag[2] = plan->tempmemobj_imag; + + cl_fft_kernel_info *kernelInfo = plan->kernel_info; + int numKernels = plan->num_kernels; + + int numKernelsOdd = numKernels & 1; + int currRead = 0; + int currWrite = 1; + + // at least one external dram shuffle (transpose) required + if(plan->temp_buffer_needed) + { + // in-place transform + if(isInPlace) + { + inPlaceDone = 0; + currRead = 1; + currWrite = 2; + } + else + { + currWrite = (numKernels & 1) ? 1 : 2; + } + + while(kernelInfo) + { + if( isInPlace && numKernelsOdd && !inPlaceDone && kernelInfo->in_place_possible) + { + currWrite = currRead; + inPlaceDone = 1; + } + + s = batchSize; + getKernelWorkDimensions(plan, kernelInfo, &s, &gWorkItems, &lWorkItems); + err |= clSetKernelArg(kernelInfo->kernel, 0, sizeof(cl_mem), &memObj_real[currRead]); + err |= clSetKernelArg(kernelInfo->kernel, 1, sizeof(cl_mem), &memObj_imag[currRead]); + err |= clSetKernelArg(kernelInfo->kernel, 2, sizeof(cl_mem), &memObj_real[currWrite]); + err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_mem), &memObj_imag[currWrite]); + err |= clSetKernelArg(kernelInfo->kernel, 4, sizeof(cl_int), &dir); + err |= clSetKernelArg(kernelInfo->kernel, 5, sizeof(cl_int), &s); + + err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, 0, NULL, NULL); + if(err) + return err; + + currRead = (currWrite == 1) ? 1 : 2; + currWrite = (currWrite == 1) ? 2 : 1; + + kernelInfo = kernelInfo->next; + } + } + // no dram shuffle (transpose required) transform + else { + + while(kernelInfo) + { + s = batchSize; + getKernelWorkDimensions(plan, kernelInfo, &s, &gWorkItems, &lWorkItems); + err |= clSetKernelArg(kernelInfo->kernel, 0, sizeof(cl_mem), &memObj_real[currRead]); + err |= clSetKernelArg(kernelInfo->kernel, 1, sizeof(cl_mem), &memObj_imag[currRead]); + err |= clSetKernelArg(kernelInfo->kernel, 2, sizeof(cl_mem), &memObj_real[currWrite]); + err |= clSetKernelArg(kernelInfo->kernel, 3, sizeof(cl_mem), &memObj_imag[currWrite]); + err |= clSetKernelArg(kernelInfo->kernel, 4, sizeof(cl_int), &dir); + err |= clSetKernelArg(kernelInfo->kernel, 5, sizeof(cl_int), &s); + + err |= clEnqueueNDRangeKernel(queue, kernelInfo->kernel, 1, NULL, &gWorkItems, &lWorkItems, 0, NULL, NULL); + if(err) + return err; + + currRead = 1; + currWrite = 1; + + kernelInfo = kernelInfo->next; + } + } + + return err; +} + +cl_int +clFFT_1DTwistInterleaved(clFFT_Plan Plan, cl_command_queue queue, cl_mem array, + unsigned numRows, unsigned numCols, unsigned startRow, unsigned rowsToProcess, clFFT_Direction dir) +{ + cl_fft_plan *plan = (cl_fft_plan *) Plan; + + unsigned int N = numRows*numCols; + unsigned int nCols = numCols; + unsigned int sRow = startRow; + unsigned int rToProcess = rowsToProcess; + int d = dir; + int err = 0; + + cl_device_id device_id; + err = clGetCommandQueueInfo(queue, CL_QUEUE_DEVICE, sizeof(cl_device_id), &device_id, NULL); + if(err) + return err; + + size_t gSize; + err = clGetKernelWorkGroupInfo(plan->twist_kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &gSize, NULL); + if(err) + return err; + + gSize = min(128, gSize); + size_t numGlobalThreads[1] = { max(numCols / gSize, 1)*gSize }; + size_t numLocalThreads[1] = { gSize }; + + err |= clSetKernelArg(plan->twist_kernel, 0, sizeof(cl_mem), &array); + err |= clSetKernelArg(plan->twist_kernel, 1, sizeof(unsigned int), &sRow); + err |= clSetKernelArg(plan->twist_kernel, 2, sizeof(unsigned int), &nCols); + err |= clSetKernelArg(plan->twist_kernel, 3, sizeof(unsigned int), &N); + err |= clSetKernelArg(plan->twist_kernel, 4, sizeof(unsigned int), &rToProcess); + err |= clSetKernelArg(plan->twist_kernel, 5, sizeof(int), &d); + + err |= clEnqueueNDRangeKernel(queue, plan->twist_kernel, 1, NULL, numGlobalThreads, numLocalThreads, 0, NULL, NULL); + + return err; +} + +cl_int +clFFT_1DTwistPlannar(clFFT_Plan Plan, cl_command_queue queue, cl_mem array_real, cl_mem array_imag, + unsigned numRows, unsigned numCols, unsigned startRow, unsigned rowsToProcess, clFFT_Direction dir) +{ + cl_fft_plan *plan = (cl_fft_plan *) Plan; + + unsigned int N = numRows*numCols; + unsigned int nCols = numCols; + unsigned int sRow = startRow; + unsigned int rToProcess = rowsToProcess; + int d = dir; + int err = 0; + + cl_device_id device_id; + err = clGetCommandQueueInfo(queue, CL_QUEUE_DEVICE, sizeof(cl_device_id), &device_id, NULL); + if(err) + return err; + + size_t gSize; + err = clGetKernelWorkGroupInfo(plan->twist_kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &gSize, NULL); + if(err) + return err; + + gSize = min(128, gSize); + size_t numGlobalThreads[1] = { max(numCols / gSize, 1)*gSize }; + size_t numLocalThreads[1] = { gSize }; + + err |= clSetKernelArg(plan->twist_kernel, 0, sizeof(cl_mem), &array_real); + err |= clSetKernelArg(plan->twist_kernel, 1, sizeof(cl_mem), &array_imag); + err |= clSetKernelArg(plan->twist_kernel, 2, sizeof(unsigned int), &sRow); + err |= clSetKernelArg(plan->twist_kernel, 3, sizeof(unsigned int), &nCols); + err |= clSetKernelArg(plan->twist_kernel, 4, sizeof(unsigned int), &N); + err |= clSetKernelArg(plan->twist_kernel, 5, sizeof(unsigned int), &rToProcess); + err |= clSetKernelArg(plan->twist_kernel, 6, sizeof(int), &d); + + err |= clEnqueueNDRangeKernel(queue, plan->twist_kernel, 1, NULL, numGlobalThreads, numLocalThreads, 0, NULL, NULL); + + return err; +} + diff --git a/src/algorithms/libs/fft_internal.h b/src/algorithms/libs/fft_internal.h new file mode 100644 index 000000000..5b797a3a0 --- /dev/null +++ b/src/algorithms/libs/fft_internal.h @@ -0,0 +1,163 @@ + +// +// File: fft_internal.h +// +// Version: <1.0> +// +// Disclaimer: IMPORTANT: This Apple software is supplied to you by Apple Inc. ("Apple") +// in consideration of your agreement to the following terms, and your use, +// installation, modification or redistribution of this Apple software +// constitutes acceptance of these terms. If you do not agree with these +// terms, please do not use, install, modify or redistribute this Apple +// software. +// +// In consideration of your agreement to abide by the following terms, and +// subject to these terms, Apple grants you a personal, non - exclusive +// license, under Apple's copyrights in this original Apple software ( the +// "Apple Software" ), to use, reproduce, modify and redistribute the Apple +// Software, with or without modifications, in source and / or binary forms; +// provided that if you redistribute the Apple Software in its entirety and +// without modifications, you must retain this notice and the following text +// and disclaimers in all such redistributions of the Apple Software. Neither +// the name, trademarks, service marks or logos of Apple Inc. may be used to +// endorse or promote products derived from the Apple Software without specific +// prior written permission from Apple. Except as expressly stated in this +// notice, no other rights or licenses, express or implied, are granted by +// Apple herein, including but not limited to any patent rights that may be +// infringed by your derivative works or by other works in which the Apple +// Software may be incorporated. +// +// The Apple Software is provided by Apple on an "AS IS" basis. APPLE MAKES NO +// WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED +// WARRANTIES OF NON - INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A +// PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION +// ALONE OR IN COMBINATION WITH YOUR PRODUCTS. +// +// IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR +// CONSEQUENTIAL DAMAGES ( INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION ) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION +// AND / OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER +// UNDER THEORY OF CONTRACT, TORT ( INCLUDING NEGLIGENCE ), STRICT LIABILITY OR +// OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Copyright ( C ) 2008 Apple Inc. All Rights Reserved. +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +#ifndef __CLFFT_INTERNAL_H +#define __CLFFT_INTERNAL_H + +#include "clFFT.h" +#include +#include +#include + +using namespace std; + +typedef enum kernel_dir_t +{ + cl_fft_kernel_x, + cl_fft_kernel_y, + cl_fft_kernel_z +}cl_fft_kernel_dir; + +typedef struct kernel_info_t +{ + cl_kernel kernel; + char *kernel_name; + unsigned lmem_size; + unsigned num_workgroups; + unsigned num_xforms_per_workgroup; + unsigned num_workitems_per_workgroup; + cl_fft_kernel_dir dir; + int in_place_possible; + kernel_info_t *next; +}cl_fft_kernel_info; + +typedef struct +{ + // context in which fft resources are created and kernels are executed + cl_context context; + + // size of signal + clFFT_Dim3 n; + + // dimension of transform ... must be either 1D, 2D or 3D + clFFT_Dimension dim; + + // data format ... must be either interleaved or plannar + clFFT_DataFormat format; + + // string containing kernel source. Generated at runtime based on + // n, dim, format and other parameters + string *kernel_string; + + // CL program containing source and kernel this particular + // n, dim, data format + cl_program program; + + // linked list of kernels which needs to be executed for this fft + cl_fft_kernel_info *kernel_info; + + // number of kernels + int num_kernels; + + // twist kernel for virtualizing fft of very large sizes that do not + // fit in GPU global memory + cl_kernel twist_kernel; + + // flag indicating if temporary intermediate buffer is needed or not. + // this depends on fft kernels being executed and if transform is + // in-place or out-of-place. e.g. Local memory fft (say 1D 1024 ... + // one that does not require global transpose do not need temporary buffer) + // 2D 1024x1024 out-of-place fft however do require intermediate buffer. + // If temp buffer is needed, its allocation is lazy i.e. its not allocated + // until its needed + cl_int temp_buffer_needed; + + // Batch size is runtime parameter and size of temporary buffer (if needed) + // depends on batch size. Allocation of temporary buffer is lazy i.e. its + // only created when needed. Once its created at first call of clFFT_Executexxx + // it is not allocated next time if next time clFFT_Executexxx is called with + // batch size different than the first call. last_batch_size caches the last + // batch size with which this plan is used so that we dont keep allocating/deallocating + // temp buffer if same batch size is used again and again. + unsigned last_batch_size; + + // temporary buffer for interleaved plan + cl_mem tempmemobj; + + // temporary buffer for planner plan. Only one of tempmemobj or + // (tempmemobj_real, tempmemobj_imag) pair is valid (allocated) depending + // data format of plan (plannar or interleaved) + cl_mem tempmemobj_real, tempmemobj_imag; + + // Maximum size of signal for which local memory transposed based + // fft is sufficient i.e. no global mem transpose (communication) + // is needed + unsigned max_localmem_fft_size; + + // Maximum work items per work group allowed. This, along with max_radix below controls + // maximum local memory being used by fft kernels of this plan. Set to 256 by default + unsigned max_work_item_per_workgroup; + + // Maximum base radix for local memory fft ... this controls the maximum register + // space used by work items. Currently defaults to 16 + unsigned max_radix; + + // Device depended parameter that tells how many work-items need to be read consecutive + // values to make sure global memory access by work-items of a work-group result in + // coalesced memory access to utilize full bandwidth e.g. on NVidia tesla, this is 16 + unsigned min_mem_coalesce_width; + + // Number of local memory banks. This is used to geneate kernel with local memory + // transposes with appropriate padding to avoid bank conflicts to local memory + // e.g. on NVidia it is 16. + unsigned num_local_mem_banks; +}cl_fft_plan; + +void FFT1D(cl_fft_plan *plan, cl_fft_kernel_dir dir); + +#endif diff --git a/src/algorithms/libs/fft_kernelstring.cc b/src/algorithms/libs/fft_kernelstring.cc new file mode 100644 index 000000000..7847ce959 --- /dev/null +++ b/src/algorithms/libs/fft_kernelstring.cc @@ -0,0 +1,1257 @@ + +// +// File: fft_kernelstring.cpp +// +// Version: <1.0> +// +// Disclaimer: IMPORTANT: This Apple software is supplied to you by Apple Inc. ("Apple") +// in consideration of your agreement to the following terms, and your use, +// installation, modification or redistribution of this Apple software +// constitutes acceptance of these terms. If you do not agree with these +// terms, please do not use, install, modify or redistribute this Apple +// software. +// +// In consideration of your agreement to abide by the following terms, and +// subject to these terms, Apple grants you a personal, non - exclusive +// license, under Apple's copyrights in this original Apple software ( the +// "Apple Software" ), to use, reproduce, modify and redistribute the Apple +// Software, with or without modifications, in source and / or binary forms; +// provided that if you redistribute the Apple Software in its entirety and +// without modifications, you must retain this notice and the following text +// and disclaimers in all such redistributions of the Apple Software. Neither +// the name, trademarks, service marks or logos of Apple Inc. may be used to +// endorse or promote products derived from the Apple Software without specific +// prior written permission from Apple. Except as expressly stated in this +// notice, no other rights or licenses, express or implied, are granted by +// Apple herein, including but not limited to any patent rights that may be +// infringed by your derivative works or by other works in which the Apple +// Software may be incorporated. +// +// The Apple Software is provided by Apple on an "AS IS" basis. APPLE MAKES NO +// WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED +// WARRANTIES OF NON - INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A +// PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION +// ALONE OR IN COMBINATION WITH YOUR PRODUCTS. +// +// IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR +// CONSEQUENTIAL DAMAGES ( INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION ) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION +// AND / OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER +// UNDER THEORY OF CONTRACT, TORT ( INCLUDING NEGLIGENCE ), STRICT LIABILITY OR +// OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Copyright ( C ) 2008 Apple Inc. All Rights Reserved. +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +#include +#include +#include +#include +#include +#include +#include +#include +#include "fft_internal.h" +#include "clFFT.h" + +using namespace std; + +#define max(A,B) ((A) > (B) ? (A) : (B)) +#define min(A,B) ((A) < (B) ? (A) : (B)) + +static string +num2str(int num) +{ + char temp[200]; + sprintf(temp, "%d", num); + return string(temp); +} + +// For any n, this function decomposes n into factors for loacal memory tranpose +// based fft. Factors (radices) are sorted such that the first one (radixArray[0]) +// is the largest. This base radix determines the number of registers used by each +// work item and product of remaining radices determine the size of work group needed. +// To make things concrete with and example, suppose n = 1024. It is decomposed into +// 1024 = 16 x 16 x 4. Hence kernel uses float2 a[16], for local in-register fft and +// needs 16 x 4 = 64 work items per work group. So kernel first performance 64 length +// 16 ffts (64 work items working in parallel) following by transpose using local +// memory followed by again 64 length 16 ffts followed by transpose using local memory +// followed by 256 length 4 ffts. For the last step since with size of work group is +// 64 and each work item can array for 16 values, 64 work items can compute 256 length +// 4 ffts by each work item computing 4 length 4 ffts. +// Similarly for n = 2048 = 8 x 8 x 8 x 4, each work group has 8 x 8 x 4 = 256 work +// iterms which each computes 256 (in-parallel) length 8 ffts in-register, followed +// by transpose using local memory, followed by 256 length 8 in-register ffts, followed +// by transpose using local memory, followed by 256 length 8 in-register ffts, followed +// by transpose using local memory, followed by 512 length 4 in-register ffts. Again, +// for the last step, each work item computes two length 4 in-register ffts and thus +// 256 work items are needed to compute all 512 ffts. +// For n = 32 = 8 x 4, 4 work items first compute 4 in-register +// lenth 8 ffts, followed by transpose using local memory followed by 8 in-register +// length 4 ffts, where each work item computes two length 4 ffts thus 4 work items +// can compute 8 length 4 ffts. However if work group size of say 64 is choosen, +// each work group can compute 64/ 4 = 16 size 32 ffts (batched transform). +// Users can play with these parameters to figure what gives best performance on +// their particular device i.e. some device have less register space thus using +// smaller base radix can avoid spilling ... some has small local memory thus +// using smaller work group size may be required etc + +static void +getRadixArray(unsigned int n, unsigned int *radixArray, unsigned int *numRadices, unsigned int maxRadix) +{ + if(maxRadix > 1) + { + maxRadix = min(n, maxRadix); + unsigned int cnt = 0; + while(n > maxRadix) + { + radixArray[cnt++] = maxRadix; + n /= maxRadix; + } + radixArray[cnt++] = n; + *numRadices = cnt; + return; + } + + switch(n) + { + case 2: + *numRadices = 1; + radixArray[0] = 2; + break; + + case 4: + *numRadices = 1; + radixArray[0] = 4; + break; + + case 8: + *numRadices = 1; + radixArray[0] = 8; + break; + + case 16: + *numRadices = 2; + radixArray[0] = 8; radixArray[1] = 2; + break; + + case 32: + *numRadices = 2; + radixArray[0] = 8; radixArray[1] = 4; + break; + + case 64: + *numRadices = 2; + radixArray[0] = 8; radixArray[1] = 8; + break; + + case 128: + *numRadices = 3; + radixArray[0] = 8; radixArray[1] = 4; radixArray[2] = 4; + break; + + case 256: + *numRadices = 4; + radixArray[0] = 4; radixArray[1] = 4; radixArray[2] = 4; radixArray[3] = 4; + break; + + case 512: + *numRadices = 3; + radixArray[0] = 8; radixArray[1] = 8; radixArray[2] = 8; + break; + + case 1024: + *numRadices = 3; + radixArray[0] = 16; radixArray[1] = 16; radixArray[2] = 4; + break; + case 2048: + *numRadices = 4; + radixArray[0] = 8; radixArray[1] = 8; radixArray[2] = 8; radixArray[3] = 4; + break; + default: + *numRadices = 0; + return; + } +} + +static void +insertHeader(string &kernelString, string &kernelName, clFFT_DataFormat dataFormat) +{ + if(dataFormat == clFFT_SplitComplexFormat) + kernelString += string("__kernel void ") + kernelName + string("(__global float *in_real, __global float *in_imag, __global float *out_real, __global float *out_imag, int dir, int S)\n"); + else + kernelString += string("__kernel void ") + kernelName + string("(__global float2 *in, __global float2 *out, int dir, int S)\n"); +} + +static void +insertVariables(string &kStream, int maxRadix) +{ + kStream += string(" int i, j, r, indexIn, indexOut, index, tid, bNum, xNum, k, l;\n"); + kStream += string(" int s, ii, jj, offset;\n"); + kStream += string(" float2 w;\n"); + kStream += string(" float ang, angf, ang1;\n"); + kStream += string(" __local float *lMemStore, *lMemLoad;\n"); + kStream += string(" float2 a[") + num2str(maxRadix) + string("];\n"); + kStream += string(" int lId = get_local_id( 0 );\n"); + kStream += string(" int groupId = get_group_id( 0 );\n"); +} + +static void +formattedLoad(string &kernelString, int aIndex, int gIndex, clFFT_DataFormat dataFormat) +{ + if(dataFormat == clFFT_InterleavedComplexFormat) + kernelString += string(" a[") + num2str(aIndex) + string("] = in[") + num2str(gIndex) + string("];\n"); + else + { + kernelString += string(" a[") + num2str(aIndex) + string("].x = in_real[") + num2str(gIndex) + string("];\n"); + kernelString += string(" a[") + num2str(aIndex) + string("].y = in_imag[") + num2str(gIndex) + string("];\n"); + } +} + +static void +formattedStore(string &kernelString, int aIndex, int gIndex, clFFT_DataFormat dataFormat) +{ + if(dataFormat == clFFT_InterleavedComplexFormat) + kernelString += string(" out[") + num2str(gIndex) + string("] = a[") + num2str(aIndex) + string("];\n"); + else + { + kernelString += string(" out_real[") + num2str(gIndex) + string("] = a[") + num2str(aIndex) + string("].x;\n"); + kernelString += string(" out_imag[") + num2str(gIndex) + string("] = a[") + num2str(aIndex) + string("].y;\n"); + } +} + +static int +insertGlobalLoadsAndTranspose(string &kernelString, int N, int numWorkItemsPerXForm, int numXFormsPerWG, int R0, int mem_coalesce_width, clFFT_DataFormat dataFormat) +{ + int log2NumWorkItemsPerXForm = (int) log2(numWorkItemsPerXForm); + int groupSize = numWorkItemsPerXForm * numXFormsPerWG; + int i, j; + int lMemSize = 0; + + if(numXFormsPerWG > 1) + kernelString += string(" s = S & ") + num2str(numXFormsPerWG - 1) + string(";\n"); + + if(numWorkItemsPerXForm >= mem_coalesce_width) + { + if(numXFormsPerWG > 1) + { + kernelString += string(" ii = lId & ") + num2str(numWorkItemsPerXForm-1) + string(";\n"); + kernelString += string(" jj = lId >> ") + num2str(log2NumWorkItemsPerXForm) + string(";\n"); + kernelString += string(" if( !s || (groupId < get_num_groups(0)-1) || (jj < s) ) {\n"); + kernelString += string(" offset = mad24( mad24(groupId, ") + num2str(numXFormsPerWG) + string(", jj), ") + num2str(N) + string(", ii );\n"); + if(dataFormat == clFFT_InterleavedComplexFormat) + { + kernelString += string(" in += offset;\n"); + kernelString += string(" out += offset;\n"); + } + else + { + kernelString += string(" in_real += offset;\n"); + kernelString += string(" in_imag += offset;\n"); + kernelString += string(" out_real += offset;\n"); + kernelString += string(" out_imag += offset;\n"); + } + for(i = 0; i < R0; i++) + formattedLoad(kernelString, i, i*numWorkItemsPerXForm, dataFormat); + kernelString += string(" }\n"); + } + else + { + kernelString += string(" ii = lId;\n"); + kernelString += string(" jj = 0;\n"); + kernelString += string(" offset = mad24(groupId, ") + num2str(N) + string(", ii);\n"); + if(dataFormat == clFFT_InterleavedComplexFormat) + { + kernelString += string(" in += offset;\n"); + kernelString += string(" out += offset;\n"); + } + else + { + kernelString += string(" in_real += offset;\n"); + kernelString += string(" in_imag += offset;\n"); + kernelString += string(" out_real += offset;\n"); + kernelString += string(" out_imag += offset;\n"); + } + for(i = 0; i < R0; i++) + formattedLoad(kernelString, i, i*numWorkItemsPerXForm, dataFormat); + } + } + else if( N >= mem_coalesce_width ) + { + int numInnerIter = N / mem_coalesce_width; + int numOuterIter = numXFormsPerWG / ( groupSize / mem_coalesce_width ); + + kernelString += string(" ii = lId & ") + num2str(mem_coalesce_width - 1) + string(";\n"); + kernelString += string(" jj = lId >> ") + num2str((int)log2(mem_coalesce_width)) + string(";\n"); + kernelString += string(" lMemStore = sMem + mad24( jj, ") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n"); + kernelString += string(" offset = mad24( groupId, ") + num2str(numXFormsPerWG) + string(", jj);\n"); + kernelString += string(" offset = mad24( offset, ") + num2str(N) + string(", ii );\n"); + if(dataFormat == clFFT_InterleavedComplexFormat) + { + kernelString += string(" in += offset;\n"); + kernelString += string(" out += offset;\n"); + } + else + { + kernelString += string(" in_real += offset;\n"); + kernelString += string(" in_imag += offset;\n"); + kernelString += string(" out_real += offset;\n"); + kernelString += string(" out_imag += offset;\n"); + } + + kernelString += string("if((groupId == get_num_groups(0)-1) && s) {\n"); + for(i = 0; i < numOuterIter; i++ ) + { + kernelString += string(" if( jj < s ) {\n"); + for(j = 0; j < numInnerIter; j++ ) + formattedLoad(kernelString, i * numInnerIter + j, j * mem_coalesce_width + i * ( groupSize / mem_coalesce_width ) * N, dataFormat); + kernelString += string(" }\n"); + if(i != numOuterIter - 1) + kernelString += string(" jj += ") + num2str(groupSize / mem_coalesce_width) + string(";\n"); + } + kernelString += string("}\n "); + kernelString += string("else {\n"); + for(i = 0; i < numOuterIter; i++ ) + { + for(j = 0; j < numInnerIter; j++ ) + formattedLoad(kernelString, i * numInnerIter + j, j * mem_coalesce_width + i * ( groupSize / mem_coalesce_width ) * N, dataFormat); + } + kernelString += string("}\n"); + + kernelString += string(" ii = lId & ") + num2str(numWorkItemsPerXForm - 1) + string(";\n"); + kernelString += string(" jj = lId >> ") + num2str(log2NumWorkItemsPerXForm) + string(";\n"); + kernelString += string(" lMemLoad = sMem + mad24( jj, ") + num2str(N + numWorkItemsPerXForm) + string(", ii);\n"); + + for( i = 0; i < numOuterIter; i++ ) + { + for( j = 0; j < numInnerIter; j++ ) + { + kernelString += string(" lMemStore[") + num2str(j * mem_coalesce_width + i * ( groupSize / mem_coalesce_width ) * (N + numWorkItemsPerXForm )) + string("] = a[") + + num2str(i * numInnerIter + j) + string("].x;\n"); + } + } + kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); + + for( i = 0; i < R0; i++ ) + kernelString += string(" a[") + num2str(i) + string("].x = lMemLoad[") + num2str(i * numWorkItemsPerXForm) + string("];\n"); + kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); + + for( i = 0; i < numOuterIter; i++ ) + { + for( j = 0; j < numInnerIter; j++ ) + { + kernelString += string(" lMemStore[") + num2str(j * mem_coalesce_width + i * ( groupSize / mem_coalesce_width ) * (N + numWorkItemsPerXForm )) + string("] = a[") + + num2str(i * numInnerIter + j) + string("].y;\n"); + } + } + kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); + + for( i = 0; i < R0; i++ ) + kernelString += string(" a[") + num2str(i) + string("].y = lMemLoad[") + num2str(i * numWorkItemsPerXForm) + string("];\n"); + kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); + + lMemSize = (N + numWorkItemsPerXForm) * numXFormsPerWG; + } + else + { + kernelString += string(" offset = mad24( groupId, ") + num2str(N * numXFormsPerWG) + string(", lId );\n"); + if(dataFormat == clFFT_InterleavedComplexFormat) + { + kernelString += string(" in += offset;\n"); + kernelString += string(" out += offset;\n"); + } + else + { + kernelString += string(" in_real += offset;\n"); + kernelString += string(" in_imag += offset;\n"); + kernelString += string(" out_real += offset;\n"); + kernelString += string(" out_imag += offset;\n"); + } + + kernelString += string(" ii = lId & ") + num2str(N-1) + string(";\n"); + kernelString += string(" jj = lId >> ") + num2str((int)log2(N)) + string(";\n"); + kernelString += string(" lMemStore = sMem + mad24( jj, ") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n"); + + kernelString += string("if((groupId == get_num_groups(0)-1) && s) {\n"); + for( i = 0; i < R0; i++ ) + { + kernelString += string(" if(jj < s )\n"); + formattedLoad(kernelString, i, i*groupSize, dataFormat); + if(i != R0 - 1) + kernelString += string(" jj += ") + num2str(groupSize / N) + string(";\n"); + } + kernelString += string("}\n"); + kernelString += string("else {\n"); + for( i = 0; i < R0; i++ ) + { + formattedLoad(kernelString, i, i*groupSize, dataFormat); + } + kernelString += string("}\n"); + + if(numWorkItemsPerXForm > 1) + { + kernelString += string(" ii = lId & ") + num2str(numWorkItemsPerXForm - 1) + string(";\n"); + kernelString += string(" jj = lId >> ") + num2str(log2NumWorkItemsPerXForm) + string(";\n"); + kernelString += string(" lMemLoad = sMem + mad24( jj, ") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n"); + } + else + { + kernelString += string(" ii = 0;\n"); + kernelString += string(" jj = lId;\n"); + kernelString += string(" lMemLoad = sMem + mul24( jj, ") + num2str(N + numWorkItemsPerXForm) + string(");\n"); + } + + + for( i = 0; i < R0; i++ ) + kernelString += string(" lMemStore[") + num2str(i * ( groupSize / N ) * ( N + numWorkItemsPerXForm )) + string("] = a[") + num2str(i) + string("].x;\n"); + kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); + + for( i = 0; i < R0; i++ ) + kernelString += string(" a[") + num2str(i) + string("].x = lMemLoad[") + num2str(i * numWorkItemsPerXForm) + string("];\n"); + kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); + + for( i = 0; i < R0; i++ ) + kernelString += string(" lMemStore[") + num2str(i * ( groupSize / N ) * ( N + numWorkItemsPerXForm )) + string("] = a[") + num2str(i) + string("].y;\n"); + kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); + + for( i = 0; i < R0; i++ ) + kernelString += string(" a[") + num2str(i) + string("].y = lMemLoad[") + num2str(i * numWorkItemsPerXForm) + string("];\n"); + kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); + + lMemSize = (N + numWorkItemsPerXForm) * numXFormsPerWG; + } + + return lMemSize; +} + +static int +insertGlobalStoresAndTranspose(string &kernelString, int N, int maxRadix, int Nr, int numWorkItemsPerXForm, int numXFormsPerWG, int mem_coalesce_width, clFFT_DataFormat dataFormat) +{ + int groupSize = numWorkItemsPerXForm * numXFormsPerWG; + int i, j, k, ind; + int lMemSize = 0; + int numIter = maxRadix / Nr; + string indent = string(""); + + if( numWorkItemsPerXForm >= mem_coalesce_width ) + { + if(numXFormsPerWG > 1) + { + kernelString += string(" if( !s || (groupId < get_num_groups(0)-1) || (jj < s) ) {\n"); + indent = string(" "); + } + for(i = 0; i < maxRadix; i++) + { + j = i % numIter; + k = i / numIter; + ind = j * Nr + k; + formattedStore(kernelString, ind, i*numWorkItemsPerXForm, dataFormat); + } + if(numXFormsPerWG > 1) + kernelString += string(" }\n"); + } + else if( N >= mem_coalesce_width ) + { + int numInnerIter = N / mem_coalesce_width; + int numOuterIter = numXFormsPerWG / ( groupSize / mem_coalesce_width ); + + kernelString += string(" lMemLoad = sMem + mad24( jj, ") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n"); + kernelString += string(" ii = lId & ") + num2str(mem_coalesce_width - 1) + string(";\n"); + kernelString += string(" jj = lId >> ") + num2str((int)log2(mem_coalesce_width)) + string(";\n"); + kernelString += string(" lMemStore = sMem + mad24( jj,") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n"); + + for( i = 0; i < maxRadix; i++ ) + { + j = i % numIter; + k = i / numIter; + ind = j * Nr + k; + kernelString += string(" lMemLoad[") + num2str(i*numWorkItemsPerXForm) + string("] = a[") + num2str(ind) + string("].x;\n"); + } + kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); + + for( i = 0; i < numOuterIter; i++ ) + for( j = 0; j < numInnerIter; j++ ) + kernelString += string(" a[") + num2str(i*numInnerIter + j) + string("].x = lMemStore[") + num2str(j*mem_coalesce_width + i*( groupSize / mem_coalesce_width )*(N + numWorkItemsPerXForm)) + string("];\n"); + kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); + + for( i = 0; i < maxRadix; i++ ) + { + j = i % numIter; + k = i / numIter; + ind = j * Nr + k; + kernelString += string(" lMemLoad[") + num2str(i*numWorkItemsPerXForm) + string("] = a[") + num2str(ind) + string("].y;\n"); + } + kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); + + for( i = 0; i < numOuterIter; i++ ) + for( j = 0; j < numInnerIter; j++ ) + kernelString += string(" a[") + num2str(i*numInnerIter + j) + string("].y = lMemStore[") + num2str(j*mem_coalesce_width + i*( groupSize / mem_coalesce_width )*(N + numWorkItemsPerXForm)) + string("];\n"); + kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); + + kernelString += string("if((groupId == get_num_groups(0)-1) && s) {\n"); + for(i = 0; i < numOuterIter; i++ ) + { + kernelString += string(" if( jj < s ) {\n"); + for(j = 0; j < numInnerIter; j++ ) + formattedStore(kernelString, i*numInnerIter + j, j*mem_coalesce_width + i*(groupSize/mem_coalesce_width)*N, dataFormat); + kernelString += string(" }\n"); + if(i != numOuterIter - 1) + kernelString += string(" jj += ") + num2str(groupSize / mem_coalesce_width) + string(";\n"); + } + kernelString += string("}\n"); + kernelString += string("else {\n"); + for(i = 0; i < numOuterIter; i++ ) + { + for(j = 0; j < numInnerIter; j++ ) + formattedStore(kernelString, i*numInnerIter + j, j*mem_coalesce_width + i*(groupSize/mem_coalesce_width)*N, dataFormat); + } + kernelString += string("}\n"); + + lMemSize = (N + numWorkItemsPerXForm) * numXFormsPerWG; + } + else + { + kernelString += string(" lMemLoad = sMem + mad24( jj,") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n"); + + kernelString += string(" ii = lId & ") + num2str(N - 1) + string(";\n"); + kernelString += string(" jj = lId >> ") + num2str((int) log2(N)) + string(";\n"); + kernelString += string(" lMemStore = sMem + mad24( jj,") + num2str(N + numWorkItemsPerXForm) + string(", ii );\n"); + + for( i = 0; i < maxRadix; i++ ) + { + j = i % numIter; + k = i / numIter; + ind = j * Nr + k; + kernelString += string(" lMemLoad[") + num2str(i*numWorkItemsPerXForm) + string("] = a[") + num2str(ind) + string("].x;\n"); + } + kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); + + for( i = 0; i < maxRadix; i++ ) + kernelString += string(" a[") + num2str(i) + string("].x = lMemStore[") + num2str(i*( groupSize / N )*( N + numWorkItemsPerXForm )) + string("];\n"); + kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); + + for( i = 0; i < maxRadix; i++ ) + { + j = i % numIter; + k = i / numIter; + ind = j * Nr + k; + kernelString += string(" lMemLoad[") + num2str(i*numWorkItemsPerXForm) + string("] = a[") + num2str(ind) + string("].y;\n"); + } + kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); + + for( i = 0; i < maxRadix; i++ ) + kernelString += string(" a[") + num2str(i) + string("].y = lMemStore[") + num2str(i*( groupSize / N )*( N + numWorkItemsPerXForm )) + string("];\n"); + kernelString += string(" barrier( CLK_LOCAL_MEM_FENCE );\n"); + + kernelString += string("if((groupId == get_num_groups(0)-1) && s) {\n"); + for( i = 0; i < maxRadix; i++ ) + { + kernelString += string(" if(jj < s ) {\n"); + formattedStore(kernelString, i, i*groupSize, dataFormat); + kernelString += string(" }\n"); + if( i != maxRadix - 1) + kernelString += string(" jj +=") + num2str(groupSize / N) + string(";\n"); + } + kernelString += string("}\n"); + kernelString += string("else {\n"); + for( i = 0; i < maxRadix; i++ ) + { + formattedStore(kernelString, i, i*groupSize, dataFormat); + } + kernelString += string("}\n"); + + lMemSize = (N + numWorkItemsPerXForm) * numXFormsPerWG; + } + + return lMemSize; +} + +static void +insertfftKernel(string &kernelString, int Nr, int numIter) +{ + int i; + for(i = 0; i < numIter; i++) + { + kernelString += string(" fftKernel") + num2str(Nr) + string("(a+") + num2str(i*Nr) + string(", dir);\n"); + } +} + +static void +insertTwiddleKernel(string &kernelString, int Nr, int numIter, int Nprev, int len, int numWorkItemsPerXForm) +{ + int z, k; + int logNPrev = (int)log2(Nprev); + + for(z = 0; z < numIter; z++) + { + if(z == 0) + { + if(Nprev > 1) + kernelString += string(" angf = (float) (ii >> ") + num2str(logNPrev) + string(");\n"); + else + kernelString += string(" angf = (float) ii;\n"); + } + else + { + if(Nprev > 1) + kernelString += string(" angf = (float) ((") + num2str(z*numWorkItemsPerXForm) + string(" + ii) >>") + num2str(logNPrev) + string(");\n"); + else + kernelString += string(" angf = (float) (") + num2str(z*numWorkItemsPerXForm) + string(" + ii);\n"); + } + + for(k = 1; k < Nr; k++) { + int ind = z*Nr + k; + //float fac = (float) (2.0 * M_PI * (double) k / (double) len); + kernelString += string(" ang = dir * ( 2.0f * M_PI * ") + num2str(k) + string(".0f / ") + num2str(len) + string(".0f )") + string(" * angf;\n"); + kernelString += string(" w = (float2)(native_cos(ang), native_sin(ang));\n"); + kernelString += string(" a[") + num2str(ind) + string("] = complexMul(a[") + num2str(ind) + string("], w);\n"); + } + } +} + +static int +getPadding(int numWorkItemsPerXForm, int Nprev, int numWorkItemsReq, int numXFormsPerWG, int Nr, int numBanks, int *offset, int *midPad) +{ + if((numWorkItemsPerXForm <= Nprev) || (Nprev >= numBanks)) + *offset = 0; + else { + int numRowsReq = ((numWorkItemsPerXForm < numBanks) ? numWorkItemsPerXForm : numBanks) / Nprev; + int numColsReq = 1; + if(numRowsReq > Nr) + numColsReq = numRowsReq / Nr; + numColsReq = Nprev * numColsReq; + *offset = numColsReq; + } + + if(numWorkItemsPerXForm >= numBanks || numXFormsPerWG == 1) + *midPad = 0; + else { + int bankNum = ( (numWorkItemsReq + *offset) * Nr ) & (numBanks - 1); + if( bankNum >= numWorkItemsPerXForm ) + *midPad = 0; + else + *midPad = numWorkItemsPerXForm - bankNum; + } + + int lMemSize = ( numWorkItemsReq + *offset) * Nr * numXFormsPerWG + *midPad * (numXFormsPerWG - 1); + return lMemSize; +} + + +static void +insertLocalStores(string &kernelString, int numIter, int Nr, int numWorkItemsPerXForm, int numWorkItemsReq, int offset, string &comp) +{ + int z, k; + + for(z = 0; z < numIter; z++) { + for(k = 0; k < Nr; k++) { + int index = k*(numWorkItemsReq + offset) + z*numWorkItemsPerXForm; + kernelString += string(" lMemStore[") + num2str(index) + string("] = a[") + num2str(z*Nr + k) + string("].") + comp + string(";\n"); + } + } + kernelString += string(" barrier(CLK_LOCAL_MEM_FENCE);\n"); +} + +static void +insertLocalLoads(string &kernelString, int n, int Nr, int Nrn, int Nprev, int Ncurr, int numWorkItemsPerXForm, int numWorkItemsReq, int offset, string &comp) +{ + int numWorkItemsReqN = n / Nrn; + int interBlockHNum = max( Nprev / numWorkItemsPerXForm, 1 ); + int interBlockHStride = numWorkItemsPerXForm; + int vertWidth = max(numWorkItemsPerXForm / Nprev, 1); + vertWidth = min( vertWidth, Nr); + int vertNum = Nr / vertWidth; + int vertStride = ( n / Nr + offset ) * vertWidth; + int iter = max( numWorkItemsReqN / numWorkItemsPerXForm, 1); + int intraBlockHStride = (numWorkItemsPerXForm / (Nprev*Nr)) > 1 ? (numWorkItemsPerXForm / (Nprev*Nr)) : 1; + intraBlockHStride *= Nprev; + + int stride = numWorkItemsReq / Nrn; + int i; + for(i = 0; i < iter; i++) { + int ii = i / (interBlockHNum * vertNum); + int zz = i % (interBlockHNum * vertNum); + int jj = zz % interBlockHNum; + int kk = zz / interBlockHNum; + int z; + for(z = 0; z < Nrn; z++) { + int st = kk * vertStride + jj * interBlockHStride + ii * intraBlockHStride + z * stride; + kernelString += string(" a[") + num2str(i*Nrn + z) + string("].") + comp + string(" = lMemLoad[") + num2str(st) + string("];\n"); + } + } + kernelString += string(" barrier(CLK_LOCAL_MEM_FENCE);\n"); +} + +static void +insertLocalLoadIndexArithmatic(string &kernelString, int Nprev, int Nr, int numWorkItemsReq, int numWorkItemsPerXForm, int numXFormsPerWG, int offset, int midPad) +{ + int Ncurr = Nprev * Nr; + int logNcurr = (int)log2(Ncurr); + int logNprev = (int)log2(Nprev); + int incr = (numWorkItemsReq + offset) * Nr + midPad; + + if(Ncurr < numWorkItemsPerXForm) + { + if(Nprev == 1) + kernelString += string(" j = ii & ") + num2str(Ncurr - 1) + string(";\n"); + else + kernelString += string(" j = (ii & ") + num2str(Ncurr - 1) + string(") >> ") + num2str(logNprev) + string(";\n"); + + if(Nprev == 1) + kernelString += string(" i = ii >> ") + num2str(logNcurr) + string(";\n"); + else + kernelString += string(" i = mad24(ii >> ") + num2str(logNcurr) + string(", ") + num2str(Nprev) + string(", ii & ") + num2str(Nprev - 1) + string(");\n"); + } + else + { + if(Nprev == 1) + kernelString += string(" j = ii;\n"); + else + kernelString += string(" j = ii >> ") + num2str(logNprev) + string(";\n"); + if(Nprev == 1) + kernelString += string(" i = 0;\n"); + else + kernelString += string(" i = ii & ") + num2str(Nprev - 1) + string(";\n"); + } + + if(numXFormsPerWG > 1) + kernelString += string(" i = mad24(jj, ") + num2str(incr) + string(", i);\n"); + + kernelString += string(" lMemLoad = sMem + mad24(j, ") + num2str(numWorkItemsReq + offset) + string(", i);\n"); +} + +static void +insertLocalStoreIndexArithmatic(string &kernelString, int numWorkItemsReq, int numXFormsPerWG, int Nr, int offset, int midPad) +{ + if(numXFormsPerWG == 1) { + kernelString += string(" lMemStore = sMem + ii;\n"); + } + else { + kernelString += string(" lMemStore = sMem + mad24(jj, ") + num2str((numWorkItemsReq + offset)*Nr + midPad) + string(", ii);\n"); + } +} + + +static void +createLocalMemfftKernelString(cl_fft_plan *plan) +{ + unsigned int radixArray[10]; + unsigned int numRadix; + + unsigned int n = plan->n.x; + + assert(n <= plan->max_work_item_per_workgroup * plan->max_radix && "signal lenght too big for local mem fft\n"); + + getRadixArray(n, radixArray, &numRadix, 0); + assert(numRadix > 0 && "no radix array supplied\n"); + + if(n/radixArray[0] > plan->max_work_item_per_workgroup) + getRadixArray(n, radixArray, &numRadix, plan->max_radix); + + assert(radixArray[0] <= plan->max_radix && "max radix choosen is greater than allowed\n"); + assert(n/radixArray[0] <= plan->max_work_item_per_workgroup && "required work items per xform greater than maximum work items allowed per work group for local mem fft\n"); + + unsigned int tmpLen = 1; + unsigned int i; + for(i = 0; i < numRadix; i++) + { + assert( radixArray[i] && !( (radixArray[i] - 1) & radixArray[i] ) ); + tmpLen *= radixArray[i]; + } + assert(tmpLen == n && "product of radices choosen doesnt match the length of signal\n"); + + int offset, midPad; + string localString(""), kernelName(""); + + clFFT_DataFormat dataFormat = plan->format; + string *kernelString = plan->kernel_string; + + + cl_fft_kernel_info **kInfo = &plan->kernel_info; + int kCount = 0; + + while(*kInfo) + { + kInfo = &(*kInfo)->next; + kCount++; + } + + kernelName = string("fft") + num2str(kCount); + + *kInfo = (cl_fft_kernel_info *) malloc(sizeof(cl_fft_kernel_info)); + (*kInfo)->kernel = 0; + (*kInfo)->lmem_size = 0; + (*kInfo)->num_workgroups = 0; + (*kInfo)->num_workitems_per_workgroup = 0; + (*kInfo)->dir = cl_fft_kernel_x; + (*kInfo)->in_place_possible = 1; + (*kInfo)->next = NULL; + (*kInfo)->kernel_name = (char *) malloc(sizeof(char)*(kernelName.size()+1)); + strcpy((*kInfo)->kernel_name, kernelName.c_str()); + + unsigned int numWorkItemsPerXForm = n / radixArray[0]; + unsigned int numWorkItemsPerWG = numWorkItemsPerXForm <= 64 ? 64 : numWorkItemsPerXForm; + assert(numWorkItemsPerWG <= plan->max_work_item_per_workgroup); + int numXFormsPerWG = numWorkItemsPerWG / numWorkItemsPerXForm; + (*kInfo)->num_workgroups = 1; + (*kInfo)->num_xforms_per_workgroup = numXFormsPerWG; + (*kInfo)->num_workitems_per_workgroup = numWorkItemsPerWG; + + unsigned int *N = radixArray; + unsigned int maxRadix = N[0]; + unsigned int lMemSize = 0; + + insertVariables(localString, maxRadix); + + lMemSize = insertGlobalLoadsAndTranspose(localString, n, numWorkItemsPerXForm, numXFormsPerWG, maxRadix, plan->min_mem_coalesce_width, dataFormat); + (*kInfo)->lmem_size = (lMemSize > (*kInfo)->lmem_size) ? lMemSize : (*kInfo)->lmem_size; + + string xcomp = string("x"); + string ycomp = string("y"); + + unsigned int Nprev = 1; + unsigned int len = n; + unsigned int r; + for(r = 0; r < numRadix; r++) + { + int numIter = N[0] / N[r]; + int numWorkItemsReq = n / N[r]; + int Ncurr = Nprev * N[r]; + insertfftKernel(localString, N[r], numIter); + + if(r < (numRadix - 1)) { + insertTwiddleKernel(localString, N[r], numIter, Nprev, len, numWorkItemsPerXForm); + lMemSize = getPadding(numWorkItemsPerXForm, Nprev, numWorkItemsReq, numXFormsPerWG, N[r], plan->num_local_mem_banks, &offset, &midPad); + (*kInfo)->lmem_size = (lMemSize > (*kInfo)->lmem_size) ? lMemSize : (*kInfo)->lmem_size; + insertLocalStoreIndexArithmatic(localString, numWorkItemsReq, numXFormsPerWG, N[r], offset, midPad); + insertLocalLoadIndexArithmatic(localString, Nprev, N[r], numWorkItemsReq, numWorkItemsPerXForm, numXFormsPerWG, offset, midPad); + insertLocalStores(localString, numIter, N[r], numWorkItemsPerXForm, numWorkItemsReq, offset, xcomp); + insertLocalLoads(localString, n, N[r], N[r+1], Nprev, Ncurr, numWorkItemsPerXForm, numWorkItemsReq, offset, xcomp); + insertLocalStores(localString, numIter, N[r], numWorkItemsPerXForm, numWorkItemsReq, offset, ycomp); + insertLocalLoads(localString, n, N[r], N[r+1], Nprev, Ncurr, numWorkItemsPerXForm, numWorkItemsReq, offset, ycomp); + Nprev = Ncurr; + len = len / N[r]; + } + } + + lMemSize = insertGlobalStoresAndTranspose(localString, n, maxRadix, N[numRadix - 1], numWorkItemsPerXForm, numXFormsPerWG, plan->min_mem_coalesce_width, dataFormat); + (*kInfo)->lmem_size = (lMemSize > (*kInfo)->lmem_size) ? lMemSize : (*kInfo)->lmem_size; + + insertHeader(*kernelString, kernelName, dataFormat); + *kernelString += string("{\n"); + if((*kInfo)->lmem_size) + *kernelString += string(" __local float sMem[") + num2str((*kInfo)->lmem_size) + string("];\n"); + *kernelString += localString; + *kernelString += string("}\n"); +} + +// For n larger than what can be computed using local memory fft, global transposes +// multiple kernel launces is needed. For these sizes, n can be decomposed using +// much larger base radices i.e. say n = 262144 = 128 x 64 x 32. Thus three kernel +// launches will be needed, first computing 64 x 32, length 128 ffts, second computing +// 128 x 32 length 64 ffts, and finally a kernel computing 128 x 64 length 32 ffts. +// Each of these base radices can futher be divided into factors so that each of these +// base ffts can be computed within one kernel launch using in-register ffts and local +// memory transposes i.e for the first kernel above which computes 64 x 32 ffts on length +// 128, 128 can be decomposed into 128 = 16 x 8 i.e. 8 work items can compute 8 length +// 16 ffts followed by transpose using local memory followed by each of these eight +// work items computing 2 length 8 ffts thus computing 16 length 8 ffts in total. This +// means only 8 work items are needed for computing one length 128 fft. If we choose +// work group size of say 64, we can compute 64/8 = 8 length 128 ffts within one +// work group. Since we need to compute 64 x 32 length 128 ffts in first kernel, this +// means we need to launch 64 x 32 / 8 = 256 work groups with 64 work items in each +// work group where each work group is computing 8 length 128 ffts where each length +// 128 fft is computed by 8 work items. Same logic can be applied to other two kernels +// in this example. Users can play with difference base radices and difference +// decompositions of base radices to generates different kernels and see which gives +// best performance. Following function is just fixed to use 128 as base radix + +void +getGlobalRadixInfo(int n, int *radix, int *R1, int *R2, int *numRadices) +{ + int baseRadix = min(n, 128); + + int numR = 0; + int N = n; + while(N > baseRadix) + { + N /= baseRadix; + numR++; + } + + for(int i = 0; i < numR; i++) + radix[i] = baseRadix; + + radix[numR] = N; + numR++; + *numRadices = numR; + + for(int i = 0; i < numR; i++) + { + int B = radix[i]; + if(B <= 8) + { + R1[i] = B; + R2[i] = 1; + continue; + } + + int r1 = 2; + int r2 = B / r1; + while(r2 > r1) + { + r1 *=2; + r2 = B / r1; + } + R1[i] = r1; + R2[i] = r2; + } +} + +static void +createGlobalFFTKernelString(cl_fft_plan *plan, int n, int BS, cl_fft_kernel_dir dir, int vertBS) +{ + int i, j, k, t; + int radixArr[10] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + int R1Arr[10] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + int R2Arr[10] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + int radix, R1, R2; + int numRadices; + + int maxThreadsPerBlock = plan->max_work_item_per_workgroup; + int maxArrayLen = plan->max_radix; + int batchSize = plan->min_mem_coalesce_width; + clFFT_DataFormat dataFormat = plan->format; + int vertical = (dir == cl_fft_kernel_x) ? 0 : 1; + + getGlobalRadixInfo(n, radixArr, R1Arr, R2Arr, &numRadices); + + int numPasses = numRadices; + + string localString(""), kernelName(""); + string *kernelString = plan->kernel_string; + cl_fft_kernel_info **kInfo = &plan->kernel_info; + int kCount = 0; + + while(*kInfo) + { + kInfo = &(*kInfo)->next; + kCount++; + } + + int N = n; + int m = (int)log2(n); + int Rinit = vertical ? BS : 1; + batchSize = vertical ? min(BS, batchSize) : batchSize; + int passNum; + + for(passNum = 0; passNum < numPasses; passNum++) + { + + localString.clear(); + kernelName.clear(); + + radix = radixArr[passNum]; + R1 = R1Arr[passNum]; + R2 = R2Arr[passNum]; + + int strideI = Rinit; + for(i = 0; i < numPasses; i++) + if(i != passNum) + strideI *= radixArr[i]; + + int strideO = Rinit; + for(i = 0; i < passNum; i++) + strideO *= radixArr[i]; + + int threadsPerXForm = R2; + batchSize = R2 == 1 ? plan->max_work_item_per_workgroup : batchSize; + batchSize = min(batchSize, strideI); + int threadsPerBlock = batchSize * threadsPerXForm; + threadsPerBlock = min(threadsPerBlock, maxThreadsPerBlock); + batchSize = threadsPerBlock / threadsPerXForm; + assert(R2 <= R1); + assert(R1*R2 == radix); + assert(R1 <= maxArrayLen); + assert(threadsPerBlock <= maxThreadsPerBlock); + + int numIter = R1 / R2; + int gInInc = threadsPerBlock / batchSize; + + + int lgStrideO = (int)log2(strideO); + int numBlocksPerXForm = strideI / batchSize; + int numBlocks = numBlocksPerXForm; + if(!vertical) + numBlocks *= BS; + else + numBlocks *= vertBS; + + kernelName = string("fft") + num2str(kCount); + *kInfo = (cl_fft_kernel_info *) malloc(sizeof(cl_fft_kernel_info)); + (*kInfo)->kernel = 0; + if(R2 == 1) + (*kInfo)->lmem_size = 0; + else + { + if(strideO == 1) + (*kInfo)->lmem_size = (radix + 1)*batchSize; + else + (*kInfo)->lmem_size = threadsPerBlock*R1; + } + (*kInfo)->num_workgroups = numBlocks; + (*kInfo)->num_xforms_per_workgroup = 1; + (*kInfo)->num_workitems_per_workgroup = threadsPerBlock; + (*kInfo)->dir = dir; + if( (passNum == (numPasses - 1)) && (numPasses & 1) ) + (*kInfo)->in_place_possible = 1; + else + (*kInfo)->in_place_possible = 0; + (*kInfo)->next = NULL; + (*kInfo)->kernel_name = (char *) malloc(sizeof(char)*(kernelName.size()+1)); + strcpy((*kInfo)->kernel_name, kernelName.c_str()); + + insertVariables(localString, R1); + + if(vertical) + { + localString += string("xNum = groupId >> ") + num2str((int)log2(numBlocksPerXForm)) + string(";\n"); + localString += string("groupId = groupId & ") + num2str(numBlocksPerXForm - 1) + string(";\n"); + localString += string("indexIn = mad24(groupId, ") + num2str(batchSize) + string(", xNum << ") + num2str((int)log2(n*BS)) + string(");\n"); + localString += string("tid = mul24(groupId, ") + num2str(batchSize) + string(");\n"); + localString += string("i = tid >> ") + num2str(lgStrideO) + string(";\n"); + localString += string("j = tid & ") + num2str(strideO - 1) + string(";\n"); + int stride = radix*Rinit; + for(i = 0; i < passNum; i++) + stride *= radixArr[i]; + localString += string("indexOut = mad24(i, ") + num2str(stride) + string(", j + ") + string("(xNum << ") + num2str((int) log2(n*BS)) + string("));\n"); + localString += string("bNum = groupId;\n"); + } + else + { + int lgNumBlocksPerXForm = (int)log2(numBlocksPerXForm); + localString += string("bNum = groupId & ") + num2str(numBlocksPerXForm - 1) + string(";\n"); + localString += string("xNum = groupId >> ") + num2str(lgNumBlocksPerXForm) + string(";\n"); + localString += string("indexIn = mul24(bNum, ") + num2str(batchSize) + string(");\n"); + localString += string("tid = indexIn;\n"); + localString += string("i = tid >> ") + num2str(lgStrideO) + string(";\n"); + localString += string("j = tid & ") + num2str(strideO - 1) + string(";\n"); + int stride = radix*Rinit; + for(i = 0; i < passNum; i++) + stride *= radixArr[i]; + localString += string("indexOut = mad24(i, ") + num2str(stride) + string(", j);\n"); + localString += string("indexIn += (xNum << ") + num2str(m) + string(");\n"); + localString += string("indexOut += (xNum << ") + num2str(m) + string(");\n"); + } + + // Load Data + int lgBatchSize = (int)log2(batchSize); + localString += string("tid = lId;\n"); + localString += string("i = tid & ") + num2str(batchSize - 1) + string(";\n"); + localString += string("j = tid >> ") + num2str(lgBatchSize) + string(";\n"); + localString += string("indexIn += mad24(j, ") + num2str(strideI) + string(", i);\n"); + + if(dataFormat == clFFT_SplitComplexFormat) + { + localString += string("in_real += indexIn;\n"); + localString += string("in_imag += indexIn;\n"); + for(j = 0; j < R1; j++) + localString += string("a[") + num2str(j) + string("].x = in_real[") + num2str(j*gInInc*strideI) + string("];\n"); + for(j = 0; j < R1; j++) + localString += string("a[") + num2str(j) + string("].y = in_imag[") + num2str(j*gInInc*strideI) + string("];\n"); + } + else + { + localString += string("in += indexIn;\n"); + for(j = 0; j < R1; j++) + localString += string("a[") + num2str(j) + string("] = in[") + num2str(j*gInInc*strideI) + string("];\n"); + } + + localString += string("fftKernel") + num2str(R1) + string("(a, dir);\n"); + + if(R2 > 1) + { + // twiddle + for(k = 1; k < R1; k++) + { + localString += string("ang = dir*(2.0f*M_PI*") + num2str(k) + string("/") + num2str(radix) + string(")*j;\n"); + localString += string("w = (float2)(native_cos(ang), native_sin(ang));\n"); + localString += string("a[") + num2str(k) + string("] = complexMul(a[") + num2str(k) + string("], w);\n"); + } + + // shuffle + numIter = R1 / R2; + localString += string("indexIn = mad24(j, ") + num2str(threadsPerBlock*numIter) + string(", i);\n"); + localString += string("lMemStore = sMem + tid;\n"); + localString += string("lMemLoad = sMem + indexIn;\n"); + for(k = 0; k < R1; k++) + localString += string("lMemStore[") + num2str(k*threadsPerBlock) + string("] = a[") + num2str(k) + string("].x;\n"); + localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n"); + for(k = 0; k < numIter; k++) + for(t = 0; t < R2; t++) + localString += string("a[") + num2str(k*R2+t) + string("].x = lMemLoad[") + num2str(t*batchSize + k*threadsPerBlock) + string("];\n"); + localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n"); + for(k = 0; k < R1; k++) + localString += string("lMemStore[") + num2str(k*threadsPerBlock) + string("] = a[") + num2str(k) + string("].y;\n"); + localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n"); + for(k = 0; k < numIter; k++) + for(t = 0; t < R2; t++) + localString += string("a[") + num2str(k*R2+t) + string("].y = lMemLoad[") + num2str(t*batchSize + k*threadsPerBlock) + string("];\n"); + localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n"); + + for(j = 0; j < numIter; j++) + localString += string("fftKernel") + num2str(R2) + string("(a + ") + num2str(j*R2) + string(", dir);\n"); + } + + // twiddle + if(passNum < (numPasses - 1)) + { + localString += string("l = ((bNum << ") + num2str(lgBatchSize) + string(") + i) >> ") + num2str(lgStrideO) + string(";\n"); + localString += string("k = j << ") + num2str((int)log2(R1/R2)) + string(";\n"); + localString += string("ang1 = dir*(2.0f*M_PI/") + num2str(N) + string(")*l;\n"); + for(t = 0; t < R1; t++) + { + localString += string("ang = ang1*(k + ") + num2str((t%R2)*R1 + (t/R2)) + string(");\n"); + localString += string("w = (float2)(native_cos(ang), native_sin(ang));\n"); + localString += string("a[") + num2str(t) + string("] = complexMul(a[") + num2str(t) + string("], w);\n"); + } + } + + // Store Data + if(strideO == 1) + { + + localString += string("lMemStore = sMem + mad24(i, ") + num2str(radix + 1) + string(", j << ") + num2str((int)log2(R1/R2)) + string(");\n"); + localString += string("lMemLoad = sMem + mad24(tid >> ") + num2str((int)log2(radix)) + string(", ") + num2str(radix+1) + string(", tid & ") + num2str(radix-1) + string(");\n"); + + for(i = 0; i < R1/R2; i++) + for(j = 0; j < R2; j++) + localString += string("lMemStore[ ") + num2str(i + j*R1) + string("] = a[") + num2str(i*R2+j) + string("].x;\n"); + localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n"); + if(threadsPerBlock >= radix) + { + for(i = 0; i < R1; i++) + localString += string("a[") + num2str(i) + string("].x = lMemLoad[") + num2str(i*(radix+1)*(threadsPerBlock/radix)) + string("];\n"); + } + else + { + int innerIter = radix/threadsPerBlock; + int outerIter = R1/innerIter; + for(i = 0; i < outerIter; i++) + for(j = 0; j < innerIter; j++) + localString += string("a[") + num2str(i*innerIter+j) + string("].x = lMemLoad[") + num2str(j*threadsPerBlock + i*(radix+1)) + string("];\n"); + } + localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n"); + + for(i = 0; i < R1/R2; i++) + for(j = 0; j < R2; j++) + localString += string("lMemStore[ ") + num2str(i + j*R1) + string("] = a[") + num2str(i*R2+j) + string("].y;\n"); + localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n"); + if(threadsPerBlock >= radix) + { + for(i = 0; i < R1; i++) + localString += string("a[") + num2str(i) + string("].y = lMemLoad[") + num2str(i*(radix+1)*(threadsPerBlock/radix)) + string("];\n"); + } + else + { + int innerIter = radix/threadsPerBlock; + int outerIter = R1/innerIter; + for(i = 0; i < outerIter; i++) + for(j = 0; j < innerIter; j++) + localString += string("a[") + num2str(i*innerIter+j) + string("].y = lMemLoad[") + num2str(j*threadsPerBlock + i*(radix+1)) + string("];\n"); + } + localString += string("barrier(CLK_LOCAL_MEM_FENCE);\n"); + + localString += string("indexOut += tid;\n"); + if(dataFormat == clFFT_SplitComplexFormat) { + localString += string("out_real += indexOut;\n"); + localString += string("out_imag += indexOut;\n"); + for(k = 0; k < R1; k++) + localString += string("out_real[") + num2str(k*threadsPerBlock) + string("] = a[") + num2str(k) + string("].x;\n"); + for(k = 0; k < R1; k++) + localString += string("out_imag[") + num2str(k*threadsPerBlock) + string("] = a[") + num2str(k) + string("].y;\n"); + } + else { + localString += string("out += indexOut;\n"); + for(k = 0; k < R1; k++) + localString += string("out[") + num2str(k*threadsPerBlock) + string("] = a[") + num2str(k) + string("];\n"); + } + + } + else + { + localString += string("indexOut += mad24(j, ") + num2str(numIter*strideO) + string(", i);\n"); + if(dataFormat == clFFT_SplitComplexFormat) { + localString += string("out_real += indexOut;\n"); + localString += string("out_imag += indexOut;\n"); + for(k = 0; k < R1; k++) + localString += string("out_real[") + num2str(((k%R2)*R1 + (k/R2))*strideO) + string("] = a[") + num2str(k) + string("].x;\n"); + for(k = 0; k < R1; k++) + localString += string("out_imag[") + num2str(((k%R2)*R1 + (k/R2))*strideO) + string("] = a[") + num2str(k) + string("].y;\n"); + } + else { + localString += string("out += indexOut;\n"); + for(k = 0; k < R1; k++) + localString += string("out[") + num2str(((k%R2)*R1 + (k/R2))*strideO) + string("] = a[") + num2str(k) + string("];\n"); + } + } + + insertHeader(*kernelString, kernelName, dataFormat); + *kernelString += string("{\n"); + if((*kInfo)->lmem_size) + *kernelString += string(" __local float sMem[") + num2str((*kInfo)->lmem_size) + string("];\n"); + *kernelString += localString; + *kernelString += string("}\n"); + + N /= radix; + kInfo = &(*kInfo)->next; + kCount++; + } +} + +void FFT1D(cl_fft_plan *plan, cl_fft_kernel_dir dir) +{ + unsigned int radixArray[10]; + unsigned int numRadix; + + switch(dir) + { + case cl_fft_kernel_x: + if(plan->n.x > plan->max_localmem_fft_size) + { + createGlobalFFTKernelString(plan, plan->n.x, 1, cl_fft_kernel_x, 1); + } + else if(plan->n.x > 1) + { + getRadixArray(plan->n.x, radixArray, &numRadix, 0); + if(plan->n.x / radixArray[0] <= plan->max_work_item_per_workgroup) + { + createLocalMemfftKernelString(plan); + } + else + { + getRadixArray(plan->n.x, radixArray, &numRadix, plan->max_radix); + if(plan->n.x / radixArray[0] <= plan->max_work_item_per_workgroup) + createLocalMemfftKernelString(plan); + else + createGlobalFFTKernelString(plan, plan->n.x, 1, cl_fft_kernel_x, 1); + } + } + break; + + case cl_fft_kernel_y: + if(plan->n.y > 1) + createGlobalFFTKernelString(plan, plan->n.y, plan->n.x, cl_fft_kernel_y, 1); + break; + + case cl_fft_kernel_z: + if(plan->n.z > 1) + createGlobalFFTKernelString(plan, plan->n.z, plan->n.x*plan->n.y, cl_fft_kernel_z, 1); + default: + return; + } +} + diff --git a/src/algorithms/libs/fft_setup.cc b/src/algorithms/libs/fft_setup.cc new file mode 100644 index 000000000..6d2025d66 --- /dev/null +++ b/src/algorithms/libs/fft_setup.cc @@ -0,0 +1,402 @@ + +// +// File: fft_setup.cpp +// +// Version: <1.0> +// +// Disclaimer: IMPORTANT: This Apple software is supplied to you by Apple Inc. ("Apple") +// in consideration of your agreement to the following terms, and your use, +// installation, modification or redistribution of this Apple software +// constitutes acceptance of these terms. If you do not agree with these +// terms, please do not use, install, modify or redistribute this Apple +// software. +// +// In consideration of your agreement to abide by the following terms, and +// subject to these terms, Apple grants you a personal, non - exclusive +// license, under Apple's copyrights in this original Apple software ( the +// "Apple Software" ), to use, reproduce, modify and redistribute the Apple +// Software, with or without modifications, in source and / or binary forms; +// provided that if you redistribute the Apple Software in its entirety and +// without modifications, you must retain this notice and the following text +// and disclaimers in all such redistributions of the Apple Software. Neither +// the name, trademarks, service marks or logos of Apple Inc. may be used to +// endorse or promote products derived from the Apple Software without specific +// prior written permission from Apple. Except as expressly stated in this +// notice, no other rights or licenses, express or implied, are granted by +// Apple herein, including but not limited to any patent rights that may be +// infringed by your derivative works or by other works in which the Apple +// Software may be incorporated. +// +// The Apple Software is provided by Apple on an "AS IS" basis. APPLE MAKES NO +// WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED +// WARRANTIES OF NON - INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A +// PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND OPERATION +// ALONE OR IN COMBINATION WITH YOUR PRODUCTS. +// +// IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR +// CONSEQUENTIAL DAMAGES ( INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION ) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION +// AND / OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED AND WHETHER +// UNDER THEORY OF CONTRACT, TORT ( INCLUDING NEGLIGENCE ), STRICT LIABILITY OR +// OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Copyright ( C ) 2008 Apple Inc. All Rights Reserved. +// +//////////////////////////////////////////////////////////////////////////////////////////////////// + + +#include "fft_internal.h" +#include "fft_base_kernels.h" +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace std; + +extern void getKernelWorkDimensions(cl_fft_plan *plan, cl_fft_kernel_info *kernelInfo, cl_int *batchSize, size_t *gWorkItems, size_t *lWorkItems); + +static void +getBlockConfigAndKernelString(cl_fft_plan *plan) +{ + plan->temp_buffer_needed = 0; + *plan->kernel_string += baseKernels; + + if(plan->format == clFFT_SplitComplexFormat) + *plan->kernel_string += twistKernelPlannar; + else + *plan->kernel_string += twistKernelInterleaved; + + switch(plan->dim) + { + case clFFT_1D: + FFT1D(plan, cl_fft_kernel_x); + break; + + case clFFT_2D: + FFT1D(plan, cl_fft_kernel_x); + FFT1D(plan, cl_fft_kernel_y); + break; + + case clFFT_3D: + FFT1D(plan, cl_fft_kernel_x); + FFT1D(plan, cl_fft_kernel_y); + FFT1D(plan, cl_fft_kernel_z); + break; + + default: + return; + } + + plan->temp_buffer_needed = 0; + cl_fft_kernel_info *kInfo = plan->kernel_info; + while(kInfo) + { + plan->temp_buffer_needed |= !kInfo->in_place_possible; + kInfo = kInfo->next; + } +} + + +static void +deleteKernelInfo(cl_fft_kernel_info *kInfo) +{ + if(kInfo) + { + if(kInfo->kernel_name) + free(kInfo->kernel_name); + if(kInfo->kernel) + clReleaseKernel(kInfo->kernel); + free(kInfo); + } +} + +static void +destroy_plan(cl_fft_plan *Plan) +{ + cl_fft_kernel_info *kernel_info = Plan->kernel_info; + + while(kernel_info) + { + cl_fft_kernel_info *tmp = kernel_info->next; + deleteKernelInfo(kernel_info); + kernel_info = tmp; + } + + Plan->kernel_info = NULL; + + if(Plan->kernel_string) + { + delete Plan->kernel_string; + Plan->kernel_string = NULL; + } + if(Plan->twist_kernel) + { + clReleaseKernel(Plan->twist_kernel); + Plan->twist_kernel = NULL; + } + if(Plan->program) + { + clReleaseProgram(Plan->program); + Plan->program = NULL; + } + if(Plan->tempmemobj) + { + clReleaseMemObject(Plan->tempmemobj); + Plan->tempmemobj = NULL; + } + if(Plan->tempmemobj_real) + { + clReleaseMemObject(Plan->tempmemobj_real); + Plan->tempmemobj_real = NULL; + } + if(Plan->tempmemobj_imag) + { + clReleaseMemObject(Plan->tempmemobj_imag); + Plan->tempmemobj_imag = NULL; + } +} + +static int +createKernelList(cl_fft_plan *plan) +{ + cl_program program = plan->program; + cl_fft_kernel_info *kernel_info = plan->kernel_info; + + cl_int err; + while(kernel_info) + { + kernel_info->kernel = clCreateKernel(program, kernel_info->kernel_name, &err); + if(!kernel_info->kernel || err != CL_SUCCESS) + return err; + kernel_info = kernel_info->next; + } + + if(plan->format == clFFT_SplitComplexFormat) + plan->twist_kernel = clCreateKernel(program, "clFFT_1DTwistSplit", &err); + else + plan->twist_kernel = clCreateKernel(program, "clFFT_1DTwistInterleaved", &err); + + if(!plan->twist_kernel || err) + return err; + + return CL_SUCCESS; +} + +int getMaxKernelWorkGroupSize(cl_fft_plan *plan, unsigned int *max_wg_size, unsigned int num_devices, cl_device_id *devices) +{ + int reg_needed = 0; + *max_wg_size = std::numeric_limits::max(); + int err; + unsigned wg_size; + + unsigned int i; + for(i = 0; i < num_devices; i++) + { + cl_fft_kernel_info *kInfo = plan->kernel_info; + while(kInfo) + { + err = clGetKernelWorkGroupInfo(kInfo->kernel, devices[i], CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &wg_size, NULL); + if(err != CL_SUCCESS) + return -1; + + if(wg_size < kInfo->num_workitems_per_workgroup) + reg_needed |= 1; + + if(*max_wg_size > wg_size) + *max_wg_size = wg_size; + + kInfo = kInfo->next; + } + } + + return reg_needed; +} + +#define ERR_MACRO(err) { \ + if( err != CL_SUCCESS) \ + { \ + if(error_code) \ + *error_code = err; \ + clFFT_DestroyPlan((clFFT_Plan) plan); \ + return (clFFT_Plan) NULL; \ + } \ + } + +clFFT_Plan +clFFT_CreatePlan(cl_context context, clFFT_Dim3 n, clFFT_Dimension dim, clFFT_DataFormat dataFormat, cl_int *error_code ) +{ + int i; + cl_int err; + int isPow2 = 1; + cl_fft_plan *plan = NULL; + ostringstream kString; + int num_devices; + int gpu_found = 0; + cl_device_id devices[16]; + size_t ret_size; + cl_device_type device_type; + + if(!context) + ERR_MACRO(CL_INVALID_VALUE); + + isPow2 |= n.x && !( (n.x - 1) & n.x ); + isPow2 |= n.y && !( (n.y - 1) & n.y ); + isPow2 |= n.z && !( (n.z - 1) & n.z ); + + if(!isPow2) + ERR_MACRO(CL_INVALID_VALUE); + + if( (dim == clFFT_1D && (n.y != 1 || n.z != 1)) || (dim == clFFT_2D && n.z != 1) ) + ERR_MACRO(CL_INVALID_VALUE); + + plan = (cl_fft_plan *) malloc(sizeof(cl_fft_plan)); + if(!plan) + ERR_MACRO(CL_OUT_OF_RESOURCES); + + plan->context = context; + clRetainContext(context); + plan->n = n; + plan->dim = dim; + plan->format = dataFormat; + plan->kernel_info = 0; + plan->num_kernels = 0; + plan->twist_kernel = 0; + plan->program = 0; + plan->temp_buffer_needed = 0; + plan->last_batch_size = 0; + plan->tempmemobj = 0; + plan->tempmemobj_real = 0; + plan->tempmemobj_imag = 0; + plan->max_localmem_fft_size = 2048; + plan->max_work_item_per_workgroup = 256; + plan->max_radix = 16; + plan->min_mem_coalesce_width = 16; + plan->num_local_mem_banks = 16; + +patch_kernel_source: + + plan->kernel_string = new string(""); + if(!plan->kernel_string) + ERR_MACRO(CL_OUT_OF_RESOURCES); + + getBlockConfigAndKernelString(plan); + + const char *source_str = plan->kernel_string->c_str(); + plan->program = clCreateProgramWithSource(context, 1, (const char**) &source_str, NULL, &err); + ERR_MACRO(err); + + err = clGetContextInfo(context, CL_CONTEXT_DEVICES, sizeof(devices), devices, &ret_size); + ERR_MACRO(err); + + num_devices = (int)(ret_size / sizeof(cl_device_id)); + + for(i = 0; i < num_devices; i++) + { + err = clGetDeviceInfo(devices[i], CL_DEVICE_TYPE, sizeof(device_type), &device_type, NULL); + ERR_MACRO(err); + + if(device_type == CL_DEVICE_TYPE_GPU) + { + gpu_found = 1; + err = clBuildProgram(plan->program, 1, &devices[i], "-cl-mad-enable", NULL, NULL); + if (err != CL_SUCCESS) + { + char *build_log; + char devicename[200]; + size_t log_size; + + err = clGetProgramBuildInfo(plan->program, devices[i], CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); + ERR_MACRO(err); + + build_log = (char *) malloc(log_size + 1); + + err = clGetProgramBuildInfo(plan->program, devices[i], CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL); + ERR_MACRO(err); + + err = clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(devicename), devicename, NULL); + ERR_MACRO(err); + + fprintf(stdout, "FFT program build log on device %s\n", devicename); + fprintf(stdout, "%s\n", build_log); + free(build_log); + + ERR_MACRO(err); + } + } + } + + if(!gpu_found) + ERR_MACRO(CL_INVALID_CONTEXT); + + err = createKernelList(plan); + ERR_MACRO(err); + + // we created program and kernels based on "some max work group size (default 256)" ... this work group size + // may be larger than what kernel may execute with ... if thats the case we need to regenerate the kernel source + // setting this as limit i.e max group size and rebuild. + unsigned int max_kernel_wg_size; + int patching_req = getMaxKernelWorkGroupSize(plan, &max_kernel_wg_size, num_devices, devices); + if(patching_req == -1) + { + ERR_MACRO(err); + } + + if(patching_req) + { + destroy_plan(plan); + plan->max_work_item_per_workgroup = max_kernel_wg_size; + goto patch_kernel_source; + } + + cl_fft_kernel_info *kInfo = plan->kernel_info; + while(kInfo) + { + plan->num_kernels++; + kInfo = kInfo->next; + } + + if(error_code) + *error_code = CL_SUCCESS; + + return (clFFT_Plan) plan; +} + +void +clFFT_DestroyPlan(clFFT_Plan plan) +{ + cl_fft_plan *Plan = (cl_fft_plan *) plan; + if(Plan) + { + destroy_plan(Plan); + clReleaseContext(Plan->context); + free(Plan); + } +} + +void clFFT_DumpPlan( clFFT_Plan Plan, FILE *file) +{ + size_t gDim, lDim; + FILE *out; + if(!file) + out = stdout; + else + out = file; + + cl_fft_plan *plan = (cl_fft_plan *) Plan; + cl_fft_kernel_info *kInfo = plan->kernel_info; + + while(kInfo) + { + cl_int s = 1; + getKernelWorkDimensions(plan, kInfo, &s, &gDim, &lDim); + fprintf(out, "Run kernel %s with global dim = {%zd*BatchSize}, local dim={%zd}\n", kInfo->kernel_name, gDim, lDim); + kInfo = kInfo->next; + } + fprintf(out, "%s\n", plan->kernel_string->c_str()); +} diff --git a/src/algorithms/libs/galileo_e1_signal_processing.cc b/src/algorithms/libs/galileo_e1_signal_processing.cc index 9a0c2f895..2ad474e90 100644 --- a/src/algorithms/libs/galileo_e1_signal_processing.cc +++ b/src/algorithms/libs/galileo_e1_signal_processing.cc @@ -158,7 +158,7 @@ galileo_e1_code_gen_complex_sampled(std::complex* _dest, char _Signal[3], std::string _galileo_signal = _Signal; unsigned int _samplesPerCode; - const unsigned int _codeFreqBasis = Galileo_E1_CODE_CHIP_RATE_HZ; //Hz + const int _codeFreqBasis = Galileo_E1_CODE_CHIP_RATE_HZ; //Hz unsigned int _codeLength = Galileo_E1_B_CODE_LENGTH_CHIPS; int primary_code_E1_chips[(int)Galileo_E1_B_CODE_LENGTH_CHIPS]; _samplesPerCode = round(_fs / (_codeFreqBasis / _codeLength)); diff --git a/src/core/receiver/gnss_block_factory.cc b/src/core/receiver/gnss_block_factory.cc index 6d83544d3..1050991cb 100644 --- a/src/core/receiver/gnss_block_factory.cc +++ b/src/core/receiver/gnss_block_factory.cc @@ -54,6 +54,7 @@ #include "fir_filter.h" #include "freq_xlating_fir_filter.h" #include "gps_l1_ca_pcps_acquisition.h" +#include "gps_l1_ca_pcps_multithread_acquisition.h" #include "gps_l1_ca_pcps_tong_acquisition.h" #include "gps_l1_ca_pcps_assisted_acquisition.h" #include "gps_l1_ca_pcps_acquisition_fine_doppler.h" @@ -74,6 +75,10 @@ #include "gps_l1_ca_pvt.h" #include "galileo_e1_pvt.h" +#if OPENCL + #include "gps_l1_ca_pcps_opencl_acquisition.h" +#endif + #if GN3S_DRIVER #include "gn3s_signal_source.h" #endif @@ -346,9 +351,18 @@ GNSSBlockInterface* GNSSBlockFactory::GetBlock( } else if (implementation.compare("GPS_L1_CA_PCPS_Multithread_Acquisition") == 0) { - block = new GpsL1CaPcpsAcquisition(configuration, role, in_streams, + block = new GpsL1CaPcpsMultithreadAcquisition(configuration, role, in_streams, out_streams, queue); } + +#if OPENCL + else if (implementation.compare("GPS_L1_CA_PCPS_OpenCl_Acquisition") == 0) + { + block = new GpsL1CaPcpsOpenClAcquisition(configuration, role, in_streams, + out_streams, queue); + } +#endif + else if (implementation.compare("GPS_L1_CA_PCPS_Acquisition_Fine_Doppler") == 0) { block = new GpsL1CaPcpsAcquisitionFineDoppler(configuration, role, in_streams, diff --git a/src/tests/CMakeLists.txt b/src/tests/CMakeLists.txt index 8dbb1cc4c..04d079c46 100644 --- a/src/tests/CMakeLists.txt +++ b/src/tests/CMakeLists.txt @@ -129,6 +129,7 @@ add_executable(gnss_block_test EXCLUDE_FROM_ALL ${CMAKE_CURRENT_SOURCE_DIR}/gnss_block/gps_l1_ca_pcps_acquisition_test.cc ${CMAKE_CURRENT_SOURCE_DIR}/gnss_block/gps_l1_ca_pcps_acquisition_gsoc2013_test.cc ${CMAKE_CURRENT_SOURCE_DIR}/gnss_block/gps_l1_ca_pcps_multithread_acquisition_gsoc2013_test.cc + ${CMAKE_CURRENT_SOURCE_DIR}/gnss_block/gps_l1_ca_pcps_opencl_acquisition_gsoc2013_test.cc ${CMAKE_CURRENT_SOURCE_DIR}/gnss_block/gps_l1_ca_pcps_tong_acquisition_gsoc2013_test.cc ${CMAKE_CURRENT_SOURCE_DIR}/gnss_block/galileo_e1_pcps_ambiguous_acquisition_test.cc ${CMAKE_CURRENT_SOURCE_DIR}/gnss_block/galileo_e1_pcps_ambiguous_acquisition_gsoc_test.cc diff --git a/src/tests/gnss_block/galileo_e1_pcps_8ms_ambiguous_acquisition_gsoc2013_test.cc b/src/tests/gnss_block/galileo_e1_pcps_8ms_ambiguous_acquisition_gsoc2013_test.cc index d8272f0f1..041665ee6 100644 --- a/src/tests/gnss_block/galileo_e1_pcps_8ms_ambiguous_acquisition_gsoc2013_test.cc +++ b/src/tests/gnss_block/galileo_e1_pcps_8ms_ambiguous_acquisition_gsoc2013_test.cc @@ -238,14 +238,14 @@ void GalileoE1Pcps8msAmbiguousAcquisitionGSoC2013Test::config_2() config->set_property("SignalSource.doppler_Hz_1", "1000"); config->set_property("SignalSource.delay_chips_1", "100"); - config->set_property("SignalSource.system_2", "G"); - config->set_property("SignalSource.PRN_2", "10"); + config->set_property("SignalSource.system_2", "E"); + config->set_property("SignalSource.PRN_2", "21"); config->set_property("SignalSource.CN0_dB_2", "44"); config->set_property("SignalSource.doppler_Hz_2", "2000"); config->set_property("SignalSource.delay_chips_2", "200"); - config->set_property("SignalSource.system_3", "G"); - config->set_property("SignalSource.PRN_3", "20"); + config->set_property("SignalSource.system_3", "E"); + config->set_property("SignalSource.PRN_3", "22"); config->set_property("SignalSource.CN0_dB_3", "44"); config->set_property("SignalSource.doppler_Hz_3", "3000"); config->set_property("SignalSource.delay_chips_3", "300"); @@ -279,7 +279,7 @@ void GalileoE1Pcps8msAmbiguousAcquisitionGSoC2013Test::config_2() std::to_string(integration_time_ms)); config->set_property("Acquisition.max_dwells", "1"); config->set_property("Acquisition.implementation", "Galileo_E1_PCPS_8ms_Ambiguous_Acquisition"); - config->set_property("Acquisition.pfa", "1e-1"); + config->set_property("Acquisition.pfa", "0.1"); config->set_property("Acquisition.doppler_max", "10000"); config->set_property("Acquisition.doppler_step", "250"); config->set_property("Acquisition.dump", "false"); diff --git a/src/tests/gnss_block/galileo_e1_pcps_ambiguous_acquisition_gsoc2013_test.cc b/src/tests/gnss_block/galileo_e1_pcps_ambiguous_acquisition_gsoc2013_test.cc index 8229a4596..00be9fb53 100644 --- a/src/tests/gnss_block/galileo_e1_pcps_ambiguous_acquisition_gsoc2013_test.cc +++ b/src/tests/gnss_block/galileo_e1_pcps_ambiguous_acquisition_gsoc2013_test.cc @@ -242,14 +242,14 @@ void GalileoE1PcpsAmbiguousAcquisitionGSoC2013Test::config_2() config->set_property("SignalSource.doppler_Hz_1", "1000"); config->set_property("SignalSource.delay_chips_1", "100"); - config->set_property("SignalSource.system_2", "G"); - config->set_property("SignalSource.PRN_2", "10"); + config->set_property("SignalSource.system_2", "E"); + config->set_property("SignalSource.PRN_2", "21"); config->set_property("SignalSource.CN0_dB_2", "44"); config->set_property("SignalSource.doppler_Hz_2", "2000"); config->set_property("SignalSource.delay_chips_2", "200"); - config->set_property("SignalSource.system_3", "G"); - config->set_property("SignalSource.PRN_3", "20"); + config->set_property("SignalSource.system_3", "E"); + config->set_property("SignalSource.PRN_3", "22"); config->set_property("SignalSource.CN0_dB_3", "44"); config->set_property("SignalSource.doppler_Hz_3", "3000"); config->set_property("SignalSource.delay_chips_3", "300"); @@ -284,7 +284,7 @@ void GalileoE1PcpsAmbiguousAcquisitionGSoC2013Test::config_2() config->set_property("Acquisition.max_dwells", "1"); config->set_property("Acquisition.bit_transition_flag","false"); config->set_property("Acquisition.implementation", "Galileo_E1_PCPS_Ambiguous_Acquisition"); - config->set_property("Acquisition.pfa", "1e-1"); + config->set_property("Acquisition.pfa", "0.1"); config->set_property("Acquisition.doppler_max", "10000"); config->set_property("Acquisition.doppler_step", "250"); config->set_property("Acquisition.dump", "false"); diff --git a/src/tests/gnss_block/galileo_e1_pcps_cccwsr_ambiguous_acquisition_gsoc2013_test.cc b/src/tests/gnss_block/galileo_e1_pcps_cccwsr_ambiguous_acquisition_gsoc2013_test.cc index 99cbe8f52..cddb19a39 100644 --- a/src/tests/gnss_block/galileo_e1_pcps_cccwsr_ambiguous_acquisition_gsoc2013_test.cc +++ b/src/tests/gnss_block/galileo_e1_pcps_cccwsr_ambiguous_acquisition_gsoc2013_test.cc @@ -240,14 +240,14 @@ void GalileoE1PcpsCccwsrAmbiguousAcquisitionTest::config_2() config->set_property("SignalSource.doppler_Hz_1", "1000"); config->set_property("SignalSource.delay_chips_1", "100"); - config->set_property("SignalSource.system_2", "G"); - config->set_property("SignalSource.PRN_2", "10"); + config->set_property("SignalSource.system_2", "E"); + config->set_property("SignalSource.PRN_2", "21"); config->set_property("SignalSource.CN0_dB_2", "44"); config->set_property("SignalSource.doppler_Hz_2", "2000"); config->set_property("SignalSource.delay_chips_2", "200"); - config->set_property("SignalSource.system_3", "G"); - config->set_property("SignalSource.PRN_3", "20"); + config->set_property("SignalSource.system_3", "E"); + config->set_property("SignalSource.PRN_3", "22"); config->set_property("SignalSource.CN0_dB_3", "44"); config->set_property("SignalSource.doppler_Hz_3", "3000"); config->set_property("SignalSource.delay_chips_3", "300"); @@ -281,7 +281,7 @@ void GalileoE1PcpsCccwsrAmbiguousAcquisitionTest::config_2() std::to_string(integration_time_ms)); config->set_property("Acquisition.max_dwells", "1"); config->set_property("Acquisition.implementation", "Galileo_E1_PCPS_CCCWSR_Ambiguous_Acquisition"); - config->set_property("Acquisition.threshold", "0.0025"); + config->set_property("Acquisition.threshold", "0.00215"); // Pfa,a = 0.1 config->set_property("Acquisition.doppler_max", "10000"); config->set_property("Acquisition.doppler_step", "250"); config->set_property("Acquisition.dump", "false"); @@ -536,7 +536,7 @@ TEST_F(GalileoE1PcpsCccwsrAmbiguousAcquisitionTest, ValidationOfResultsProbabili top_block->connect(signal_source->get_right_block(), 0, acquisition->get_left_block(), 0); }) << "Failure connecting the blocks of acquisition test." << std::endl; - std::cout << "Probability of false alarm (target) = " << 0.0065 << std::endl; + std::cout << "Probability of false alarm (target) = " << 0.1 << std::endl; // i = 0 --> sallite in acquisition is visible (prob of detection and prob of detection with wrong estimation) // i = 1 --> satellite in acquisition is not visible (prob of false detection) diff --git a/src/tests/gnss_block/galileo_e1_pcps_tong_ambiguous_acquisition_gsoc2013_test.cc b/src/tests/gnss_block/galileo_e1_pcps_tong_ambiguous_acquisition_gsoc2013_test.cc index 7db6cf877..86aa1cc08 100644 --- a/src/tests/gnss_block/galileo_e1_pcps_tong_ambiguous_acquisition_gsoc2013_test.cc +++ b/src/tests/gnss_block/galileo_e1_pcps_tong_ambiguous_acquisition_gsoc2013_test.cc @@ -191,8 +191,8 @@ void GalileoE1PcpsTongAmbiguousAcquisitionGSoC2013Test::config_1() config->set_property("Acquisition.if", "0"); config->set_property("Acquisition.coherent_integration_time_ms", std::to_string(integration_time_ms)); - config->set_property("Acquisition.tong_init_val", "5"); - config->set_property("Acquisition.tong_max_val", "10"); + config->set_property("Acquisition.tong_init_val", "1"); + config->set_property("Acquisition.tong_max_val", "8"); config->set_property("Acquisition.implementation", "Galileo_E1_PCPS_Tong_Ambiguous_Acquisition"); config->set_property("Acquisition.threshold", "0.3"); config->set_property("Acquisition.doppler_max", "10000"); @@ -241,14 +241,14 @@ void GalileoE1PcpsTongAmbiguousAcquisitionGSoC2013Test::config_2() config->set_property("SignalSource.doppler_Hz_1", "1000"); config->set_property("SignalSource.delay_chips_1", "100"); - config->set_property("SignalSource.system_2", "G"); - config->set_property("SignalSource.PRN_2", "10"); + config->set_property("SignalSource.system_2", "E"); + config->set_property("SignalSource.PRN_2", "21"); config->set_property("SignalSource.CN0_dB_2", "44"); config->set_property("SignalSource.doppler_Hz_2", "2000"); config->set_property("SignalSource.delay_chips_2", "200"); - config->set_property("SignalSource.system_3", "G"); - config->set_property("SignalSource.PRN_3", "20"); + config->set_property("SignalSource.system_3", "E"); + config->set_property("SignalSource.PRN_3", "22"); config->set_property("SignalSource.CN0_dB_3", "44"); config->set_property("SignalSource.doppler_Hz_3", "3000"); config->set_property("SignalSource.delay_chips_3", "300"); @@ -280,10 +280,10 @@ void GalileoE1PcpsTongAmbiguousAcquisitionGSoC2013Test::config_2() config->set_property("Acquisition.if", "0"); config->set_property("Acquisition.coherent_integration_time_ms", std::to_string(integration_time_ms)); - config->set_property("Acquisition.tong_init_val", "5"); - config->set_property("Acquisition.tong_max_val", "10"); + config->set_property("Acquisition.tong_init_val", "1"); + config->set_property("Acquisition.tong_max_val", "8"); config->set_property("Acquisition.implementation", "Galileo_E1_PCPS_Tong_Ambiguous_Acquisition"); - config->set_property("Acquisition.threshold", "0.0005"); + config->set_property("Acquisition.threshold", "0.00028"); // Pfa,a = 0.1 config->set_property("Acquisition.doppler_max", "10000"); config->set_property("Acquisition.doppler_step", "250"); config->set_property("Acquisition.dump", "false"); @@ -536,7 +536,7 @@ TEST_F(GalileoE1PcpsTongAmbiguousAcquisitionGSoC2013Test, ValidationOfResultsPro top_block->connect(signal_source->get_right_block(), 0, acquisition->get_left_block(), 0); }) << "Failure connecting the blocks of acquisition test." << std::endl; - std::cout << "Probability of false alarm (target) = " << 0.0 << std::endl; + std::cout << "Probability of false alarm (target) = " << 0.1 << std::endl; // i = 0 --> sallite in acquisition is visible (prob of detection and prob of detection with wrong estimation) // i = 1 --> satellite in acquisition is not visible (prob of false detection) diff --git a/src/tests/gnss_block/gps_l1_ca_pcps_acquisition_gsoc2013_test.cc b/src/tests/gnss_block/gps_l1_ca_pcps_acquisition_gsoc2013_test.cc index 2a8c4e009..6c2ea1833 100644 --- a/src/tests/gnss_block/gps_l1_ca_pcps_acquisition_gsoc2013_test.cc +++ b/src/tests/gnss_block/gps_l1_ca_pcps_acquisition_gsoc2013_test.cc @@ -238,13 +238,13 @@ void GpsL1CaPcpsAcquisitionGSoC2013Test::config_2() config->set_property("SignalSource.doppler_Hz_1", "1000"); config->set_property("SignalSource.delay_chips_1", "100"); - config->set_property("SignalSource.system_2", "E"); + config->set_property("SignalSource.system_2", "G"); config->set_property("SignalSource.PRN_2", "21"); config->set_property("SignalSource.CN0_dB_2", "44"); config->set_property("SignalSource.doppler_Hz_2", "2000"); config->set_property("SignalSource.delay_chips_2", "200"); - config->set_property("SignalSource.system_3", "E"); + config->set_property("SignalSource.system_3", "G"); config->set_property("SignalSource.PRN_3", "22"); config->set_property("SignalSource.CN0_dB_3", "44"); config->set_property("SignalSource.doppler_Hz_3", "3000"); @@ -279,7 +279,7 @@ void GpsL1CaPcpsAcquisitionGSoC2013Test::config_2() std::to_string(integration_time_ms)); config->set_property("Acquisition.max_dwells", "1"); config->set_property("Acquisition.implementation", "GPS_L1_CA_PCPS_Acquisition"); - config->set_property("Acquisition.pfa", "1e-1"); + config->set_property("Acquisition.pfa", "0.1"); config->set_property("Acquisition.doppler_max", "10000"); config->set_property("Acquisition.doppler_step", "250"); config->set_property("Acquisition.bit_transition_flag", "false"); diff --git a/src/tests/gnss_block/gps_l1_ca_pcps_multithread_acquisition_gsoc2013_test.cc b/src/tests/gnss_block/gps_l1_ca_pcps_multithread_acquisition_gsoc2013_test.cc index 4a1917af5..a65aa39c3 100644 --- a/src/tests/gnss_block/gps_l1_ca_pcps_multithread_acquisition_gsoc2013_test.cc +++ b/src/tests/gnss_block/gps_l1_ca_pcps_multithread_acquisition_gsoc2013_test.cc @@ -1,5 +1,5 @@ /*! - * \file gps_l1_ca_pcps_acquisition_gsoc2013_test.cc + * \file gps_l1_ca_pcps_multithread_acquisition_gsoc2013_test.cc * \brief This class implements an acquisition test for * GpsL1CaPcpsMultithreadAcquisition class. * \author Marc Molina, 2013. marc.molina.pena(at)gmail.com @@ -205,7 +205,7 @@ void GpsL1CaPcpsMultithreadAcquisitionGSoC2013Test::config_2() std::string signal = "1C"; signal.copy(gnss_synchro.Signal,2,0); - integration_time_ms = 4; + integration_time_ms = 1; fs_in = 4e6; expected_delay_chips = 600; @@ -237,13 +237,13 @@ void GpsL1CaPcpsMultithreadAcquisitionGSoC2013Test::config_2() config->set_property("SignalSource.doppler_Hz_1", "1000"); config->set_property("SignalSource.delay_chips_1", "100"); - config->set_property("SignalSource.system_2", "E"); + config->set_property("SignalSource.system_2", "G"); config->set_property("SignalSource.PRN_2", "21"); config->set_property("SignalSource.CN0_dB_2", "44"); config->set_property("SignalSource.doppler_Hz_2", "2000"); config->set_property("SignalSource.delay_chips_2", "200"); - config->set_property("SignalSource.system_3", "E"); + config->set_property("SignalSource.system_3", "G"); config->set_property("SignalSource.PRN_3", "22"); config->set_property("SignalSource.CN0_dB_3", "44"); config->set_property("SignalSource.doppler_Hz_3", "3000"); @@ -278,10 +278,10 @@ void GpsL1CaPcpsMultithreadAcquisitionGSoC2013Test::config_2() std::to_string(integration_time_ms)); config->set_property("Acquisition.max_dwells", "1"); config->set_property("Acquisition.implementation", "GPS_L1_CA_PCPS_Multithread_Acquisition"); - config->set_property("Acquisition.pfa", "1e-1"); + config->set_property("Acquisition.pfa", "0.1"); config->set_property("Acquisition.doppler_max", "10000"); config->set_property("Acquisition.doppler_step", "250"); - config->set_property("Acquisition.bit_transition_flag", "true"); + config->set_property("Acquisition.bit_transition_flag", "false"); config->set_property("Acquisition.dump", "false"); } diff --git a/src/tests/gnss_block/gps_l1_ca_pcps_opencl_acquisition_gsoc2013_test.cc b/src/tests/gnss_block/gps_l1_ca_pcps_opencl_acquisition_gsoc2013_test.cc new file mode 100644 index 000000000..a84013571 --- /dev/null +++ b/src/tests/gnss_block/gps_l1_ca_pcps_opencl_acquisition_gsoc2013_test.cc @@ -0,0 +1,579 @@ +/*! + * \file gps_l1_ca_pcps_opencl_acquisition_gsoc2013_test.cc + * \brief This class implements an acquisition test for + * GpsL1CaPcpsOpenClAcquisition class. + * \author Marc Molina, 2013. marc.molina.pena(at)gmail.com + * + * + * ------------------------------------------------------------------------- + * + * Copyright (C) 2010-2012 (see AUTHORS file for a list of contributors) + * + * GNSS-SDR is a software defined Global Navigation + * Satellite Systems receiver + * + * This file is part of GNSS-SDR. + * + * GNSS-SDR is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * at your option) any later version. + * + * GNSS-SDR is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNSS-SDR. If not, see . + * + * ------------------------------------------------------------------------- + */ + + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "gnss_block_interface.h" +#include "in_memory_configuration.h" +#include "configuration_interface.h" +#include "gnss_synchro.h" +#include "gps_l1_ca_pcps_opencl_acquisition.h" +#include "signal_generator.h" +//#include "signal_generator.cc" +#include "signal_generator_c.h" +//#include "signal_generator_c.cc" +#include "fir_filter.h" +#include "gen_signal_source.h" +#include "gnss_sdr_valve.h" +#include "boost/shared_ptr.hpp" + + +class GpsL1CaPcpsOpenClAcquisitionGSoC2013Test: public ::testing::Test +{ +protected: + GpsL1CaPcpsOpenClAcquisitionGSoC2013Test() + { + queue = gr::msg_queue::make(0); + top_block = gr::make_top_block("Acquisition test"); + item_size = sizeof(gr_complex); + stop = false; + message = 0; + } + + ~GpsL1CaPcpsOpenClAcquisitionGSoC2013Test() + { + } + + void init(); + void config_1(); + void config_2(); + void start_queue(); + void wait_message(); + void process_message(); + void stop_queue(); + + gr::msg_queue::sptr queue; + gr::top_block_sptr top_block; + GpsL1CaPcpsOpenClAcquisition *acquisition; + InMemoryConfiguration* config; + Gnss_Synchro gnss_synchro; + size_t item_size; + concurrent_queue channel_internal_queue; + bool stop; + int message; + boost::thread ch_thread; + + unsigned int integration_time_ms; + unsigned int fs_in; + + double expected_delay_chips; + double expected_doppler_hz; + float max_doppler_error_hz; + float max_delay_error_chips; + + unsigned int num_of_realizations; + unsigned int realization_counter; + unsigned int detection_counter; + unsigned int correct_estimation_counter; + unsigned int acquired_samples; + unsigned int mean_acq_time_us; + + double mse_doppler; + double mse_delay; + + double Pd; + double Pfa_p; + double Pfa_a; +}; + +void GpsL1CaPcpsOpenClAcquisitionGSoC2013Test::init() +{ + message = 0; + realization_counter = 0; + detection_counter = 0; + correct_estimation_counter = 0; + acquired_samples = 0; + mse_doppler = 0; + mse_delay = 0; + mean_acq_time_us = 0; + Pd = 0; + Pfa_p = 0; + Pfa_a = 0; +} + +void GpsL1CaPcpsOpenClAcquisitionGSoC2013Test::config_1() +{ + gnss_synchro.Channel_ID = 0; + gnss_synchro.System = 'G'; + std::string signal = "1C"; + signal.copy(gnss_synchro.Signal,2,0); + + integration_time_ms = 1; + fs_in = 4e6; + + expected_delay_chips = 600; + expected_doppler_hz = 750; + max_doppler_error_hz = 2/(3*integration_time_ms*1e-3); + max_delay_error_chips = 0.50; + + num_of_realizations = 1; + + config = new InMemoryConfiguration(); + + config->set_property("GNSS-SDR.internal_fs_hz", std::to_string(fs_in)); + + config->set_property("SignalSource.fs_hz", std::to_string(fs_in)); + + config->set_property("SignalSource.item_type", "gr_complex"); + + config->set_property("SignalSource.num_satellites", "1"); + + config->set_property("SignalSource.system_0", "G"); + config->set_property("SignalSource.PRN_0", "10"); + config->set_property("SignalSource.CN0_dB_0", "44"); + config->set_property("SignalSource.doppler_Hz_0", std::to_string(expected_doppler_hz)); + config->set_property("SignalSource.delay_chips_0", std::to_string(expected_delay_chips)); + + config->set_property("SignalSource.noise_flag", "false"); + config->set_property("SignalSource.data_flag", "false"); + config->set_property("SignalSource.BW_BB", "0.97"); + + config->set_property("InputFilter.implementation", "Fir_Filter"); + config->set_property("InputFilter.input_item_type", "gr_complex"); + config->set_property("InputFilter.output_item_type", "gr_complex"); + config->set_property("InputFilter.taps_item_type", "float"); + config->set_property("InputFilter.number_of_taps", "11"); + config->set_property("InputFilter.number_of_bands", "2"); + config->set_property("InputFilter.band1_begin", "0.0"); + config->set_property("InputFilter.band1_end", "0.97"); + config->set_property("InputFilter.band2_begin", "0.98"); + config->set_property("InputFilter.band2_end", "1.0"); + config->set_property("InputFilter.ampl1_begin", "1.0"); + config->set_property("InputFilter.ampl1_end", "1.0"); + config->set_property("InputFilter.ampl2_begin", "0.0"); + config->set_property("InputFilter.ampl2_end", "0.0"); + config->set_property("InputFilter.band1_error", "1.0"); + config->set_property("InputFilter.band2_error", "1.0"); + config->set_property("InputFilter.filter_type", "bandpass"); + config->set_property("InputFilter.grid_density", "16"); + + config->set_property("Acquisition.item_type", "gr_complex"); + config->set_property("Acquisition.if", "0"); + config->set_property("Acquisition.coherent_integration_time_ms", + std::to_string(integration_time_ms)); + config->set_property("Acquisition.max_dwells", "1"); + config->set_property("Acquisition.implementation", "GPS_L1_CA_PCPS_OpenCl_Acquisition"); + config->set_property("Acquisition.threshold", "0.8"); + config->set_property("Acquisition.doppler_max", "10000"); + config->set_property("Acquisition.doppler_step", "250"); + config->set_property("Acquisition.bit_transition_flag", "false"); + config->set_property("Acquisition.dump", "false"); +} + +void GpsL1CaPcpsOpenClAcquisitionGSoC2013Test::config_2() +{ + gnss_synchro.Channel_ID = 0; + gnss_synchro.System = 'G'; + std::string signal = "1C"; + signal.copy(gnss_synchro.Signal,2,0); + + integration_time_ms = 1; + fs_in = 4e6; + + expected_delay_chips = 600; + expected_doppler_hz = 750; + max_doppler_error_hz = 2/(3*integration_time_ms*1e-3); + max_delay_error_chips = 0.50; + + num_of_realizations = 100; + + config = new InMemoryConfiguration(); + + config->set_property("GNSS-SDR.internal_fs_hz", std::to_string(fs_in)); + + config->set_property("SignalSource.fs_hz", std::to_string(fs_in)); + + config->set_property("SignalSource.item_type", "gr_complex"); + + config->set_property("SignalSource.num_satellites", "4"); + + config->set_property("SignalSource.system_0", "G"); + config->set_property("SignalSource.PRN_0", "10"); + config->set_property("SignalSource.CN0_dB_0", "44"); + config->set_property("SignalSource.doppler_Hz_0", std::to_string(expected_doppler_hz)); + config->set_property("SignalSource.delay_chips_0", std::to_string(expected_delay_chips)); + + config->set_property("SignalSource.system_1", "G"); + config->set_property("SignalSource.PRN_1", "15"); + config->set_property("SignalSource.CN0_dB_1", "44"); + config->set_property("SignalSource.doppler_Hz_1", "1000"); + config->set_property("SignalSource.delay_chips_1", "100"); + + config->set_property("SignalSource.system_2", "G"); + config->set_property("SignalSource.PRN_2", "21"); + config->set_property("SignalSource.CN0_dB_2", "44"); + config->set_property("SignalSource.doppler_Hz_2", "2000"); + config->set_property("SignalSource.delay_chips_2", "200"); + + config->set_property("SignalSource.system_3", "G"); + config->set_property("SignalSource.PRN_3", "22"); + config->set_property("SignalSource.CN0_dB_3", "44"); + config->set_property("SignalSource.doppler_Hz_3", "3000"); + config->set_property("SignalSource.delay_chips_3", "300"); + + config->set_property("SignalSource.noise_flag", "true"); + config->set_property("SignalSource.data_flag", "true"); + config->set_property("SignalSource.BW_BB", "0.97"); + + config->set_property("InputFilter.implementation", "Fir_Filter"); + config->set_property("InputFilter.input_item_type", "gr_complex"); + config->set_property("InputFilter.output_item_type", "gr_complex"); + config->set_property("InputFilter.taps_item_type", "float"); + config->set_property("InputFilter.number_of_taps", "11"); + config->set_property("InputFilter.number_of_bands", "2"); + config->set_property("InputFilter.band1_begin", "0.0"); + config->set_property("InputFilter.band1_end", "0.97"); + config->set_property("InputFilter.band2_begin", "0.98"); + config->set_property("InputFilter.band2_end", "1.0"); + config->set_property("InputFilter.ampl1_begin", "1.0"); + config->set_property("InputFilter.ampl1_end", "1.0"); + config->set_property("InputFilter.ampl2_begin", "0.0"); + config->set_property("InputFilter.ampl2_end", "0.0"); + config->set_property("InputFilter.band1_error", "1.0"); + config->set_property("InputFilter.band2_error", "1.0"); + config->set_property("InputFilter.filter_type", "bandpass"); + config->set_property("InputFilter.grid_density", "16"); + + config->set_property("Acquisition.item_type", "gr_complex"); + config->set_property("Acquisition.if", "0"); + config->set_property("Acquisition.coherent_integration_time_ms", + std::to_string(integration_time_ms)); + config->set_property("Acquisition.max_dwells", "1"); + config->set_property("Acquisition.implementation", "GPS_L1_CA_PCPS_OpenCl_Acquisition"); + config->set_property("Acquisition.pfa", "0.1"); + config->set_property("Acquisition.doppler_max", "10000"); + config->set_property("Acquisition.doppler_step", "250"); + config->set_property("Acquisition.bit_transition_flag", "false"); + config->set_property("Acquisition.dump", "false"); +} + +void GpsL1CaPcpsOpenClAcquisitionGSoC2013Test::start_queue() +{ + stop = false; + ch_thread = boost::thread(&GpsL1CaPcpsOpenClAcquisitionGSoC2013Test::wait_message, this); +} + +void GpsL1CaPcpsOpenClAcquisitionGSoC2013Test::wait_message() +{ + struct timeval tv; + long long int begin = 0; + long long int end = 0; + + while (!stop) + { + acquisition->reset(); + + gettimeofday(&tv, NULL); + begin = tv.tv_sec *1e6 + tv.tv_usec; + + channel_internal_queue.wait_and_pop(message); + + gettimeofday(&tv, NULL); + end = tv.tv_sec *1e6 + tv.tv_usec; + + mean_acq_time_us += (end-begin); + + process_message(); + } +} + +void GpsL1CaPcpsOpenClAcquisitionGSoC2013Test::process_message() +{ + if (message == 1) + { + detection_counter++; + + // The term -5 is here to correct the additional delay introduced by the FIR filter + double delay_error_chips = abs((double)expected_delay_chips - (double)(gnss_synchro.Acq_delay_samples-5)*1023.0/((double)fs_in*1e-3)); + double doppler_error_hz = abs(expected_doppler_hz - gnss_synchro.Acq_doppler_hz); + + mse_delay += std::pow(delay_error_chips, 2); + mse_doppler += std::pow(doppler_error_hz, 2); + + if ((delay_error_chips < max_delay_error_chips) && (doppler_error_hz < max_doppler_error_hz)) + { + correct_estimation_counter++; + } + +// std::cout << "Acq delay samples = " << (double)gnss_synchro.Acq_delay_samples << std::endl; +// std::cout << "Acq doppler Hz = " << (double)gnss_synchro.Acq_doppler_hz << std::endl; + } + + realization_counter++; + + std::cout << "Progress: " << round((float)realization_counter/num_of_realizations*100) << "% \r" << std::flush; + + if (realization_counter == num_of_realizations) + { + mse_delay /= num_of_realizations; + mse_doppler /= num_of_realizations; + + Pd = (double)correct_estimation_counter / (double)num_of_realizations; + Pfa_a = (double)detection_counter / (double)num_of_realizations; + Pfa_p = (double)(detection_counter-correct_estimation_counter) / (double)num_of_realizations; + + mean_acq_time_us /= num_of_realizations; + + stop_queue(); + top_block->stop(); + + std::cout << std::endl; + } +} + +void GpsL1CaPcpsOpenClAcquisitionGSoC2013Test::stop_queue() +{ + stop = true; +} + +TEST_F(GpsL1CaPcpsOpenClAcquisitionGSoC2013Test, Instantiate) +{ + config_1(); + acquisition = new GpsL1CaPcpsOpenClAcquisition(config, "Acquisition", 1, 1, queue); + delete acquisition; + delete config; +} + +TEST_F(GpsL1CaPcpsOpenClAcquisitionGSoC2013Test, ConnectAndRun) +{ + int nsamples = floor(fs_in*integration_time_ms*1e-3); + struct timeval tv; + long long int begin = 0; + long long int end = 0; + + config_1(); + acquisition = new GpsL1CaPcpsOpenClAcquisition(config, "Acquisition", 1, 1, queue); + + ASSERT_NO_THROW( { + acquisition->connect(top_block); + boost::shared_ptr source = gr::analog::sig_source_c::make(fs_in, gr::analog::GR_SIN_WAVE, 1000, 1, gr_complex(0)); + boost::shared_ptr valve = gnss_sdr_make_valve(sizeof(gr_complex), nsamples, queue); + top_block->connect(source, 0, valve, 0); + top_block->connect(valve, 0, acquisition->get_left_block(), 0); + }) << "Failure connecting the blocks of acquisition test."<< std::endl; + + EXPECT_NO_THROW( { + gettimeofday(&tv, NULL); + begin = tv.tv_sec *1e6 + tv.tv_usec; + top_block->run(); // Start threads and wait + gettimeofday(&tv, NULL); + end = tv.tv_sec *1e6 + tv.tv_usec; + }) << "Failure running the top_block."<< std::endl; + + std::cout << "Processed " << nsamples << " samples in " << (end-begin) << " microseconds" << std::endl; + + delete acquisition; + delete config; +} + +TEST_F(GpsL1CaPcpsOpenClAcquisitionGSoC2013Test, ValidationOfResults) +{ + config_1(); + + acquisition = new GpsL1CaPcpsOpenClAcquisition(config, "Acquisition", 1, 1, queue); + + ASSERT_NO_THROW( { + acquisition->set_channel(1); + }) << "Failure setting channel."<< std::endl; + + ASSERT_NO_THROW( { + acquisition->set_gnss_synchro(&gnss_synchro); + }) << "Failure setting gnss_synchro."<< std::endl; + + ASSERT_NO_THROW( { + acquisition->set_channel_queue(&channel_internal_queue); + }) << "Failure setting channel_internal_queue."<< std::endl; + + ASSERT_NO_THROW( { + acquisition->set_doppler_max(config->property("Acquisition.doppler_max", 10000)); + }) << "Failure setting doppler_max."<< std::endl; + + ASSERT_NO_THROW( { + acquisition->set_doppler_step(config->property("Acquisition.doppler_step", 500)); + }) << "Failure setting doppler_step."<< std::endl; + + ASSERT_NO_THROW( { + acquisition->set_threshold(config->property("Acquisition.threshold", 0.0)); + }) << "Failure setting threshold."<< std::endl; + + ASSERT_NO_THROW( { + acquisition->connect(top_block); + }) << "Failure connecting acquisition to the top_block."<< std::endl; + + acquisition->init(); + + ASSERT_NO_THROW( { + boost::shared_ptr signal_source; + SignalGenerator* signal_generator = new SignalGenerator(config, "SignalSource", 0, 1, queue); + FirFilter* filter = new FirFilter(config, "InputFilter", 1, 1, queue); + signal_source.reset(new GenSignalSource(config, signal_generator, filter, "SignalSource", queue)); + signal_source->connect(top_block); + top_block->connect(signal_source->get_right_block(), 0, acquisition->get_left_block(), 0); + }) << "Failure connecting the blocks of acquisition test." << std::endl; + + // i = 0 --> sallite in acquisition is visible + // i = 1 --> satellite in acquisition is not visible + for (unsigned int i = 0; i < 2; i++) + { + init(); + + if (i == 0) + { + gnss_synchro.PRN = 10; // This satellite is visible + } + else if (i == 1) + { + gnss_synchro.PRN = 20; // This satellite is not visible + } + + acquisition->set_local_code(); + + start_queue(); + + EXPECT_NO_THROW( { + top_block->run(); // Start threads and wait + }) << "Failure running he top_block."<< std::endl; + + if (i == 0) + { + EXPECT_EQ(1, message) << "Acquisition failure. Expected message: 1=ACQ SUCCESS."; + if (message == 1) + { + EXPECT_EQ((unsigned int)1, correct_estimation_counter) << "Acquisition failure. Incorrect parameters estimation."; + } + + } + else if (i == 1) + { + EXPECT_EQ(2, message) << "Acquisition failure. Expected message: 2=ACQ FAIL."; + } + } + + delete acquisition; + delete config; +} + +TEST_F(GpsL1CaPcpsOpenClAcquisitionGSoC2013Test, ValidationOfResultsProbabilities) +{ + config_2(); + + acquisition = new GpsL1CaPcpsOpenClAcquisition(config, "Acquisition", 1, 1, queue); + + ASSERT_NO_THROW( { + acquisition->set_channel(1); + }) << "Failure setting channel."<< std::endl; + + ASSERT_NO_THROW( { + acquisition->set_gnss_synchro(&gnss_synchro); + }) << "Failure setting gnss_synchro."<< std::endl; + + ASSERT_NO_THROW( { + acquisition->set_channel_queue(&channel_internal_queue); + }) << "Failure setting channel_internal_queue."<< std::endl; + + ASSERT_NO_THROW( { + acquisition->set_doppler_max(config->property("Acquisition.doppler_max", 10000)); + }) << "Failure setting doppler_max."<< std::endl; + + ASSERT_NO_THROW( { + acquisition->set_doppler_step(config->property("Acquisition.doppler_step", 500)); + }) << "Failure setting doppler_step."<< std::endl; + + ASSERT_NO_THROW( { + acquisition->set_threshold(config->property("Acquisition.threshold", 0.0)); + }) << "Failure setting threshold."<< std::endl; + + ASSERT_NO_THROW( { + acquisition->connect(top_block); + }) << "Failure connecting acquisition to the top_block."<< std::endl; + + acquisition->init(); + + ASSERT_NO_THROW( { + boost::shared_ptr signal_source; + SignalGenerator* signal_generator = new SignalGenerator(config, "SignalSource", 0, 1, queue); + FirFilter* filter = new FirFilter(config, "InputFilter", 1, 1, queue); + signal_source.reset(new GenSignalSource(config, signal_generator, filter, "SignalSource", queue)); + signal_source->connect(top_block); + top_block->connect(signal_source->get_right_block(), 0, acquisition->get_left_block(), 0); + }) << "Failure connecting the blocks of acquisition test." << std::endl; + + std::cout << "Probability of false alarm (target) = " << 0.1 << std::endl; + + // i = 0 --> sallite in acquisition is visible (prob of detection and prob of detection with wrong estimation) + // i = 1 --> satellite in acquisition is not visible (prob of false detection) + for (unsigned int i = 0; i < 2; i++) + { + init(); + + if (i == 0) + { + gnss_synchro.PRN = 10; // This satellite is visible + } + else if (i == 1) + { + gnss_synchro.PRN = 20; // This satellite is not visible + } + + acquisition->set_local_code(); + + start_queue(); + + EXPECT_NO_THROW( { + top_block->run(); // Start threads and wait + }) << "Failure running he top_block."<< std::endl; + + if (i == 0) + { + std::cout << "Probability of detection = " << Pd << std::endl; + std::cout << "Probability of false alarm (satellite present) = " << Pfa_p << std::endl; +// std::cout << "Mean acq time = " << mean_acq_time_us << " microseconds." << std::endl; + } + else if (i == 1) + { + std::cout << "Probability of false alarm (satellite absent) = " << Pfa_a << std::endl; +// std::cout << "Mean acq time = " << mean_acq_time_us << " microseconds." << std::endl; + } + } + + delete acquisition; + delete config; +} diff --git a/src/tests/gnss_block/gps_l1_ca_pcps_tong_acquisition_gsoc2013_test.cc b/src/tests/gnss_block/gps_l1_ca_pcps_tong_acquisition_gsoc2013_test.cc index a906326bd..71ad622bd 100644 --- a/src/tests/gnss_block/gps_l1_ca_pcps_tong_acquisition_gsoc2013_test.cc +++ b/src/tests/gnss_block/gps_l1_ca_pcps_tong_acquisition_gsoc2013_test.cc @@ -191,8 +191,8 @@ void GpsL1CaPcpsTongAcquisitionGSoC2013Test::config_1() std::to_string(integration_time_ms)); config->set_property("Acquisition.implementation", "GPS_L1_CA_PCPS_Tong_Acquisition"); config->set_property("Acquisition.threshold", "0.8"); - config->set_property("Acquisition.tong_init_val", "5"); - config->set_property("Acquisition.tong_max_val", "10"); + config->set_property("Acquisition.tong_init_val", "1"); + config->set_property("Acquisition.tong_max_val", "8"); config->set_property("Acquisition.doppler_max", "10000"); config->set_property("Acquisition.doppler_step", "250"); config->set_property("Acquisition.dump", "false"); @@ -237,14 +237,14 @@ void GpsL1CaPcpsTongAcquisitionGSoC2013Test::config_2() config->set_property("SignalSource.doppler_Hz_1", "1000"); config->set_property("SignalSource.delay_chips_1", "100"); - config->set_property("SignalSource.system_2", "E"); - config->set_property("SignalSource.PRN_2", "10"); + config->set_property("SignalSource.system_2", "G"); + config->set_property("SignalSource.PRN_2", "21"); config->set_property("SignalSource.CN0_dB_2", "44"); config->set_property("SignalSource.doppler_Hz_2", "2000"); config->set_property("SignalSource.delay_chips_2", "200"); - config->set_property("SignalSource.system_3", "E"); - config->set_property("SignalSource.PRN_3", "20"); + config->set_property("SignalSource.system_3", "G"); + config->set_property("SignalSource.PRN_3", "22"); config->set_property("SignalSource.CN0_dB_3", "44"); config->set_property("SignalSource.doppler_Hz_3", "3000"); config->set_property("SignalSource.delay_chips_3", "300"); @@ -277,9 +277,9 @@ void GpsL1CaPcpsTongAcquisitionGSoC2013Test::config_2() config->set_property("Acquisition.coherent_integration_time_ms", std::to_string(integration_time_ms)); config->set_property("Acquisition.implementation", "GPS_L1_CA_PCPS_Tong_Acquisition"); - config->set_property("Acquisition.threshold", "0.002"); - config->set_property("Acquisition.tong_init_val", "5"); - config->set_property("Acquisition.tong_max_val", "10"); + config->set_property("Acquisition.threshold", "0.00108"); // Pfa,a = 0.1 + config->set_property("Acquisition.tong_init_val", "1"); + config->set_property("Acquisition.tong_max_val", "8"); config->set_property("Acquisition.doppler_max", "10000"); config->set_property("Acquisition.doppler_step", "250"); config->set_property("Acquisition.dump", "false"); @@ -532,7 +532,7 @@ TEST_F(GpsL1CaPcpsTongAcquisitionGSoC2013Test, ValidationOfResultsProbabilities) top_block->connect(signal_source->get_right_block(), 0, acquisition->get_left_block(), 0); }) << "Failure connecting the blocks of acquisition test." << std::endl; - std::cout << "Probability of false alarm (target) = " << 0.0 << std::endl; + std::cout << "Probability of false alarm (target) = " << 0.1 << std::endl; // i = 0 --> sallite in acquisition is visible (prob of detection and prob of detection with wrong estimation) // i = 1 --> satellite in acquisition is not visible (prob of false detection) diff --git a/src/tests/test_main.cc b/src/tests/test_main.cc index d702592ab..d11f325a8 100644 --- a/src/tests/test_main.cc +++ b/src/tests/test_main.cc @@ -73,6 +73,9 @@ //#include "gnss_block/gps_l1_ca_pcps_acquisition_test.cc" #include "gnss_block/gps_l1_ca_pcps_acquisition_gsoc2013_test.cc" #include "gnss_block/gps_l1_ca_pcps_multithread_acquisition_gsoc2013_test.cc" +#if OPENCL + #include "gnss_block/gps_l1_ca_pcps_opencl_acquisition_gsoc2013_test.cc" +#endif #include "gnss_block/gps_l1_ca_pcps_tong_acquisition_gsoc2013_test.cc" //#include "gnss_block/galileo_e1_pcps_ambiguous_acquisition_test.cc" //#include "gnss_block/galileo_e1_pcps_ambiguous_acquisition_gsoc_test.cc"